package org.languagetool.tokenizers.ca;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.languagetool.tagging.ca.CatalanTagger;
import org.languagetool.tokenizers.WordTokenizer;

/* loaded from: input_file:META-INF/jars/language-ca-6.4.jar:org/languagetool/tokenizers/ca/CatalanWordTokenizer.class */
/**
 * Word tokenizer for Catalan text.
 *
 * <p>Tokenization runs in three stages:
 * <ol>
 *   <li>Sequences that must stay inside one token (apostrophes between letters, decimal
 *       separators, spaces used as thousands separators, "l.l"-style geminated l) are replaced
 *       by placeholder strings made only of word characters.</li>
 *   <li>The text is cut into runs of word characters and single non-word characters, and the
 *       placeholders are turned back into plain text.</li>
 *   <li>Each word is matched against an ordered list of split patterns; the capture groups of
 *       the first matching pattern become separate tokens. Pieces that contain a hyphen or end
 *       in an apostrophe are additionally checked against {@link CatalanTagger} before being
 *       split further.</li>
 * </ol>
 */
public class CatalanWordTokenizer extends WordTokenizer {

    /** Flag combination used by most patterns in this class. */
    private static final int CASE_INSENSITIVE_UNICODE = Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE;

    /** Body of the regex character class that defines which characters belong to a word. */
    private static final String wordCharacters = "§©@€£\\$_\\p{L}\\d·\\-̀-ͯ¨⁰-\u209f°%‰‱&�\u00ad¬";

    /**
     * One capture group matching a suffix that starts with an apostrophe or a hyphen
     * (for example {@code 'en}, {@code -la}, {@code -vos}).
     */
    private static final String PF = "(['’]en|['’]hi|['’]ho|['’]l|['’]ls|['’]m|['’]n|['’]ns|['’]s|['’]t|-el|-els|-em|-en|-ens|-hi|-ho|-l|-la|-les|-li|-lo|-los|-m|-me|-n|-ne|-nos|-s|-se|-t|-te|-us|-vos)";

    /** One capture group matching one of l, n, m, t, s, d followed by an apostrophe. */
    private static final String ELISION = "([lnmtsd]['’])";

    /** Matches either a maximal run of word characters or a single non-word character. */
    private static final Pattern tokenizerPattern = Pattern.compile("[" + wordCharacters + "]+|[^" + wordCharacters + "]");

    // Literal placeholder strings, turned back into plain text after the raw split.
    private static final Pattern PATTERN_1 = Pattern.compile("xxCA_APOS_RECTExx", Pattern.LITERAL);
    private static final Pattern PATTERN_2 = Pattern.compile("xxCA_APOS_RODOxx", Pattern.LITERAL);
    private static final Pattern PATTERN_3 = Pattern.compile("xxCA_HYPHENxx", Pattern.LITERAL);
    private static final Pattern PATTERN_4 = Pattern.compile("xxCA_DECIMALPOINTxx", Pattern.LITERAL);
    private static final Pattern PATTERN_5 = Pattern.compile("xxCA_DECIMALCOMMAxx", Pattern.LITERAL);
    private static final Pattern PATTERN_6 = Pattern.compile("xxCA_SPACExx", Pattern.LITERAL);
    private static final Pattern PATTERN_7 = Pattern.compile("xxELA_GEMINADAxx", Pattern.LITERAL);
    private static final Pattern PATTERN_8 = Pattern.compile("xxELA_GEMINADA_UPPERCASExx", Pattern.LITERAL);

    private static final Pattern SOFT_HYPHEN = Pattern.compile("\u00ad");
    private static final Pattern CURLY_SINGLE_QUOTE = Pattern.compile("’", Pattern.LITERAL);
    private static final Pattern LL = Pattern.compile("l-l", Pattern.LITERAL);

    // Sequences that are protected with a placeholder before the raw split.
    private static final Pattern ELA_GEMINADA = Pattern.compile("([aeiouàéèíóòúïüAEIOUÀÈÉÍÒÓÚÏÜ])l[.•⋅∙\uf0d7]l([aeiouàéèíóòúïü])", Pattern.UNICODE_CASE);
    private static final Pattern ELA_GEMINADA_UPPERCASE = Pattern.compile("([AEIOUÀÈÉÍÒÓÚÏÜ])L[.•⋅∙\uf0d7]L([AEIOUÀÈÉÍÒÓÚÏÜ])", Pattern.UNICODE_CASE);
    private static final Pattern APOSTROF_RECTE = Pattern.compile("([\\p{L}])'([\\p{L}\"‘“«])", CASE_INSENSITIVE_UNICODE);
    private static final Pattern APOSTROF_RODO = Pattern.compile("([\\p{L}])’([\\p{L}\"‘“«])", CASE_INSENSITIVE_UNICODE);
    private static final Pattern APOSTROF_RECTE_1 = Pattern.compile("([dlDL])'(\\d[\\d\\s\\.,]?)", CASE_INSENSITIVE_UNICODE);
    private static final Pattern APOSTROF_RODO_1 = Pattern.compile("([dlDL])’(\\d[\\d\\s\\.,]?)", CASE_INSENSITIVE_UNICODE);
    private static final Pattern DECIMAL_POINT = Pattern.compile("([\\d])\\.([\\d])", CASE_INSENSITIVE_UNICODE);
    private static final Pattern DECIMAL_COMMA = Pattern.compile("([\\d]),([\\d])", CASE_INSENSITIVE_UNICODE);
    private static final Pattern SPACE_DIGITS0 = Pattern.compile("([\\d]{4}) ", CASE_INSENSITIVE_UNICODE);
    private static final Pattern SPACE_DIGITS = Pattern.compile("([\\d]) ([\\d][\\d][\\d])", CASE_INSENSITIVE_UNICODE);
    private static final Pattern SPACE_DIGITS2 = Pattern.compile("([\\d]) ([\\d][\\d][\\d]) ([\\d][\\d][\\d])", CASE_INSENSITIVE_UNICODE);
    private static final Pattern SPACE0 = Pattern.compile("xxCA_SPACE0xx");
    private static final Pattern HYPHEN_L = Pattern.compile("([\\p{L}]+)(-)([Ll]['’])([\\p{L}]+)", CASE_INSENSITIVE_UNICODE);

    /** Hyphenated words that are always kept as a single token (compared ignoring case). */
    private static final String[] UNSPLIT_WORDS = {
        "mers-cov", "mcgraw-hill", "sars-cov-2", "sars-cov", "ph-metre", "ph-metres"
    };

    /**
     * Split patterns, tried in order; the first one that matches a word decides how the word
     * is cut, each non-null capture group becoming one piece. Compiled once for all instances
     * because {@link Pattern} objects are immutable.
     */
    private static final Pattern[] SPLIT_PATTERNS = {
        // Elision + remainder that contains no further apostrophe or hyphen.
        Pattern.compile("^" + ELISION + "([^'’\\-]*)$", CASE_INSENSITIVE_UNICODE),
        // Fixed exceptions. The alternation is wrapped in a non-capturing group so that BOTH
        // branches are anchored at both ends. Without it the first branch lacked '$' and the
        // second lacked '^', so e.g. "qui-sap-los" matched as "qui-sap-lo" and the rest of
        // the word was silently dropped. Group numbering is unchanged.
        Pattern.compile("^(?:(qui-sap-lo|qui-sap-la|qui-sap-los|qui-sap-les)|(Castella)(-)(la))$", CASE_INSENSITIVE_UNICODE),
        // Elision + stem + three suffixes.
        Pattern.compile("^" + ELISION + "(.{2,})" + PF + PF + PF + "$", CASE_INSENSITIVE_UNICODE),
        // Stem + three suffixes.
        Pattern.compile("^(.{2,})" + PF + PF + PF + "$", CASE_INSENSITIVE_UNICODE),
        // Elision + stem + two suffixes.
        Pattern.compile("^" + ELISION + "(.{2,})" + PF + PF + "$", CASE_INSENSITIVE_UNICODE),
        // Stem + two suffixes.
        Pattern.compile("^(.{2,})" + PF + PF + "$", CASE_INSENSITIVE_UNICODE),
        // Elision + stem + one suffix.
        Pattern.compile("^" + ELISION + "(.{2,})" + PF + "$", CASE_INSENSITIVE_UNICODE),
        // Stem (not ending in 'w' or 'o') + one suffix.
        Pattern.compile("^(.+[^wo])" + PF + "$", CASE_INSENSITIVE_UNICODE),
        // Elision + anything.
        Pattern.compile("^" + ELISION + "(.*)$", CASE_INSENSITIVE_UNICODE),
        // "al(s)", "del(s)", "pel(s)" are cut into "a|de|pe" + "l(s)".
        Pattern.compile("^(a|de|pe)(ls?)$", CASE_INSENSITIVE_UNICODE),
        // "can" is cut into "ca" + "n".
        Pattern.compile("^(ca)(n)$", CASE_INSENSITIVE_UNICODE),
    };

    /** Creates a tokenizer; all patterns are shared static state, so construction is cheap. */
    public CatalanWordTokenizer() {
    }

    /**
     * Splits {@code str} into tokens.
     *
     * @param str the text to tokenize
     * @return the tokens, after being passed through {@code joinEMailsAndUrls}
     */
    @Override
    public List<String> tokenize(String str) {
        List<String> tokens = new ArrayList<String>();
        Matcher rawTokens = tokenizerPattern.matcher(protectSpecialSequences(str));
        while (rawTokens.find()) {
            String raw = rawTokens.group();
            if (!tokens.isEmpty() && isVariationSelector(raw)) {
                // A lone variation selector is glued onto the token that precedes it.
                int last = tokens.size() - 1;
                tokens.set(last, tokens.get(last) + raw);
                continue;
            }
            String word = restorePlaceholders(raw);
            // Leading hyphens become separate tokens right away ...
            while (word.length() > 1 && word.startsWith("-")) {
                tokens.add("-");
                word = word.substring(1);
            }
            // ... trailing hyphens are counted and emitted after the word itself.
            int trailingHyphens = 0;
            while (word.length() > 1 && word.endsWith("-")) {
                word = word.substring(0, word.length() - 1);
                trailingHyphens++;
            }
            Matcher split = firstMatchingSplit(word);
            if (split == null) {
                tokens.addAll(wordsToAdd(word));
            } else {
                for (int g = 1; g <= split.groupCount(); g++) {
                    String piece = split.group(g);
                    if (piece != null) {
                        tokens.addAll(wordsToAdd(piece));
                    }
                }
            }
            for (int k = 0; k < trailingHyphens; k++) {
                tokens.add("-");
            }
        }
        return joinEMailsAndUrls(tokens);
    }

    /**
     * Replaces sequences that must not be cut by the raw split with placeholders.
     * The order of the steps is significant and mirrors the original implementation.
     */
    private static String protectSpecialSequences(String text) {
        // U+2010 (hyphen) and U+2011 (non-breaking hyphen) are treated like '-'.
        String s = text.replace('\u2010', '-').replace('\u2011', '-');
        s = ELA_GEMINADA.matcher(s).replaceAll("$1xxELA_GEMINADAxx$2");
        s = ELA_GEMINADA_UPPERCASE.matcher(s).replaceAll("$1xxELA_GEMINADA_UPPERCASExx$2");
        s = APOSTROF_RECTE.matcher(s).replaceAll("$1xxCA_APOS_RECTExx$2");
        s = APOSTROF_RECTE_1.matcher(s).replaceAll("$1xxCA_APOS_RECTExx$2");
        s = APOSTROF_RODO.matcher(s).replaceAll("$1xxCA_APOS_RODOxx$2");
        s = APOSTROF_RODO_1.matcher(s).replaceAll("$1xxCA_APOS_RODOxx$2");
        s = DECIMAL_POINT.matcher(s).replaceAll("$1xxCA_DECIMALPOINTxx$2");
        s = DECIMAL_COMMA.matcher(s).replaceAll("$1xxCA_DECIMALCOMMAxx$2");
        // A space after four digits is parked under its own marker so that the two
        // digit-grouping steps below cannot consume it; it is restored right afterwards.
        s = SPACE_DIGITS0.matcher(s).replaceAll("$1xxCA_SPACE0xx");
        s = SPACE_DIGITS2.matcher(s).replaceAll("$1xxCA_SPACExx$2xxCA_SPACExx$3");
        s = SPACE_DIGITS.matcher(s).replaceAll("$1xxCA_SPACExx$2");
        return SPACE0.matcher(s).replaceAll(" ");
    }

    /** Turns the placeholders inside one raw token back into plain text. */
    private static String restorePlaceholders(String token) {
        String s = PATTERN_1.matcher(token).replaceAll("'");
        s = PATTERN_2.matcher(s).replaceAll("’");
        s = PATTERN_3.matcher(s).replaceAll("-");
        s = PATTERN_4.matcher(s).replaceAll(".");
        s = PATTERN_5.matcher(s).replaceAll(",");
        s = PATTERN_6.matcher(s).replaceAll(" ");
        // Whatever separator the input used, a geminated l comes back as "l.l" / "L.L".
        s = PATTERN_7.matcher(s).replaceAll("l.l");
        return PATTERN_8.matcher(s).replaceAll("L.L");
    }

    /** Returns true for a one-char token in U+FE00..U+FE0F (Unicode variation selectors). */
    private static boolean isVariationSelector(String token) {
        if (token.length() != 1) {
            return false;
        }
        int codePoint = token.codePointAt(0);
        return codePoint >= 0xFE00 && codePoint <= 0xFE0F;
    }

    /** Returns the matcher of the first split pattern found in {@code word}, or null if none. */
    private static Matcher firstMatchingSplit(String word) {
        for (Pattern pattern : SPLIT_PATTERNS) {
            Matcher candidate = pattern.matcher(word);
            if (candidate.find()) {
                return candidate;
            }
        }
        return null;
    }

    /** Asks the Catalan tagger whether it produces a tagged reading for {@code word}. */
    private static boolean isTagged(String word) {
        return CatalanTagger.INSTANCE_CAT.tag(Arrays.asList(word)).get(0).isTagged();
    }

    /** Returns true if {@code word} is one of the hyphenated words that are never split. */
    private static boolean isUnsplitWord(String word) {
        for (String known : UNSPLIT_WORDS) {
            if (word.equalsIgnoreCase(known)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Decides whether a piece is kept whole or split further.
     *
     * <p>Pieces without a hyphen and without a trailing apostrophe are kept as they are.
     * Otherwise the piece is kept whole if the tagger knows it (with soft hyphens removed and
     * either curly apostrophes straightened or "l-l" read as "l·l"), or if it is one of the
     * fixed exceptions. Failing that, a trailing apostrophe is split off, a
     * "word-l'word" shape is split into its four parts, and anything else is cut at every
     * hyphen, keeping the hyphens as tokens.
     *
     * @param str the piece to examine; an empty string yields an empty list
     * @return the resulting tokens, in order
     */
    private List<String> wordsToAdd(String str) {
        List<String> pieces = new ArrayList<String>();
        // NOTE(review): the lock is per tokenizer instance while the tagger is a shared
        // singleton; presumably this guards a non-thread-safe lookup - confirm the intent.
        synchronized (this) {
            if (str.isEmpty()) {
                return pieces;
            }
            boolean endsWithApostrophe = str.endsWith("'") || str.endsWith("’");
            if (!str.contains("-") && !endsWithApostrophe) {
                pieces.add(str);
            } else if (isTagged(CURLY_SINGLE_QUOTE.matcher(SOFT_HYPHEN.matcher(str).replaceAll("")).replaceAll("'"))) {
                pieces.add(str);
            } else if (isUnsplitWord(str)) {
                pieces.add(str);
            } else if (isTagged(LL.matcher(SOFT_HYPHEN.matcher(str).replaceAll("")).replaceAll("l·l"))) {
                pieces.add(str);
            } else if (endsWithApostrophe && str.length() > 1) {
                int lastIndex = str.length() - 1;
                pieces.addAll(wordsToAdd(str.substring(0, lastIndex)));
                pieces.add(str.substring(lastIndex));
            } else {
                Matcher hyphenL = HYPHEN_L.matcher(str);
                if (hyphenL.matches()) {
                    for (int g = 1; g <= hyphenL.groupCount(); g++) {
                        pieces.addAll(wordsToAdd(hyphenL.group(g)));
                    }
                } else {
                    // Cut at every hyphen; 'true' makes the tokenizer return the hyphens too.
                    StringTokenizer parts = new StringTokenizer(str, "-", true);
                    while (parts.hasMoreElements()) {
                        pieces.add(parts.nextToken());
                    }
                }
            }
        }
        return pieces;
    }
}
