package org.languagetool.chunking;

import edu.washington.cs.knowitall.regex.Match;
import edu.washington.cs.knowitall.regex.RegularExpression;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import opennlp.tools.parser.Parse;
import org.languagetool.AnalyzedTokenReadings;
import org.languagetool.rules.patterns.StringMatcher;

/* loaded from: input_file:META-INF/jars/language-de-6.4.jar:org/languagetool/chunking/GermanChunker.class */
public class GermanChunker implements Chunker {
    /** Chunk tag names that are removed from a token before a rule running in overwrite mode adds its own tag. */
    private static final Set<String> FILTER_TAGS = new HashSet<>(Arrays.asList("PP", "NPP", "NPS"));
    /** Expression factory passed to {@code RegularExpression.compile} for every rule. */
    private static final TokenExpressionFactory FACTORY = new TokenExpressionFactory(false);
    /**
     * Finds rule elements of the form {@code <word|word...>} (optionally followed by {@code +}) that are
     * delimited by a space or by the start/end of the rule string. Group 2 is the text between the angle
     * brackets. Matching ignores case, including for non-ASCII letters.
     */
    private static final Pattern simpleFormRegexp =
            Pattern.compile("(^| )<([a-zäöüß|()\\[\\]?,]+)>\\+?( |$)", Pattern.CASE_INSENSITIVE | Pattern.UNICODE_CASE);
    /** Shorthand macros that {@code build} textually replaces in a rule string before compiling it. */
    private static final Map<String, String> SYNTAX_EXPANSION = new HashMap<>();
    /** When true, chunker input and every rule application that changes tags are printed to {@code System.out}. */
    private static boolean debug;
    /** Pre-computed form hints for the rules that are registered via {@code buildExpanded}. */
    private static final String[][] undOderBzw;
    /** Rules applied, in order, by {@code getBasicChunks}. */
    private static final List<RegularExpressionWithPhraseType> REGEXES1;
    /** Rules applied, in order, by {@code addChunkTags} after the basic chunks have been computed. */
    private static final List<RegularExpressionWithPhraseType> REGEXES2;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:META-INF/jars/language-de-6.4.jar:org/languagetool/chunking/GermanChunker$AffectedSpans.class */
    /**
     * The index ranges that one rule application matched. Used by the debug output to decide
     * which lines to highlight.
     */
    public static class AffectedSpans {
        /** Matched ranges; each one covers {@code startIndex} (inclusive) up to {@code endIndex} (exclusive). */
        final List<Span> spans;

        AffectedSpans(List<Span> list) {
            this.spans = list;
        }

        /**
         * Tells whether the given index lies inside at least one of the stored spans.
         *
         * @param i index to look up
         * @return {@code true} if some span satisfies {@code startIndex <= i < endIndex}
         */
        boolean isAffected(int i) {
            boolean covered = false;
            Iterator<Span> remaining = this.spans.iterator();
            // Stop at the first span that contains the index.
            while (!covered && remaining.hasNext()) {
                Span candidate = remaining.next();
                covered = candidate.startIndex <= i && i < candidate.endIndex;
            }
            return covered;
        }
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /* loaded from: input_file:META-INF/jars/language-de-6.4.jar:org/languagetool/chunking/GermanChunker$PhraseType.class */
    /**
     * Kind of phrase a chunking rule assigns to the tokens it matches. The constant order is kept
     * as declared originally.
     */
    public enum PhraseType {
        /** Written to tokens as {@code B-NP} for the first matched token and {@code I-NP} for the rest. */
        NP,
        /** Written to tokens as the tag {@code NPS} (presumably a singular noun phrase; confirm against the rule set). */
        NPS,
        /** Written to tokens as the tag {@code NPP} (presumably a plural noun phrase; confirm against the rule set). */
        NPP,
        /** Written to tokens as the tag {@code PP} (presumably a prepositional phrase; confirm against the rule set). */
        PP
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:META-INF/jars/language-de-6.4.jar:org/languagetool/chunking/GermanChunker$RegularExpressionWithPhraseType.class */
    /**
     * One chunking rule: a compiled token expression together with the phrase type it assigns,
     * the overwrite switch and the word forms that must occur in a sentence for the rule to be tried.
     */
    public static class RegularExpressionWithPhraseType {
        /** Compiled expression that is searched for in the token list. */
        final RegularExpression<ChunkTaggedToken> expression;
        /** Phrase type assigned to every token covered by a match. */
        final PhraseType phraseType;
        /** If true, tags listed in {@code FILTER_TAGS} are dropped from matched tokens before the new tag is added. */
        final boolean overwrite;
        /** Groups of word forms; the rule is only applied when each group has at least one form in the sentence. */
        final String[][] formHints;

        RegularExpressionWithPhraseType(RegularExpression<ChunkTaggedToken> regularExpression, PhraseType phraseType, boolean z, String[][] strArr) {
            this.formHints = strArr;
            this.overwrite = z;
            this.phraseType = phraseType;
            this.expression = regularExpression;
        }

        /** Renders the rule as {@code TYPE <= expression (overwrite: flag)}. */
        @Override
        public String toString() {
            StringBuilder text = new StringBuilder();
            text.append(this.phraseType);
            text.append(" <= ");
            text.append(this.expression);
            text.append(" (overwrite: ");
            text.append(this.overwrite);
            text.append(Parse.BRACKET_RRB);
            return text.toString();
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:META-INF/jars/language-de-6.4.jar:org/languagetool/chunking/GermanChunker$Span.class */
    /**
     * Immutable pair of indices describing one match. {@code AffectedSpans.isAffected} treats the
     * start as inclusive and the end as exclusive.
     */
    public static class Span {
        /** Index of the first element in the span. */
        final int startIndex;
        /** Index just past the last element in the span. */
        final int endIndex;

        /**
         * @param i  value stored as {@code startIndex}
         * @param i2 value stored as {@code endIndex}
         */
        Span(int i, int i2) {
            this.endIndex = i2;
            this.startIndex = i;
        }
    }

    /**
     * Switches debug output on or off. The flag is a static field, so the setting is shared by
     * all instances of this class. While it is on, the chunker prints its input and every rule
     * application that changed the tags to {@code System.out}.
     *
     * @param z {@code true} to enable debug output, {@code false} to disable it
     */
    public static void setDebug(boolean z) {
        debug = z;
    }

    /**
     * Reports whether debug output is currently enabled.
     *
     * @return the current value of the static debug flag
     */
    public static boolean isDebug() {
        return debug;
    }

    /**
     * Builds a rule that does not use overwrite mode; delegates to the three-argument
     * overload with {@code false}.
     *
     * @param str        rule expression, possibly containing macros from {@code SYNTAX_EXPANSION}
     * @param phraseType phrase type the rule assigns to matched tokens
     * @return the compiled rule
     */
    private static RegularExpressionWithPhraseType build(String str, PhraseType phraseType) {
        return build(str, phraseType, false);
    }

    /**
     * Builds a rule from an expression that may contain macros. Every key of
     * {@code SYNTAX_EXPANSION} found in the expression is replaced by its value, form hints are
     * derived from the expanded text, and the result is compiled.
     *
     * @param str        rule expression, possibly containing macros
     * @param phraseType phrase type the rule assigns to matched tokens
     * @param z          overwrite flag stored in the rule
     * @return the compiled rule
     */
    private static RegularExpressionWithPhraseType build(String str, PhraseType phraseType, boolean z) {
        String expanded = str;
        Iterator<Map.Entry<String, String>> macros = SYNTAX_EXPANSION.entrySet().iterator();
        while (macros.hasNext()) {
            Map.Entry<String, String> macro = macros.next();
            expanded = expanded.replace(macro.getKey(), macro.getValue());
        }
        // Hints are computed on the expanded text, i.e. after macro replacement.
        String[][] hints = calcFormHints(expanded);
        return buildExpanded(expanded, phraseType, z, hints);
    }

    /**
     * Compiles an expression as given (no macro replacement) and wraps it in a rule using the
     * supplied form hints.
     *
     * @param str        expression text passed to {@code RegularExpression.compile}
     * @param phraseType phrase type the rule assigns to matched tokens
     * @param z          overwrite flag stored in the rule
     * @param strArr     form hints stored in the rule
     * @return the compiled rule
     */
    private static RegularExpressionWithPhraseType buildExpanded(String str, PhraseType phraseType, boolean z, String[][] strArr) {
        RegularExpression<ChunkTaggedToken> compiled = RegularExpression.compile(str, FACTORY);
        return new RegularExpressionWithPhraseType(compiled, phraseType, z, strArr);
    }

    /**
     * Derives form hints from a rule expression. Each element matched by {@code simpleFormRegexp}
     * is handed to {@code StringMatcher}; when the matcher can enumerate its possible values, those
     * values become one hint group.
     *
     * @param str rule expression (macros already expanded by the caller)
     * @return one array of word forms per recognised element; empty if none was recognised
     */
    private static String[][] calcFormHints(String str) {
        List<String[]> hintGroups = new ArrayList<>();
        Matcher matcher = simpleFormRegexp.matcher(str);
        while (matcher.find()) {
            // group(2) is the text between '<' and '>'.
            Set<String> possibleValues = StringMatcher.create(matcher.group(2), true, false).getPossibleValues();
            if (possibleValues != null) {
                hintGroups.add(possibleValues.toArray(new String[0]));
            }
        }
        // The seed array must be a String[][]: a String[] seed cannot hold String[] elements
        // (ArrayStoreException) and cannot be cast to String[][] (ClassCastException).
        return hintGroups.toArray(new String[0][]);
    }

    /**
     * Computes chunk tags for a sentence and stores them on the given readings. The basic chunks
     * are built first, then every rule in {@code REGEXES2} is applied in list order, and finally the
     * resulting tags are written back to the readings.
     *
     * @param list the tokens of one sentence
     */
    @Override // org.languagetool.chunking.Chunker
    public void addChunkTags(List<AnalyzedTokenReadings> list) {
        Set<String> sentenceForms = allForms(list);
        List<ChunkTaggedToken> chunks = getBasicChunks(list, sentenceForms);
        for (RegularExpressionWithPhraseType rule : REGEXES2) {
            apply(rule, chunks, sentenceForms);
        }
        assignChunksToReadings(chunks);
    }

    /**
     * Computes the basic chunks for a sentence, collecting the sentence's word forms itself.
     *
     * @param list the tokens of one sentence
     * @return the non-whitespace tokens with the tags produced by the {@code REGEXES1} rules
     */
    List<ChunkTaggedToken> getBasicChunks(List<AnalyzedTokenReadings> list) {
        Set<String> sentenceForms = allForms(list);
        return getBasicChunks(list, sentenceForms);
    }

    /**
     * Wraps every non-whitespace token in a {@code ChunkTaggedToken} tagged {@code O} and then
     * applies the {@code REGEXES1} rules in list order. With debug enabled, the initial token list
     * is printed before any rule runs.
     *
     * @param list the tokens of one sentence
     * @param set  all word forms of the sentence, used to skip rules whose form hints are absent
     * @return the tagged tokens after all {@code REGEXES1} rules have been applied
     */
    private List<ChunkTaggedToken> getBasicChunks(List<AnalyzedTokenReadings> list, Set<String> set) {
        List<ChunkTaggedToken> chunks = new ArrayList<>();
        for (AnalyzedTokenReadings readings : list) {
            if (readings.isWhitespace()) {
                continue;
            }
            String token = readings.getToken();
            List<ChunkTag> outsideTag = Collections.singletonList(new ChunkTag("O"));
            chunks.add(new ChunkTaggedToken(token, outsideTag, readings));
        }
        if (debug) {
            System.out.println("=============== CHUNKER INPUT ===============");
            System.out.println(getDebugString(chunks));
        }
        for (RegularExpressionWithPhraseType rule : REGEXES1) {
            apply(rule, chunks, set);
        }
        return chunks;
    }

    /**
     * Applies one rule to the token list, provided the sentence contains the rule's form hints.
     * If the debug rendering of the list differs before and after, the change is printed.
     *
     * @param regularExpressionWithPhraseType rule to apply
     * @param list                            tagged tokens, updated in place
     * @param set                             all word forms of the sentence
     * @throws RuntimeException wrapping any exception raised while applying the rule
     */
    private void apply(RegularExpressionWithPhraseType regularExpressionWithPhraseType, List<ChunkTaggedToken> list, Set<String> set) {
        if (!hasAllFormHints(regularExpressionWithPhraseType, set)) {
            // A required word form is missing, so the rule is not tried at all.
            return;
        }
        String before = getDebugString(list);
        try {
            AffectedSpans touched = doApplyRegex(regularExpressionWithPhraseType, list);
            String after = getDebugString(list);
            boolean changed = !after.equals(before);
            if (changed) {
                printDebugInfo(regularExpressionWithPhraseType, touched, after);
            }
        } catch (Exception e) {
            throw new RuntimeException("Could not apply chunk regexp '" + regularExpressionWithPhraseType + "' to tokens: " + list, e);
        }
    }

    /**
     * Checks whether every hint group of the rule is represented in the sentence.
     *
     * @param regularExpressionWithPhraseType rule whose {@code formHints} are checked
     * @param set                             all word forms of the sentence
     * @return {@code true} if each hint group has at least one form in {@code set}
     *         (also when the rule has no hint groups)
     */
    private static boolean hasAllFormHints(RegularExpressionWithPhraseType regularExpressionWithPhraseType, Set<String> set) {
        String[][] hintGroups = regularExpressionWithPhraseType.formHints;
        boolean allPresent = true;
        for (int group = 0; allPresent && group < hintGroups.length; group++) {
            allPresent = hasForm(set, hintGroups[group]);
        }
        return allPresent;
    }

    /**
     * Checks whether at least one of the given forms is contained in the set.
     *
     * @param set    word forms to search in
     * @param strArr candidate forms
     * @return {@code true} as soon as one candidate is found; {@code false} if none is, or if
     *         there are no candidates
     */
    private static boolean hasForm(Set<String> set, String[] strArr) {
        int position = 0;
        while (position < strArr.length) {
            if (set.contains(strArr[position])) {
                return true;
            }
            position++;
        }
        return false;
    }

    /**
     * Collects the token text of every element of the sentence into a set that compares
     * strings case-insensitively.
     *
     * @param list the tokens of one sentence
     * @return sorted, case-insensitive set of all token strings
     */
    private static Set<String> allForms(List<AnalyzedTokenReadings> list) {
        Set<String> forms = new TreeSet<>(String.CASE_INSENSITIVE_ORDER);
        for (AnalyzedTokenReadings readings : list) {
            forms.add(readings.getToken());
        }
        return forms;
    }

    /**
     * Copies the chunk tags of each tagged token onto the readings object it carries.
     * Tokens without readings are skipped.
     *
     * @param list tagged tokens whose tags are written back
     */
    private void assignChunksToReadings(List<ChunkTaggedToken> list) {
        for (ChunkTaggedToken tagged : list) {
            AnalyzedTokenReadings target = tagged.getReadings();
            if (target == null) {
                continue;
            }
            target.setChunkTags(tagged.getChunkTags());
        }
    }

    /**
     * Finds all matches of the rule in the token list and re-tags every token inside a match.
     * For each covered token the existing tags are copied, the {@code FILTER_TAGS} are dropped if
     * the rule is in overwrite mode, and the rule's tag is added unless already present. When the
     * tag is added, the {@code O} tag is removed. The token in the list is replaced by a new
     * instance carrying the updated tags.
     *
     * @param regularExpressionWithPhraseType rule to apply
     * @param list                            tagged tokens, updated in place
     * @return the index ranges of all matches
     */
    private AffectedSpans doApplyRegex(RegularExpressionWithPhraseType regularExpressionWithPhraseType, List<ChunkTaggedToken> list) {
        List<Span> matchedSpans = new ArrayList<>();
        for (Match<ChunkTaggedToken> match : regularExpressionWithPhraseType.expression.findAll(list)) {
            matchedSpans.add(new Span(match.startIndex(), match.endIndex()));
            for (int pos = match.startIndex(); pos < match.endIndex(); pos++) {
                ChunkTaggedToken current = list.get(pos);
                List<ChunkTag> tags = new ArrayList<>(current.getChunkTags());
                if (regularExpressionWithPhraseType.overwrite) {
                    // Overwrite mode: discard previously assigned PP/NPP/NPS tags.
                    Iterator<ChunkTag> existing = tags.iterator();
                    while (existing.hasNext()) {
                        if (FILTER_TAGS.contains(existing.next().getChunkTag())) {
                            existing.remove();
                        }
                    }
                }
                ChunkTag assigned = getChunkTag(regularExpressionWithPhraseType, match, pos);
                boolean alreadyTagged = tags.contains(assigned);
                if (!alreadyTagged) {
                    tags.add(assigned);
                    // The token now belongs to a chunk, so it is no longer "outside".
                    tags.remove(new ChunkTag("O"));
                }
                ChunkTaggedToken updated = new ChunkTaggedToken(current.getToken(), tags, current.getReadings());
                list.set(pos, updated);
            }
        }
        return new AffectedSpans(matchedSpans);
    }

    /**
     * Chooses the tag for the token at index {@code i} of a match. For rules of type
     * {@code NP} the first token of the match gets {@code B-NP} and all others {@code I-NP};
     * for every other type the tag is the type's name.
     *
     * @param regularExpressionWithPhraseType rule that produced the match
     * @param match                           the match containing the token
     * @param i                               index of the token in the token list
     * @return the tag to add to the token
     */
    private ChunkTag getChunkTag(RegularExpressionWithPhraseType regularExpressionWithPhraseType, Match<ChunkTaggedToken> match, int i) {
        if (regularExpressionWithPhraseType.phraseType != PhraseType.NP) {
            return new ChunkTag(regularExpressionWithPhraseType.phraseType.name());
        }
        if (i == match.startIndex()) {
            return new ChunkTag("B-NP");
        }
        return new ChunkTag("I-NP");
    }

    /**
     * Prints the applied rule and the token list after its application to {@code System.out}.
     * Lines whose position falls inside a matched span have their leading two spaces replaced
     * by {@code " *"}.
     *
     * @param regularExpressionWithPhraseType the rule that was applied
     * @param affectedSpans                   index ranges matched by the rule
     * @param str                             debug rendering of the token list, one token per line
     */
    private void printDebugInfo(RegularExpressionWithPhraseType regularExpressionWithPhraseType, AffectedSpans affectedSpans, String str) {
        System.out.println("=== Applied " + regularExpressionWithPhraseType + " ===");
        if (regularExpressionWithPhraseType.overwrite) {
            System.out.println("Note: overwrite mode, replacing old " + FILTER_TAGS + " tags");
        }
        String[] lines = str.split("\n");
        for (int lineNo = 0; lineNo < lines.length; lineNo++) {
            String line = lines[lineNo];
            // Line numbers correspond to token indices because each token is rendered on one line.
            String shown = affectedSpans.isAffected(lineNo) ? line.replaceFirst("^  ", " *") : line;
            System.out.println(shown);
        }
        System.out.println();
    }

    /**
     * Renders the token list for debug output, one token per line: two spaces, the tagged token,
     * {@code " -- "} and the token's readings with the first {@code token[} prefix shortened to
     * the constant {@code Parse.BRACKET_LSB}. When debug is off, nothing is rendered.
     *
     * @param list tagged tokens to render
     * @return the rendering, or an empty string if debug output is disabled
     */
    private String getDebugString(List<ChunkTaggedToken> list) {
        if (debug) {
            StringBuilder rendered = new StringBuilder();
            for (ChunkTaggedToken tagged : list) {
                rendered.append("  ").append(tagged).append(" -- ");
                // Quote the token so that regex metacharacters in it are matched literally.
                String tokenPrefix = Pattern.quote(tagged.getToken()) + "\\[";
                String readingsText = tagged.getReadings().toString().replaceFirst(tokenPrefix, Parse.BRACKET_LSB);
                rendered.append(readingsText).append('\n');
            }
            return rendered.toString();
        }
        return "";
    }

    /*
     * Registers the rule macros and builds both rule lists. The order of the rules matters:
     * each list is applied sequentially, and later rules can test tags (e.g. chunk=NPS) that
     * earlier rules have assigned.
     */
    static {
        // "<NP>" stands for one B-NP token followed by any number of I-NP tokens.
        SYNTAX_EXPANSION.put("<NP>", "<chunk=B-NP> <chunk=I-NP>*");
        // "&prozent;" stands for an alternation of unit words.
        SYNTAX_EXPANSION.put("&prozent;", "Prozent|Kilo|Kilogramm|Gramm|Euro|Pfund");
        debug = false;
        // Hint group for the buildExpanded rules below: they are only tried when the sentence
        // contains at least one of these words.
        undOderBzw = new String[][] {{"und", "oder", "bzw"}};
        // Rules applied by getBasicChunks.
        REGEXES1 = Arrays.asList(
            build("(<posre=^ART.*>|<pos=PRO>)? <pos=ADV>* <pos=PA2>* <pos=ADJ>* <pos=SUB>+", PhraseType.NP),
            buildExpanded("<pos=SUB> (<und|oder>|(<bzw> <.>)) <pos=SUB>", PhraseType.NP, false, undOderBzw),
            buildExpanded("<pos=ADJ> (<und|oder>|(<bzw> <.>)) <pos=PA2> <pos=SUB>", PhraseType.NP, false, undOderBzw),
            buildExpanded("<pos=ADJ> (<und|oder>|(<bzw> <.>)) <pos=ADJ> <pos=SUB>", PhraseType.NP, false, undOderBzw),
            build("<posre=^ART.*> <pos=ADV>* <pos=ADJ>* <regexCS=[A-ZÖÄÜ][a-zöäü]+>", PhraseType.NP),
            build("<pos=PRO>? <pos=ZAL> <pos=SUB>", PhraseType.NP),
            build("<Herr|Herrn|Frau> <pos=EIG>+", PhraseType.NP),
            build("<Herr|Herrn|Frau> <regexCS=[A-ZÖÄÜ][a-zöäü-]+>+", PhraseType.NP),
            build("<der>", PhraseType.NP));
        // Rules applied by addChunkTags on top of the basic chunks.
        REGEXES2 = Arrays.asList(
            build("<pos=ADJ> <,> <chunk=B-NP> <chunk=I-NP>* <und|sowie> <NP>", PhraseType.NPP),
            build("<chunk=B-NP & !regex=jede[rs]?> <chunk=I-NP>* <und|sowie> <pos=ADV>? <NP>", PhraseType.NPP),
            build("<pos=ADJ> <und|sowie> <chunk=B-NP & !pos=PLU> <chunk=I-NP>*", PhraseType.NPS, true),
            build("<deren> <chunk=B-NP & !pos=PLU> <und|sowie> <chunk=B-NP>*", PhraseType.NPS, true),
            build("<pos=EIG> <und> <pos=EIG>", PhraseType.NPP),
            build("<pos=ART> <pos=ADJ> <und|sowie> (<pos=ADJ>|<pos=PA2>) <chunk=I-NP & !pos=PLU>+", PhraseType.NPS, true),
            build("<chunk=B-NP & !pos=PLU> <chunk=I-NP>* <und|sowie> <keine> <chunk=I-NP>+", PhraseType.NPS, true),
            build("<NP> <und|sowie> <pos=ART> <pos=PA1> <pos=SUB>", PhraseType.NPP, true),
            build("<eins|eines> <chunk=B-NP> <chunk=I-NP>+", PhraseType.NPS),
            build("<ich|du|er|sie|es|wir|ihr|sie> <und|oder|sowie> <NP>", PhraseType.NPP),
            build("<sowohl> <NP> <als> <auch> <NP>", PhraseType.NPP),
            build("<sowohl> <pos=EIG> <als> <auch> <pos=EIG>", PhraseType.NPP),
            build("<sowohl> <ich|du|er|sie|es|wir|ihr|sie> <als> <auch> <NP>", PhraseType.NPP),
            build("<pos=SUB> <und|oder|sowie> <chunk=B-NP & !ihre> <chunk=I-NP>*", PhraseType.NPP),
            build("<weder> <pos=SUB> <noch> <pos=SUB>", PhraseType.NPP),
            build("<zwei|drei|vier|fünf|sechs|sieben|acht|neun|zehn|elf|zwölf> <chunk=I-NP>", PhraseType.NPP),
            build("<chunk=B-NP> <pos=PRP> <NP> <chunk=B-NP & pos=SIN> <chunk=I-NP>*", PhraseType.NPS),
            build("<chunk=B-NP> <pos=PRP> <NP> <chunk=B-NP & pos=PLU> <chunk=I-NP>*", PhraseType.NPP),
            build("<chunk=B-NP> <pos=PRP> <NP> <pos=PA2> <chunk=B-NP & !pos=PLU> <chunk=I-NP>*", PhraseType.NPS),
            build("<chunk=B-NP> <pos=PRP> <NP> <pos=PA2> <chunk=B-NP & !pos=SIN> <chunk=I-NP>*", PhraseType.NPP),
            build("<Herr|Frau> <und> <Herr|Frau> <pos=EIG>*", PhraseType.NPP),
            build("<chunk=B-NP & !pos=ZAL & !pos=PLU & !chunk=NPP & !einige & !(regex=&prozent;)> <chunk=I-NP & !pos=PLU & !und>*", PhraseType.NPS),
            build("<chunk=B-NP & !pos=SIN & !chunk=NPS & !Ellen> <chunk=I-NP & !pos=SIN>*", PhraseType.NPP),
            build("<chunk=NPS> <pos=PRO> <pos=ADJ> <pos=ADJ> <NP>", PhraseType.NPS),
            build("<regex=eine[rs]?> <der> <am> <pos=ADJ> <pos=PA2> <NP>", PhraseType.NPS),
            build("<regex=eine[rs]?> <der> <beiden> <pos=ADJ>* <pos=SUB>", PhraseType.NPS),
            build("<regex=eine[rs]?> <seiner|ihrer> <pos=PA1> <pos=SUB>", PhraseType.NPS),
            build("<regex=[\\d,.]+> <&prozent;>", PhraseType.NPS),
            build("<regex=[\\d,.]+> <&prozent;>", PhraseType.NPP),
            build("<dass> <sie> <wie> <NP>", PhraseType.NPP),
            build("<pos=PLU> <die> <Regel>", PhraseType.NPP),
            build("<chunk=B-NP & pos=SIN> <chunk=I-NP & pos=SIN>* <,> <die> <pos=ADV>+ <chunk=NPS>+", PhraseType.NPS),
            build("<chunk=B-NP & pos=PLU> <chunk=I-NP & pos=PLU>* <,> <die> <pos=ADV>+ <chunk=NPS>+", PhraseType.NPP),
            build("<der|die|das> <pos=ADJ> <der> <pos=PA1> <pos=SUB>", PhraseType.NPS),
            build("<pos=SUB & pos=PLU> <der> <pos=PA1> <pos=SUB>", PhraseType.NPP),
            build("<der|die|das> <pos=ADJ> <der> <pos=PRO>? <pos=SUB>", PhraseType.NPS),
            build("<chunk=NPS & !einige> <chunk=NPP & (pos=GEN |pos=ZAL)>+", PhraseType.NPS, true),
            build("<chunk=NPP> <chunk=NPS & pos=GEN>+", PhraseType.NPP, true),
            build("<chunk=NPS>+ <und> <chunk=NP[SP] & (pos=GEN | pos=ADV)>+", PhraseType.NPS, true),
            build("<chunk=NPS>+ <der> <pos=ADV> <pos=PA2> <chunk=I-NP>", PhraseType.NPS, true),
            build("<chunk=NPS>+ <der> (<pos=ADJ>|<pos=ZAL>) <NP>", PhraseType.NPS, true),
            build("<chunk=NPS>+ <der> <NP>", PhraseType.NPS, true),
            build("<chunk=NPS>+ <der> <pos=ADJ> <pos=ADV> <pos=PA2> <NP>", PhraseType.NPS, true),
            build("<chunk=NPS>+ <pos=PRO:POS> <pos=ADJ> <NP>", PhraseType.NPS, true),
            build("<der|das> <pos=ADJ> <der> <pos=ZAL> <NP>", PhraseType.NPS, true),
            build("<eine> <menge> <NP>+", PhraseType.NPP, true),
            build("<er|sie|es> <und> <NP> <NP>", PhraseType.NPP),
            build("<laut> <regex=.*>{0,3} <Quellen>", PhraseType.PP, true),
            build("<pos=PRP> <pos=ART:> <pos=ADV>* <pos=ADJ> <NP>", PhraseType.PP, true),
            build("<pos=PRP> <chunk=NPP>+ <,> <NP>", PhraseType.PP, true),
            build("<pos=PRP> <chunk=NPP>+", PhraseType.PP, true),
            build("<pos=PRP> <der> <chunk=NPP>+", PhraseType.PP),
            build("<pos=PRP> <NP>", PhraseType.PP),
            build("<pos=PRP> <NP> <pos=ADJ> <und|oder|bzw.> <NP>", PhraseType.PP),
            build("<pos=PRP> (<NP>)+", PhraseType.PP),
            build("<pos=PRP> <chunk=B-NP> <pos=ADV> <NP>", PhraseType.PP),
            build("<pos=PRP> <pos=ADV> <pos=ZAL> <chunk=B-NP>", PhraseType.PP),
            build("<pos=PRP> <pos=PRO> <NP>", PhraseType.PP),
            build("<pos=PRP> <pos=ADJ> <und|oder|sowie> <NP>", PhraseType.PP),
            build("<pos=PRP> <pos=ADV> <regex=\\d+> <NP>", PhraseType.PP),
            build("<pos=PRP> <pos=PA1> <NP>", PhraseType.PP),
            build("<pos=PRP> <pos=ADJ> <pos=PA1> <NP>", PhraseType.PP),
            build("<pos=PRP> <NP> <NP> <und|oder> <NP>", PhraseType.PP),
            build("<pos=PRP> <pos=ADV> <pos=ADJ> <NP>", PhraseType.PP),
            build("<pos=PRP> <pos=ADJ:PRD:GRU> <pos=ZAL> <NP>", PhraseType.PP),
            build("<die> <pos=ADJ> <Sekunden|Minuten|Stunden|Tage|Wochen|Monate|Jahre|Jahrzehnte|Jahrhunderte> (<NP>)?", PhraseType.PP),
            build("<die> <pos=ADJ> <pos=ZAL> <Sekunden|Minuten|Stunden|Tage|Wochen|Monate|Jahre|Jahrzehnte|Jahrhunderte> (<NP>)?", PhraseType.PP),
            build("<regex=(vor)?letzte[sn]?> <Woche|Monat|Jahr|Jahrzehnt|Jahrhundert>", PhraseType.PP),
            build("<für> <in> <pos=EIG> <pos=PA1> <pos=SUB> <und> <pos=SUB>", PhraseType.PP, true),
            build("<chunk=NPP> <zwischen> <pos=EIG> <und|sowie> <NP>", PhraseType.NPP),
            build("<,> <die|welche> <NP> <chunk=NPS & pos=GEN>+", PhraseType.NPP),
            build("<NP> <,> <NP> <,> <NP>", PhraseType.NPP),
            build("<NP> <,> <NP> <,> <wie> <auch> <chunk=NPS>+", PhraseType.NPP));
    }
}
