package org.languagetool.language.identifier;

import com.google.protobuf.DescriptorProtos;
import com.optimaize.langdetect.LanguageDetector;
import com.optimaize.langdetect.LanguageDetectorBuilder;
import com.optimaize.langdetect.ngram.NgramExtractors;
import com.optimaize.langdetect.profiles.LanguageProfile;
import com.optimaize.langdetect.profiles.LanguageProfileReader;
import com.optimaize.langdetect.text.RemoveMinorityScriptsTextFilter;
import com.optimaize.langdetect.text.TextObjectFactory;
import com.optimaize.langdetect.text.TextObjectFactoryBuilder;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.concurrent.atomic.AtomicInteger;
import opennlp.tools.parser.Parse;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.languagetool.DetectedLanguage;
import org.languagetool.JLanguageTool;
import org.languagetool.Language;
import org.languagetool.Languages;
import org.languagetool.language.identifier.LanguageIdentifier;
import org.languagetool.language.identifier.detector.FastTextDetector;
import org.languagetool.language.identifier.detector.NGramDetector;
import org.languagetool.noop.NoopLanguage;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:META-INF/jars/languagetool-core-6.4.jar:org/languagetool/language/identifier/DefaultLanguageIdentifier.class */
public class DefaultLanguageIdentifier extends LanguageIdentifier {
    /** Minimal confidence handed to the optimaize {@code LanguageDetectorBuilder} in the constructor. */
    private static final double MINIMAL_CONFIDENCE = 0.9d;
    /** Text length threshold (50) — presumably the short-text cut-off used when choosing the detection algorithm; TODO confirm. */
    private static final int SHORT_ALGO_THRESHOLD = 50;
    /** Text length threshold (50) — presumably the length up to which only preferred languages are considered; TODO confirm. */
    private static final int CONSIDER_ONLY_PREFERRED_THRESHOLD = 50;
    /** A fastText top score below this value causes common-word counts to be added to the scores. */
    private static final float FASTTEXT_CONFIDENCE_THRESHOLD = 0.85f;
    /** optimaize detector; used by {@code detectLanguageCode} when neither fastText nor ngram produced a result. */
    private final LanguageDetector languageDetector;
    /** Builds the filtered text object that is fed to {@link #languageDetector} in the fallback path. */
    private final TextObjectFactory textObjectFactory;
    /** Counts fastText restart attempts made by {@code reinitFasttextAfterFailure}. */
    private final AtomicInteger fasttextInitCounter;
    /** fastText-based detector; {@code null} until enabled via {@code enableFasttext} or {@code setFastTextDetector}. */
    private FastTextDetector fastTextDetector;
    /** ngram-based detector; {@code null} until enabled via {@code enableNgrams}. */
    private NGramDetector ngram;
    private static final Logger logger = LoggerFactory.getLogger((Class<?>) DefaultLanguageIdentifier.class);
    /** Short codes that are never offered to the optimaize profile reader (see {@code getLanguageCodes}). */
    private static final List<String> ignoreLangCodes = Arrays.asList("ast", "gl");
    /** Short codes whose profiles are loaded from the resource directory instead (see {@code loadProfiles}). */
    private static final List<String> externalLangCodes = Arrays.asList("eo", "crh");

    /**
     * Creates an identifier with the default limit of 1000 that is forwarded to
     * {@link #DefaultLanguageIdentifier(int)}.
     *
     * <p>The value is written as a plain literal: the decompiler had substituted an unrelated
     * protobuf constant that merely happens to have the same value.
     *
     * <p>Decompiler note: this constructor was originally package-private.
     */
    public DefaultLanguageIdentifier() {
        this(1000);
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    public DefaultLanguageIdentifier(int i) {
        super(i);
        this.fasttextInitCounter = new AtomicInteger(0);
        try {
            this.languageDetector = LanguageDetectorBuilder.create(NgramExtractors.standard()).minimalConfidence(MINIMAL_CONFIDENCE).shortTextAlgorithm(50).withProfiles(loadProfiles(getLanguageCodes())).build();
            this.textObjectFactory = new TextObjectFactoryBuilder().maxTextLength(10000).withTextFilter(LanguageIdentifier.REMOVE_URL_FILTER).withTextFilter(RemoveMinorityScriptsTextFilter.forThreshold(0.3d)).withTextFilter(LanguageIdentifier.REMOVE_EMAIL_SIGNATURE_FILTER).withTextFilter(LanguageIdentifier.REMOVE_MENTION_FILTER).withTextFilter(LanguageIdentifier.REMOVE_NON_BREAKING_SPACES_FILTER).build();
        } catch (IOException e) {
            throw new RuntimeException("Could not set up language identifier", e);
        }
    }

    /**
     * Starts the fastText detector, or logs a warning and does nothing when it is not configured.
     *
     * <p>Decompiler note: this method was originally package-private.
     *
     * @param file  the fastText binary (as named in the log message below)
     * @param file2 the fastText model (as named in the log message below)
     * @throws RuntimeException if the fastText detector cannot be created
     */
    public void enableFasttext(File file, File file2) {
        boolean configured = file != null && file2 != null;
        if (configured) {
            try {
                // Note the argument order: the detector takes the model first, then the binary.
                this.fastTextDetector = new FastTextDetector(file2, file);
                logger.info("Started fastText process for language identification: Binary {} with model @ {}", file, file2);
            } catch (IOException e) {
                throw new RuntimeException("Could not start fasttext process for language identification @ " + file + " with model @ " + file2, e);
            }
        } else {
            logger.warn("fastText not configured - language detection performance will be degraded. See https://dev.languagetool.org/http-server#starting-from-command-line for instructions.");
        }
    }

    /**
     * Replaces the fastText detector used by this identifier.
     *
     * @param fastTextDetector the detector to use; {@code null} makes {@link #isFastTextEnabled()} return false
     */
    public void setFastTextDetector(FastTextDetector fastTextDetector) {
        this.fastTextDetector = fastTextDetector;
    }

    /**
     * Returns the live counter that is modified whenever a fastText restart is attempted after a failure.
     *
     * @return the counter instance itself, not a copy
     */
    public AtomicInteger getFasttextInitCounter() {
        return this.fasttextInitCounter;
    }

    /**
     * Tells whether a fastText detector is currently set.
     *
     * @return {@code true} if the fastText detector field is non-null
     */
    public boolean isFastTextEnabled() {
        return this.fastTextDetector != null;
    }

    /**
     * Loads ngram data and enables the ngram detector; does nothing when no location is given.
     *
     * <p>Decompiler note: this method was originally package-private.
     *
     * @param file location of the ngram data, or {@code null} to leave ngram detection disabled
     * @throws RuntimeException if the ngram data cannot be loaded
     */
    public void enableNgrams(File file) {
        if (file == null) {
            return;
        }
        try {
            logger.info("Loading ngram data for language identification from " + file + "...");
            this.ngram = new NGramDetector(file, 50);
            logger.info("Loaded ngram data for language identification from " + file);
        } catch (IOException e) {
            throw new RuntimeException("Could not load ngram data language identification from " + file, e);
        }
    }

    /**
     * Collects the language codes whose profiles are requested from the optimaize profile reader.
     *
     * <p>Variants and the codes listed in {@code ignoreLangCodes} / {@code externalLangCodes} are
     * skipped. The code {@code "zh"} is expanded to {@code "zh-CN"} and {@code "zh-TW"}; every
     * other code is added once, in the order the languages are returned by {@code Languages.get()}.
     *
     * @return a new mutable list of codes
     */
    private static List<String> getLanguageCodes() {
        List<String> codes = new ArrayList<>();
        for (Language language : Languages.get()) {
            String shortCode = language.getShortCode();
            if (language.isVariant() || ignoreLangCodes.contains(shortCode) || externalLangCodes.contains(shortCode)) {
                continue;
            }
            if ("zh".equals(shortCode)) {
                codes.add("zh-CN");
                codes.add("zh-TW");
            } else if (!codes.contains(shortCode)) {
                codes.add(shortCode);
            }
        }
        return codes;
    }

    /**
     * Reads the language profiles for the given codes and appends the profiles of the
     * {@code externalLangCodes} that exist in the resource directory.
     *
     * @param list language codes to read through {@code LanguageProfileReader}
     * @return the list returned by the reader, extended by the external profiles that were found
     * @throws IOException if a profile cannot be read or its stream cannot be closed
     */
    private List<LanguageProfile> loadProfiles(List<String> list) throws IOException {
        List<LanguageProfile> profiles = new LanguageProfileReader().read(list);
        for (String code : externalLangCodes) {
            String profilePath = "/" + code + "/" + code + ".profile";
            if (!JLanguageTool.getDataBroker().resourceExists(profilePath)) {
                continue;
            }
            // try-with-resources closes the stream on every path and keeps a close() failure
            // as a suppressed exception instead of masking the original one.
            try (InputStream profileStream = JLanguageTool.getDataBroker().getFromResourceDirAsStream(profilePath)) {
                profiles.add(new LanguageProfileReader().read(profileStream));
            }
        }
        return profiles;
    }

    /**
     * Detects the language of a text without any additional or preferred languages.
     *
     * @param str the text to analyze
     * @return the detected language, or {@code null} if nothing was detected
     */
    @Override // org.languagetool.language.identifier.LanguageIdentifier
    @Nullable
    public Language detectLanguage(String str) {
        DetectedLanguage best = detectLanguage(str, Collections.emptyList(), Collections.emptyList());
        return best != null ? best.getDetectedLanguage() : null;
    }

    /**
     * Detects the language of a text; delegates to the four-argument overload with the
     * boolean flag set to {@code false}.
     *
     * @param str   the text to analyze
     * @param list  first language-code list, forwarded unchanged
     * @param list2 second language-code list, forwarded unchanged
     * @return the result of the four-argument overload
     */
    @Override // org.languagetool.language.identifier.LanguageIdentifier
    public DetectedLanguage detectLanguage(String str, List<String> list, List<String> list2) {
        return detectLanguage(str, list, list2, false);
    }

    /**
     * Detects the single best language of a text.
     *
     * @param str   the text to analyze
     * @param list  first language-code list, forwarded unchanged to {@code getDetectedLanguageScores}
     * @param list2 second language-code list, forwarded unchanged to {@code getDetectedLanguageScores}
     * @param z     forwarded unchanged to {@code getDetectedLanguageScores}
     * @return the first entry of the score list requested with a count of 1, or {@code null} if that list is empty
     */
    @Override // org.languagetool.language.identifier.LanguageIdentifier
    @Nullable
    public DetectedLanguage detectLanguage(String str, List<String> list, List<String> list2, boolean z) {
        // Ask for exactly one score and unwrap it; an empty list maps to null.
        return getDetectedLanguageScores(str, list, list2, z, 1).stream().findFirst().orElse(null);
    }

    /**
     * Scores the candidate languages of a text and returns them as {@link DetectedLanguage} objects.
     *
     * <p>Flow, as implemented below:
     * <ol>
     *   <li>If an ngram or fastText detector is enabled, one of them scores the text: ngram when it is
     *       enabled and the text has at most {@code SHORT_ALGO_THRESHOLD} characters or fastText is
     *       not enabled; fastText otherwise.</li>
     *   <li>If fastText's best score is below {@code FASTTEXT_CONFIDENCE_THRESHOLD}, or the best
     *       result is the no-op language, common-word counts are added to the scores.</li>
     *   <li>Danish is dropped when Norwegian is preferred and Danish is not.</li>
     *   <li>For short texts, or when {@code z} is set, the scores are restricted to the preferred
     *       languages.</li>
     *   <li>If no detector is enabled, or the detector step threw, the optimaize detector is used
     *       on the filtered text as a fallback.</li>
     *   <li>If nothing detectable remains, the first preferred language (when supported) is
     *       returned with a confidence of 0.1.</li>
     * </ol>
     *
     * @param str   the text to analyze
     * @param list  passed to {@code prepareDetectLanguage} — presumably the additional (no-op)
     *              language codes; TODO confirm against LanguageIdentifier
     * @param list2 passed to {@code prepareDetectLanguage} — presumably the preferred language
     *              codes; TODO confirm against LanguageIdentifier
     * @param z     when {@code true}, scores are restricted to the preferred languages regardless
     *              of the text length
     * @param i     when greater than 1, the entries of {@code getOrderedScores(scores, i)} are
     *              returned; otherwise only the highest scoring result
     * @return a list of detected languages, possibly empty, never {@code null}
     */
    @Override // org.languagetool.language.identifier.LanguageIdentifier
    @NotNull
    public List<DetectedLanguage> getDetectedLanguageScores(String str, List<String> list, List<String> list2, boolean z, int i) {
        String text = str;
        LanguageIdentifier.ParsedLanguageLists parsedLists = prepareDetectLanguage(text, list, list2);
        if (parsedLists == null) {
            return Collections.singletonList(new DetectedLanguage(null, new NoopLanguage()));
        }
        List<String> additionalLangs = parsedLists.getAdditionalLangs();
        List<String> preferredLangs = parsedLists.getPreferredLangs();
        Map<String, Double> scores = null;
        boolean detectorFailed = false;
        // Human-readable trace of the steps that produced the result, e.g. "fasttext+commonwords".
        String source = "";
        if (this.fastTextDetector != null || this.ngram != null) {
            try {
                boolean usedFasttext = false;
                if ((text.length() <= SHORT_ALGO_THRESHOLD || this.fastTextDetector == null) && this.ngram != null) {
                    scores = this.ngram.detectLanguages(text.trim(), additionalLangs);
                    source += "ngram";
                } else {
                    usedFasttext = true;
                    scores = this.fastTextDetector.runFasttext(text, additionalLangs);
                    source += "fasttext";
                }
                Map.Entry<String, Double> best = getHighestScoringResult(scores);
                // NOTE(review): if the best entry can have a null key, getKey().equals(...) throws and
                // the catch-all below routes to the fallback detector — confirm this is intended.
                if ((usedFasttext && best.getValue().floatValue() < FASTTEXT_CONFIDENCE_THRESHOLD) || best.getKey().equals(NoopLanguage.SHORT_CODE)) {
                    Map<Language, Integer> knownWordsPerLanguage = COMMON_WORDS_LANG_IDENTIFIER.getKnownWordsPerLanguage(text);
                    // Only the first language seen for each short code contributes its count.
                    HashSet<String> seenCodes = new HashSet<>();
                    for (Map.Entry<Language, Integer> knownWords : knownWordsPerLanguage.entrySet()) {
                        String shortCode = knownWords.getKey().getShortCode();
                        if (!seenCodes.add(shortCode)) {
                            continue;
                        }
                        double wordCount = knownWords.getValue().doubleValue();
                        if (scores.containsKey(shortCode)) {
                            scores.put(shortCode, scores.get(shortCode) + wordCount);
                        } else {
                            scores.put(shortCode, wordCount);
                        }
                    }
                    source += "+commonwords";
                }
                if (preferredLangs.contains("no") && !preferredLangs.contains("da")) {
                    scores.keySet().removeIf(code -> code.equals("da"));
                }
                if (!preferredLangs.isEmpty() && (text.length() <= CONSIDER_ONLY_PREFERRED_THRESHOLD || z)) {
                    boolean removedAny = scores.keySet().removeIf(code -> !preferredLangs.contains(code));
                    if (removedAny && scores.isEmpty() && z) {
                        logger.warn("No language detected for text after remove all not preferred languages from score.");
                    }
                    source += "+prefLang(forced: " + z + ")";
                }
            } catch (FastTextDetector.FastTextException e) {
                detectorFailed = true;
                if (e.isDisabled()) {
                    reinitFasttextAfterFailure(e);
                } else {
                    logger.error("Fasttext failed, fallback used", e);
                }
            } catch (Exception e) {
                detectorFailed = true;
                reinitFasttextAfterFailure(e);
            }
        }
        if ((this.fastTextDetector == null && this.ngram == null) || detectorFailed) {
            // Fallback: run the optimaize detector on the filtered text.
            text = this.textObjectFactory.forText(text).toString();
            source += "+fallback";
            if (scores == null) {
                scores = new HashMap<>();
            }
            Map.Entry<String, Double> fallbackResult = detectLanguageCode(text, preferredLangs, z);
            if (fallbackResult != null) {
                scores.put(fallbackResult.getKey(), fallbackResult.getValue());
            }
            if (!additionalLangs.isEmpty()) {
                logger.warn("Cannot consider noopLanguages because not in fastText mode: {}", additionalLangs);
            }
        }
        List<DetectedLanguage> result = new ArrayList<>();
        if (i > 1) {
            for (Map.Entry<String, Double> entry : getOrderedScores(scores, i).entrySet()) {
                if (entry.getKey() != null && LanguageIdentifierService.INSTANCE.canLanguageBeDetected(entry.getKey(), additionalLangs)) {
                    // Scores are rounded to two decimals.
                    float rounded = ((float) Math.round(entry.getValue().doubleValue() * 100.0d)) / 100.0f;
                    result.add(new DetectedLanguage(null, Languages.getLanguageForShortCode(entry.getKey(), additionalLangs), rounded, source));
                }
            }
        } else {
            Map.Entry<String, Double> best = getHighestScoringResult(scores);
            if (best.getKey() != null && LanguageIdentifierService.INSTANCE.canLanguageBeDetected(best.getKey(), additionalLangs)) {
                // When fastText was involved the confidence is derived from the text length only:
                // it grows linearly up to 0.99 at 30 characters.
                float confidence = source.contains("fasttext")
                        ? (float) (0.99d / (30.0d / Math.min(text.length(), 30)))
                        : best.getValue().floatValue();
                result.add(new DetectedLanguage(null, Languages.getLanguageForShortCode(best.getKey(), additionalLangs), confidence, source));
            }
        }
        if (result.isEmpty() && !preferredLangs.isEmpty()) {
            String firstPreferred = preferredLangs.get(0);
            if (firstPreferred != null && !firstPreferred.trim().isEmpty() && Languages.isLanguageSupported(firstPreferred)) {
                result.add(new DetectedLanguage(null, Languages.getLanguageForShortCode(firstPreferred), 0.1f, source + "+fallbackToPrefLang"));
            }
        }
        return result;
    }

    /**
     * Tries to restart the fastText process after a detection failure.
     *
     * <p>The restart counter is incremented before the attempt and decremented again when
     * {@code restartProcess()} reports that no restart happened. If the restart throws, the
     * counter stays incremented and a warning with the cause is logged.
     *
     * @param exc the exception that triggered the restart attempt; logged at debug level so the
     *            original failure is not lost
     */
    private void reinitFasttextAfterFailure(Exception exc) {
        logger.debug("Language detection failed, restarting fasttext if it is enabled", exc);
        if (this.fastTextDetector == null) {
            return;
        }
        int attempt = this.fasttextInitCounter.incrementAndGet();
        try {
            if (this.fastTextDetector.restartProcess()) {
                logger.debug("Fasttext was new initialized after failure {}", attempt);
            } else {
                this.fasttextInitCounter.decrementAndGet();
            }
        } catch (IOException e) {
            // The trailing throwable is logged with its stack trace by SLF4J.
            logger.warn("Restarting fasttext failed {}", attempt, e);
        }
    }

    /**
     * Runs the optimaize detector and returns its first result.
     *
     * @param str  the text to analyze
     * @param list preferred language codes; may be {@code null} or empty
     * @param z    when {@code true} and {@code list} is non-empty, results whose language is not in
     *             {@code list} are removed before the first result is taken
     * @return the language code and probability of the first remaining result (presumably the
     *         most probable one — TODO confirm the detector's ordering), or {@code null} if none remain
     */
    @Nullable
    private Map.Entry<String, Double> detectLanguageCode(String str, List<String> list, boolean z) {
        List<com.optimaize.langdetect.DetectedLanguage> probabilities = this.languageDetector.getProbabilities(str);
        if (z && list != null && !list.isEmpty()) {
            boolean removedAny = probabilities.removeIf(candidate -> !list.contains(candidate.getLocale().getLanguage()));
            if (removedAny && probabilities.isEmpty()) {
                logger.warn("No language detected for text after remove all not preferred languages from score.");
            }
        }
        if (probabilities.isEmpty()) {
            return null;
        }
        com.optimaize.langdetect.DetectedLanguage first = probabilities.get(0);
        return new AbstractMap.SimpleImmutableEntry<>(first.getLocale().getLanguage(), first.getProbability());
    }
}
