package org.apache.tika.eval.langid;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import opennlp.tools.langdetect.LanguageDetector;
import opennlp.tools.langdetect.LanguageDetectorModel;
import opennlp.tools.util.normalizer.CharSequenceNormalizer;
import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;
import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;
import org.apache.tika.eval.textstats.StringStatsCalculator;
import org.h2.api.ErrorCode;

/* loaded from: input_file:org/apache/tika/eval/langid/LanguageIDWrapper.class */
/**
 * Wraps a language detector built from an OpenNLP {@link LanguageDetectorModel} and exposes it
 * as a {@link StringStatsCalculator} that yields a list of {@link Language} predictions.
 *
 * <p>The model is held in the static {@link #LANG_MODEL} field and is shared by every instance.
 * It is populated either by {@link #loadBuiltInModels()}, by {@link #loadModels(Path)}, or
 * implicitly by the constructor when the field is still {@code null}.
 */
public class LanguageIDWrapper implements StringStatsCalculator<List<Language>> {
    /** Classpath location of the model loaded by {@link #loadBuiltInModels()}. */
    private static final String BUILT_IN_MODEL_RESOURCE = "/opennlp/model_20190626.bin";

    /** Shared model; {@code null} until one of the load methods has run. */
    static LanguageDetectorModel LANG_MODEL;

    /**
     * Maximum text length setting, adjustable through {@link #setMaxTextLength(int)}.
     * Written as a plain literal so this class does not depend on an unrelated constant holder.
     */
    static int MAX_TEXT_LENGTH = 50000;

    private final LanguageDetector detector;

    /**
     * Normalizer that replaces every run of characters that are neither alphabetic nor
     * ideographic with a single space.
     */
    public static class AlphaIdeographSequenceNormalizer implements CharSequenceNormalizer {
        private static final Pattern REGEX = Pattern.compile("[^\\p{IsAlphabetic}\\p{IsIdeographic}]+");
        private static final AlphaIdeographSequenceNormalizer INSTANCE = new AlphaIdeographSequenceNormalizer();

        /**
         * @return the shared singleton instance
         */
        public static AlphaIdeographSequenceNormalizer getInstance() {
            return INSTANCE;
        }

        private AlphaIdeographSequenceNormalizer() {
        }

        /**
         * @param charSequence text to normalize
         * @return the text with each non-alphabetic, non-ideographic run replaced by one space
         */
        @Override
        public CharSequence normalize(CharSequence charSequence) {
            return REGEX.matcher(charSequence).replaceAll(" ");
        }
    }

    /**
     * Normalizer that blanks out http(s) URLs and e-mail-like tokens, replacing each match
     * with a single space.
     */
    public static class TikaUrlCharSequenceNormalizer implements CharSequenceNormalizer {
        // Quantifiers are bounded ({10,10000}, {1,100}) rather than open-ended.
        private static final Pattern URL_REGEX = Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]{10,10000}");
        private static final Pattern MAIL_REGEX = Pattern.compile("[-_.0-9A-Za-z]{1,100}@[-_0-9A-Za-z]{1,100}[-_.0-9A-Za-z]{1,100}");
        private static final TikaUrlCharSequenceNormalizer INSTANCE = new TikaUrlCharSequenceNormalizer();

        /**
         * @return the shared singleton instance
         */
        public static TikaUrlCharSequenceNormalizer getInstance() {
            return INSTANCE;
        }

        private TikaUrlCharSequenceNormalizer() {
        }

        /**
         * @param charSequence text to normalize
         * @return the text with URL matches replaced first, then e-mail matches, each by one space
         */
        @Override
        public CharSequence normalize(CharSequence charSequence) {
            String withoutUrls = URL_REGEX.matcher(charSequence).replaceAll(" ");
            return MAIL_REGEX.matcher(withoutUrls).replaceAll(" ");
        }
    }

    /**
     * Loads the bundled model from the classpath into {@link #LANG_MODEL}, replacing any model
     * that was previously set.
     *
     * @throws IOException if the model resource is missing from the classpath or cannot be read
     */
    public static synchronized void loadBuiltInModels() throws IOException {
        try (InputStream modelStream = LanguageIDWrapper.class.getResourceAsStream(BUILT_IN_MODEL_RESOURCE)) {
            // getResourceAsStream signals "not found" with null; fail with a clear message
            // instead of handing null to the model constructor.
            if (modelStream == null) {
                throw new IOException("couldn't find built in lang model on classpath: " + BUILT_IN_MODEL_RESOURCE);
            }
            LANG_MODEL = new LanguageDetectorModel(modelStream);
        }
    }

    /**
     * Loads a model from the given file into {@link #LANG_MODEL}, replacing any model that was
     * previously set.
     *
     * @param path location of the model file
     * @throws IOException if the model cannot be read
     */
    public static void loadModels(Path path) throws IOException {
        LANG_MODEL = new LanguageDetectorModel(path.toFile());
    }

    /**
     * Builds the normalizer array handed to the detector. The order of the elements is the
     * order in which they are listed here.
     */
    private static CharSequenceNormalizer[] getNormalizers() {
        return new CharSequenceNormalizer[] {
            TikaUrlCharSequenceNormalizer.getInstance(),
            AlphaIdeographSequenceNormalizer.getInstance(),
            EmojiCharSequenceNormalizer.getInstance(),
            TwitterCharSequenceNormalizer.getInstance(),
            NumberCharSequenceNormalizer.getInstance(),
            ShrinkCharSequenceNormalizer.getInstance()
        };
    }

    /**
     * Creates a wrapper around the shared model, loading the built-in model first when no model
     * has been loaded yet. The {@code null} check itself is performed without synchronization.
     *
     * @throws RuntimeException if the built-in model has to be loaded and loading fails
     */
    public LanguageIDWrapper() {
        if (LANG_MODEL == null) {
            try {
                loadBuiltInModels();
            } catch (IOException e) {
                throw new RuntimeException("couldn't load built in lang models", e);
            }
        }
        this.detector = new ProbingLanguageDetector(LANG_MODEL, getNormalizers());
    }

    /**
     * Runs language detection on the given text.
     *
     * @param str text to analyse
     * @return one {@link Language} per prediction returned by the detector, in the detector's
     *         order, each carrying the predicted language code and its confidence
     */
    public List<Language> getProbabilities(String str) {
        opennlp.tools.langdetect.Language[] predictions = this.detector.predictLanguages(str);
        List<Language> results = new ArrayList<>(predictions.length);
        for (opennlp.tools.langdetect.Language prediction : predictions) {
            results.add(new Language(prediction.getLang(), prediction.getConfidence()));
        }
        return results;
    }

    /**
     * @return the languages reported as supported by the underlying detector
     */
    public String[] getSupportedLanguages() {
        return this.detector.getSupportedLanguages();
    }

    /**
     * Updates the static {@link #MAX_TEXT_LENGTH} setting. The value is shared by all instances.
     *
     * @param i new maximum text length
     */
    public static void setMaxTextLength(int i) {
        MAX_TEXT_LENGTH = i;
    }

    /**
     * {@link StringStatsCalculator} entry point; delegates to {@link #getProbabilities(String)}.
     *
     * @param str text to analyse
     * @return the detector's language predictions for the text
     */
    @Override
    public List<Language> calculate(String str) {
        return getProbabilities(str);
    }
}
