package org.elasticsearch.xpack.ml.inference.nlp.tokenizers;

import java.io.IOException;
import java.io.Reader;
import java.io.UncheckedIOException;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.elasticsearch.xpack.ml.MachineLearning;
import org.elasticsearch.xpack.ml.inference.nlp.tokenizers.WordPieceTokenFilter;

/* loaded from: input_file:org/elasticsearch/xpack/ml/inference/nlp/tokenizers/WordPieceAnalyzer.class */
/**
 * Analyzer that splits input with a {@link WhitespaceTokenizer} and then runs the
 * resulting stream through a {@link WordPieceTokenFilter} configured from the
 * constructor arguments. Readers handed to this analyzer are wrapped in a
 * {@code ControlCharFilter} first.
 *
 * <p>The most recently built {@link WordPieceTokenFilter} is kept in a mutable field so
 * that {@link #getTokens()} can expose its tokenized values; every call to
 * {@link #createComponents(String)} replaces that reference.
 *
 * <p>NOTE(review): {@code createComponents}, {@code initReader} and
 * {@code initReaderForNormalization} presumably override hooks declared on
 * {@link Analyzer} — confirm against the Lucene version in use and consider
 * annotating them with {@code @Override}.
 */
public class WordPieceAnalyzer extends Analyzer {
    private final List<String> vocabulary;
    private final List<String> neverSplit;
    private final boolean doLowerCase;
    private final boolean doTokenizeCjKChars;
    private final boolean doStripAccents;
    private final String unknownToken;
    /** Filter produced by the latest {@link #createComponents(String)} call; {@code null} until then. */
    private WordPieceTokenFilter innerTokenFilter;

    /**
     * Stores the configuration that is later forwarded to {@code WordPieceTokenFilter.build}.
     *
     * @param list  stored as the vocabulary
     * @param list2 stored as the never-split list
     * @param z     stored as the {@code doLowerCase} flag
     * @param z2    stored as the {@code doTokenizeCjKChars} flag
     * @param z3    stored as the {@code doStripAccents} flag
     * @param str   stored as the unknown token
     */
    public WordPieceAnalyzer(List<String> list, List<String> list2, boolean z, boolean z2, boolean z3, String str) {
        this.vocabulary = list;
        this.neverSplit = list2;
        this.unknownToken = str;
        this.doLowerCase = z;
        this.doTokenizeCjKChars = z2;
        this.doStripAccents = z3;
    }

    /**
     * Creates the tokenizer that feeds the word-piece filter.
     *
     * @return a new {@link WhitespaceTokenizer}
     */
    protected Tokenizer createTokenizer() {
        // NOTE(review): the constructor argument is taken from a MachineLearning constant whose
        // name has nothing to do with tokenization; this looks like a decompiler substituting a
        // named constant for a numeric literal — confirm the intended value.
        final int tokenizerArgument = MachineLearning.DEFAULT_MAX_OPEN_JOBS_PER_NODE;
        return new WhitespaceTokenizer(tokenizerArgument);
    }

    /**
     * Builds the tokenizer/filter pair and remembers the filter for {@link #getTokens()}.
     *
     * @param str not used by this implementation
     * @return components wrapping the tokenizer from {@link #createTokenizer()} and the new filter
     * @throws UncheckedIOException if building the filter raises an {@link IOException}
     */
    protected Analyzer.TokenStreamComponents createComponents(String str) {
        try {
            final Tokenizer source = createTokenizer();
            // The literal 100 is passed straight through to the filter builder;
            // presumably a per-word length limit — TODO confirm against WordPieceTokenFilter.build.
            final WordPieceTokenFilter wordPieceFilter = WordPieceTokenFilter.build(
                this.doLowerCase,
                this.doTokenizeCjKChars,
                this.doStripAccents,
                this.neverSplit,
                this.vocabulary,
                this.unknownToken,
                100,
                source
            );
            this.innerTokenFilter = wordPieceFilter;
            return new Analyzer.TokenStreamComponents(source, wordPieceFilter);
        } catch (IOException ioFailure) {
            throw new UncheckedIOException(ioFailure);
        }
    }

    /**
     * Returns the tokenized values held by the most recently built filter.
     *
     * @return the filter's tokenized values, or an empty list if no filter has been built yet
     */
    public List<WordPieceTokenFilter.WordPieceToken> getTokens() {
        if (this.innerTokenFilter == null) {
            return List.of();
        }
        return this.innerTokenFilter.getTokenizedValues();
    }

    /**
     * Wraps the supplied reader in a {@code ControlCharFilter}.
     *
     * @param str    not used by this implementation
     * @param reader the reader to wrap
     * @return the wrapping reader
     */
    protected Reader initReader(String str, Reader reader) {
        return wrapWithControlCharFilter(reader);
    }

    /**
     * Wraps the supplied reader in a {@code ControlCharFilter}, exactly as
     * {@link #initReader(String, Reader)} does.
     *
     * @param str    not used by this implementation
     * @param reader the reader to wrap
     * @return the wrapping reader
     */
    protected Reader initReaderForNormalization(String str, Reader reader) {
        return wrapWithControlCharFilter(reader);
    }

    /** Shared wrapping step for both reader hooks. */
    private static Reader wrapWithControlCharFilter(Reader reader) {
        return new ControlCharFilter(reader);
    }
}
