package org.apache.tika.eval.app;

import com.ibm.icu.text.PluralRules;
import java.io.IOException;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.ArrayBlockingQueue;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.Options;
import org.apache.commons.io.FilenameUtils;
import org.apache.tika.batch.FileResource;
import org.apache.tika.batch.builders.BatchProcessBuilder;
import org.apache.tika.batch.fs.FSProperties;
import org.apache.tika.eval.app.batch.ExtractComparerBuilder;
import org.apache.tika.eval.app.db.ColInfo;
import org.apache.tika.eval.app.db.Cols;
import org.apache.tika.eval.app.db.TableInfo;
import org.apache.tika.eval.app.io.ExtractReader;
import org.apache.tika.eval.app.io.ExtractReaderException;
import org.apache.tika.eval.app.io.IDBWriter;
import org.apache.tika.eval.core.textstats.BasicTokenCountStatsCalculator;
import org.apache.tika.eval.core.tokens.ContrastStatistics;
import org.apache.tika.eval.core.tokens.TokenContraster;
import org.apache.tika.eval.core.tokens.TokenCounts;
import org.apache.tika.eval.core.tokens.TokenIntPair;
import org.apache.tika.eval.core.util.ContentTags;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;

/* loaded from: input_file:org/apache/tika/eval/app/ExtractComparer.class */
public class ExtractComparer extends AbstractProfiler {
    // Metadata names starting with this prefix are treated as digest keys when
    // looking for a digest that both extracts share (see findSharedDigestKey).
    private static final String DIGEST_KEY_PREFIX = "X-TIKA:digest:";
    // Not referenced anywhere else in this class as shown.
    private static final String FIELD_A = "fa";
    private static final String FIELD_B = "fb";

    // Table definitions used when writing comparison rows.
    // NOTE(review): the numeric type arguments (12, 4, -5, 6) look like inlined
    // java.sql.Types constants (VARCHAR, INTEGER, BIGINT, FLOAT) — confirm against ColInfo.
    public static TableInfo REF_PAIR_NAMES = new TableInfo("pair_names", new ColInfo(Cols.DIR_NAME_A, 12, (Integer) 128), new ColInfo(Cols.DIR_NAME_B, 12, (Integer) 128));
    // One row per compared container file; written at the start of compareFiles.
    public static TableInfo COMPARISON_CONTAINERS = new TableInfo("containers", new ColInfo(Cols.CONTAINER_ID, 4, "PRIMARY KEY"), new ColInfo(Cols.FILE_PATH, 12, (Integer) 1024), new ColInfo(Cols.FILE_EXTENSION, 12, (Integer) 12), new ColInfo(Cols.LENGTH, -5), new ColInfo(Cols.EXTRACT_FILE_LENGTH_A, -5), new ColInfo(Cols.EXTRACT_FILE_LENGTH_B, -5));
    // One row per A/B document pair whose token contrast statistics were calculated.
    public static TableInfo CONTENT_COMPARISONS = new TableInfo("content_comparisons", new ColInfo(Cols.ID, 4, "PRIMARY KEY"), new ColInfo(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A, 12, (Integer) 1024), new ColInfo(Cols.TOP_10_UNIQUE_TOKEN_DIFFS_B, 12, (Integer) 1024), new ColInfo(Cols.TOP_10_MORE_IN_A, 12, (Integer) 1024), new ColInfo(Cols.TOP_10_MORE_IN_B, 12, (Integer) 1024), new ColInfo(Cols.DICE_COEFFICIENT, 6), new ColInfo(Cols.OVERLAP, 6));

    // The remaining tables come in A/B pairs; each pair reuses the column layout of the
    // corresponding ExtractProfiler table under a side-specific table name.
    public static TableInfo PROFILES_A = new TableInfo("profiles_a", ExtractProfiler.PROFILE_TABLE.getColInfos());
    public static TableInfo PROFILES_B = new TableInfo("profiles_b", ExtractProfiler.PROFILE_TABLE.getColInfos());
    public static TableInfo EMBEDDED_FILE_PATH_TABLE_A = new TableInfo("emb_path_a", ExtractProfiler.EMBEDDED_FILE_PATH_TABLE.getColInfos());
    public static TableInfo EMBEDDED_FILE_PATH_TABLE_B = new TableInfo("emb_path_b", ExtractProfiler.EMBEDDED_FILE_PATH_TABLE.getColInfos());
    public static TableInfo CONTENTS_TABLE_A = new TableInfo("contents_a", ExtractProfiler.CONTENTS_TABLE.getColInfos());
    public static TableInfo CONTENTS_TABLE_B = new TableInfo("contents_b", ExtractProfiler.CONTENTS_TABLE.getColInfos());
    public static TableInfo TAGS_TABLE_A = new TableInfo("tags_a", ExtractProfiler.TAGS_TABLE.getColInfos());
    public static TableInfo TAGS_TABLE_B = new TableInfo("tags_b", ExtractProfiler.TAGS_TABLE.getColInfos());
    public static TableInfo EXCEPTION_TABLE_A = new TableInfo("exceptions_a", ExtractProfiler.EXCEPTION_TABLE.getColInfos());
    public static TableInfo EXCEPTION_TABLE_B = new TableInfo("exceptions_b", ExtractProfiler.EXCEPTION_TABLE.getColInfos());
    public static TableInfo EXTRACT_EXCEPTION_TABLE_A = new TableInfo("extract_exceptions_a", ExtractProfiler.EXTRACT_EXCEPTION_TABLE.getColInfos());
    public static TableInfo EXTRACT_EXCEPTION_TABLE_B = new TableInfo("extract_exceptions_b", ExtractProfiler.EXTRACT_EXCEPTION_TABLE.getColInfos());

    // Command-line options for the Compare tool; populated in the static initializer.
    static Options OPTIONS;

    // May be null; processFileResource checks for that before comparing it with the extract dirs.
    private final Path inputDir;
    private final Path extractsA;
    private final Path extractsB;
    // Calculates the contrast statistics written to CONTENT_COMPARISONS.
    private final TokenContraster tokenContraster;
    // Loads the metadata list for each extract file.
    private final ExtractReader extractReader;

    /**
     * Creates a comparer over two extract directories.
     *
     * @param arrayBlockingQueue queue of file resources, passed to the superclass
     * @param path stored as the input directory; may be {@code null}
     * @param path2 stored as the "A" extracts directory
     * @param path3 stored as the "B" extracts directory
     * @param extractReader reader used to load the extract files
     * @param iDBWriter writer passed to the superclass
     */
    public ExtractComparer(ArrayBlockingQueue<FileResource> arrayBlockingQueue, Path path, Path path2, Path path3, ExtractReader extractReader, IDBWriter iDBWriter) {
        super(arrayBlockingQueue, iDBWriter);
        // Collaborators first, then the three directories.
        this.extractReader = extractReader;
        this.tokenContraster = new TokenContraster();
        this.inputDir = path;
        this.extractsA = path2;
        this.extractsB = path3;
    }

    /** Prints the command-line help for the Compare tool, built from {@code OPTIONS}. */
    public static void USAGE() {
        final int width = 80;
        final String cmdLineSyntax = "java -jar tika-eval-x.y.jar Compare -extractsA extractsA -extractsB extractsB -db mydb";
        final String header = "Tool: Compare";
        final String footer = "Note: for the default h2 db, do not include the .mv.db at the end of the db name.";
        HelpFormatter helpFormatter = new HelpFormatter();
        helpFormatter.printHelp(width, cmdLineSyntax, header, OPTIONS, footer);
    }

    /**
     * Resolves the A and B extract paths for one file resource and compares them.
     *
     * <p>When the input directory is one of the two extract directories, the paths are
     * resolved with {@code getPathsFromExtractCrawl}; otherwise (including a {@code null}
     * input directory) they are resolved with {@code getPathsFromSrcCrawl}.
     *
     * @param fileResource the resource whose metadata identifies the file to compare
     * @return {@code true} whenever the comparison completes without throwing
     * @throws RuntimeException wrapping anything thrown by {@code compareFiles}
     */
    @Override // org.apache.tika.batch.FileResourceConsumer
    public boolean processFileResource(FileResource fileResource) {
        Metadata metadata = fileResource.getMetadata();
        boolean inputIsExtractDir = this.inputDir != null
                && (this.inputDir.equals(this.extractsA) || this.inputDir.equals(this.extractsB));
        EvalFilePaths pathsA;
        EvalFilePaths pathsB;
        if (inputIsExtractDir) {
            pathsA = getPathsFromExtractCrawl(metadata, this.extractsA);
            pathsB = getPathsFromExtractCrawl(metadata, this.extractsB);
        } else {
            pathsA = getPathsFromSrcCrawl(metadata, this.inputDir, this.extractsA);
            pathsB = getPathsFromSrcCrawl(metadata, this.inputDir, this.extractsB);
        }
        try {
            compareFiles(pathsA, pathsB);
        } catch (Throwable th) {
            throw new RuntimeException("Exception while working on: " + metadata.get(FSProperties.FS_REL_PATH), th);
        }
        return true;
    }

    /**
     * Loads the A and B extracts for one container file and writes all comparison rows.
     *
     * <p>Steps, in order:
     * <ol>
     *   <li>load both extracts, remembering the failure type if a load throws;</li>
     *   <li>write the container row and any extract-exception rows;</li>
     *   <li>for every document in A, write its tag/profile/exception/content rows, pair it
     *       with a not-yet-used document in B (via {@code getMatch}) and, for a pair whose
     *       combined token count exceeds 10, write a content-comparison row;</li>
     *   <li>write rows for every document in B that was not paired in step 3.</li>
     * </ol>
     *
     * @param evalFilePaths paths for the "A" side
     * @param evalFilePaths2 paths for the "B" side
     * @throws IOException if writing a row fails outside the content-statistics section
     * @throws RuntimeException wrapping an {@link IOException} thrown while calculating or
     *         writing content statistics
     */
    protected void compareFiles(EvalFilePaths evalFilePaths, EvalFilePaths evalFilePaths2) throws IOException {
        ExtractReaderException.TYPE extractExceptionA = null;
        ExtractReaderException.TYPE extractExceptionB = null;

        List<Metadata> metadataListA = null;
        try {
            metadataListA = this.extractReader.loadExtract(evalFilePaths.getExtractFile());
        } catch (ExtractReaderException e) {
            // No stack trace is printed: the failure type is persisted below via
            // writeExtractException, and side B is handled the same way.
            extractExceptionA = e.getType();
        }
        List<Metadata> metadataListB = null;
        try {
            metadataListB = this.extractReader.loadExtract(evalFilePaths2.getExtractFile());
        } catch (ExtractReaderException e) {
            extractExceptionB = e.getType();
        }

        // Indices into metadataListB that have already been paired with a document from A.
        Set<Integer> handledB = new HashSet<Integer>();
        String containerId = Integer.toString(ID.getAndIncrement());

        Map<Cols, String> containerRow = new HashMap<Cols, String>();
        containerRow.put(Cols.CONTAINER_ID, containerId);
        containerRow.put(Cols.FILE_PATH, evalFilePaths.getRelativeSourceFilePath().toString());
        long sourceFileLength = getSourceFileLength(metadataListA, metadataListB);
        // Negative lengths are stored as the empty string.
        containerRow.put(Cols.LENGTH, sourceFileLength > -1 ? Long.toString(sourceFileLength) : "");
        containerRow.put(Cols.FILE_EXTENSION, FilenameUtils.getExtension(evalFilePaths.getRelativeSourceFilePath().getFileName().toString()));
        long extractLengthA = getFileLength(evalFilePaths.getExtractFile());
        containerRow.put(Cols.EXTRACT_FILE_LENGTH_A, extractLengthA > -1 ? Long.toString(extractLengthA) : "");
        long extractLengthB = getFileLength(evalFilePaths2.getExtractFile());
        containerRow.put(Cols.EXTRACT_FILE_LENGTH_B, extractLengthB > -1 ? Long.toString(extractLengthB) : "");
        this.writer.writeRow(COMPARISON_CONTAINERS, containerRow);

        if (extractExceptionA != null) {
            writeExtractException(EXTRACT_EXCEPTION_TABLE_A, containerId, evalFilePaths.getRelativeSourceFilePath().toString(), extractExceptionA);
        }
        if (extractExceptionB != null) {
            writeExtractException(EXTRACT_EXCEPTION_TABLE_B, containerId, evalFilePaths2.getRelativeSourceFilePath().toString(), extractExceptionB);
        }
        if (metadataListA == null && metadataListB == null) {
            return;
        }

        List<Integer> numAttachmentsA = countAttachments(metadataListA);
        List<Integer> numAttachmentsB = countAttachments(metadataListB);
        String sharedDigestKey = findSharedDigestKey(metadataListA, metadataListB);

        if (metadataListA != null) {
            for (int i = 0; i < metadataListA.size(); i++) {
                // The first document reuses the container id; every other one gets a fresh id.
                String fileId = i == 0 ? containerId : Integer.toString(ID.getAndIncrement());
                Metadata metadataA = metadataListA.get(i);
                ContentTags contentTagsA = getContent(evalFilePaths, metadataA);
                writeTagData(fileId, contentTagsA, TAGS_TABLE_A);
                writeProfileData(evalFilePaths, i, contentTagsA, metadataA, fileId, containerId, numAttachmentsA, PROFILES_A);
                writeExceptionData(fileId, metadataA, EXCEPTION_TABLE_A);

                Metadata metadataB = null;
                int matchIndex = getMatch(i, sharedDigestKey, handledB, metadataListA, metadataListB);
                if (matchIndex > -1 && !handledB.contains(matchIndex)) {
                    metadataB = metadataListB.get(matchIndex);
                    handledB.add(matchIndex);
                }

                ContentTags contentTagsB = ContentTags.EMPTY_CONTENT_TAGS;
                if (metadataB != null) {
                    contentTagsB = getContent(evalFilePaths2, metadataB);
                    writeTagData(fileId, contentTagsB, TAGS_TABLE_B);
                    writeProfileData(evalFilePaths2, i, contentTagsB, metadataB, fileId, containerId, numAttachmentsB, PROFILES_B);
                    writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
                }
                writeEmbeddedFilePathData(i, fileId, metadataA, metadataB);

                try {
                    Map<Class, Object> textStatsA = calcTextStats(contentTagsA);
                    writeContentData(fileId, textStatsA, CONTENTS_TABLE_A);
                    // Calculated even without a match, as before; only written/used for a pair.
                    Map<Class, Object> textStatsB = calcTextStats(contentTagsB);
                    if (metadataB != null) {
                        writeContentData(fileId, textStatsB, CONTENTS_TABLE_B);
                        TokenCounts tokenCountsA = (TokenCounts) textStatsA.get(BasicTokenCountStatsCalculator.class);
                        TokenCounts tokenCountsB = (TokenCounts) textStatsB.get(BasicTokenCountStatsCalculator.class);
                        // Contrast statistics are skipped for pairs with 10 or fewer tokens in total.
                        if (tokenCountsA.getTotalTokens() + tokenCountsB.getTotalTokens() > 10) {
                            Map<Cols, String> comparisonRow = new HashMap<Cols, String>();
                            comparisonRow.put(Cols.ID, fileId);
                            writeContrasts(comparisonRow, this.tokenContraster.calculateContrastStatistics(tokenCountsA, tokenCountsB));
                            this.writer.writeRow(CONTENT_COMPARISONS, comparisonRow);
                        }
                    }
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
        }

        if (metadataListB != null) {
            for (int i = 0; i < metadataListB.size(); i++) {
                if (handledB.contains(i)) {
                    continue;
                }
                Metadata metadataB = metadataListB.get(i);
                ContentTags contentTagsB = getContent(evalFilePaths2, metadataB);
                String fileId = i == 0 ? containerId : Integer.toString(ID.getAndIncrement());
                writeTagData(fileId, contentTagsB, TAGS_TABLE_B);
                writeProfileData(evalFilePaths2, i, contentTagsB, metadataB, fileId, containerId, numAttachmentsB, PROFILES_B);
                writeEmbeddedFilePathData(i, fileId, null, metadataB);
                writeExceptionData(fileId, metadataB, EXCEPTION_TABLE_B);
                try {
                    writeContentData(fileId, calcTextStats(contentTagsB), CONTENTS_TABLE_B);
                } catch (IOException e) {
                    throw new RuntimeException(e);
                }
            }
        }
    }

    /**
     * Finds a digest metadata key present in the first document of both lists.
     *
     * @param list documents from A; may be {@code null} or empty
     * @param list2 documents from B; may be {@code null} or empty
     * @return the first name in B's first document that is also a
     *         {@code DIGEST_KEY_PREFIX}-prefixed name in A's first document,
     *         or {@code null} if there is none
     */
    private String findSharedDigestKey(List<Metadata> list, List<Metadata> list2) {
        if (list2 == null || list2.isEmpty()) {
            return null;
        }
        Set<String> digestKeysA = new HashSet<String>();
        boolean hasA = list != null && !list.isEmpty();
        if (hasA) {
            for (String name : list.get(0).names()) {
                if (name.startsWith(DIGEST_KEY_PREFIX)) {
                    digestKeysA.add(name);
                }
            }
        }
        for (String candidate : list2.get(0).names()) {
            if (digestKeysA.contains(candidate)) {
                return candidate;
            }
        }
        return null;
    }

    /**
     * Writes the embedded resource paths of a document pair. Nothing is written for index 0.
     *
     * <p>A's path goes to {@code EMBEDDED_FILE_PATH_TABLE_A} when present. B's path goes to
     * {@code EMBEDDED_FILE_PATH_TABLE_B} only when present and different from A's path.
     *
     * @param i index of the document within its extract
     * @param str id to store in the {@code Cols.ID} column
     * @param metadata document from A, or {@code null}
     * @param metadata2 document from B, or {@code null}
     * @throws RuntimeException wrapping an {@link IOException} from the writer
     */
    private void writeEmbeddedFilePathData(int i, String str, Metadata metadata, Metadata metadata2) {
        if (i == 0) {
            return;
        }
        String pathA = metadata == null ? null : metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
        String pathB = metadata2 == null ? null : metadata2.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);

        if (pathA != null) {
            Map<Cols, String> rowA = new HashMap<Cols, String>();
            rowA.put(Cols.ID, str);
            rowA.put(Cols.EMBEDDED_FILE_PATH, pathA);
            try {
                this.writer.writeRow(EMBEDDED_FILE_PATH_TABLE_A, rowA);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }

        // B is only recorded when it adds information, i.e. its path differs from A's.
        if (pathB != null && !pathB.equals(pathA)) {
            Map<Cols, String> rowB = new HashMap<Cols, String>();
            rowB.put(Cols.ID, str);
            rowB.put(Cols.EMBEDDED_FILE_PATH, pathB);
            try {
                this.writer.writeRow(EMBEDDED_FILE_PATH_TABLE_B, rowB);
            } catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    }

    /**
     * Returns the source file length reported for A, falling back to B.
     *
     * @param list documents from A
     * @param list2 documents from B
     * @return A's length when it is greater than -1, otherwise whatever B reports
     */
    private long getSourceFileLength(List<Metadata> list, List<Metadata> list2) {
        long lengthFromA = getSourceFileLength(list);
        if (lengthFromA > -1) {
            return lengthFromA;
        }
        return getSourceFileLength(list2);
    }

    /**
     * Chooses the index in B that should be paired with document {@code i} of A.
     *
     * <p>Strategy, in order:
     * <ol>
     *   <li>index 0 of A is always paired with index 0 of B;</li>
     *   <li>if a shared digest key is available, match on digest via
     *       {@code findMatchingDigests};</li>
     *   <li>otherwise match on equal embedded resource path, skipping indices in B that
     *       are already paired, so documents with duplicate paths can each find a partner;</li>
     *   <li>as a last resort, if both lists have the same size, guess the same index.</li>
     * </ol>
     *
     * @param i index of the document in {@code list}
     * @param str shared digest key, or {@code null} if there is none
     * @param set indices of {@code list2} that are already paired
     * @param list documents from A
     * @param list2 documents from B; may be {@code null} or empty
     * @return the chosen index in {@code list2}, or -1 if there is no candidate; the
     *         last-resort index may already be paired, so callers must still check {@code set}
     */
    private int getMatch(int i, String str, Set<Integer> set, List<Metadata> list, List<Metadata> list2) {
        if (list2 == null || list2.isEmpty()) {
            return -1;
        }
        if (i == 0) {
            return 0;
        }
        Metadata metadataA = list.get(i);
        if (str != null) {
            return findMatchingDigests(str, set, metadataA, list2);
        }
        String embeddedPath = metadataA.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH);
        if (embeddedPath != null) {
            for (int j = 0; j < list2.size(); j++) {
                // An already-paired index would be rejected by the caller, which would hide
                // any later document in B that has the same path.
                if (set.contains(j)) {
                    continue;
                }
                if (embeddedPath.equals(list2.get(j).get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH))) {
                    return j;
                }
            }
        }
        // Same-size lists: assume the documents were not rearranged.
        if (list.size() == list2.size()) {
            return i;
        }
        return -1;
    }

    /**
     * Finds an unpaired document in {@code list} whose digest equals that of {@code metadata}.
     *
     * <p>Digests are compared case-insensitively. A document that also has the same
     * resource name is returned immediately; otherwise the last digest match wins.
     *
     * @param str metadata key holding the digest
     * @param set indices of {@code list} that are already paired and must be skipped
     * @param metadata the document from A
     * @param list documents from B
     * @return the matching index, or -1 if A has no digest or nothing matches
     */
    private int findMatchingDigests(String str, Set<Integer> set, Metadata metadata, List<Metadata> list) {
        String digestA = metadata.get(str);
        if (digestA == null) {
            return -1;
        }
        String nameA = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
        int candidate = -1;
        for (int j = 0; j < list.size(); j++) {
            if (set.contains(j)) {
                continue;
            }
            Metadata other = list.get(j);
            if (!digestA.equalsIgnoreCase(other.get(str))) {
                continue;
            }
            if (nameA != null && nameA.equals(other.get(TikaCoreProperties.RESOURCE_NAME_KEY))) {
                // Digest and name both agree: take it right away.
                return j;
            }
            candidate = j;
        }
        return candidate;
    }

    /**
     * Copies the contrast statistics into the row map: four token lists, then the
     * overlap and the Dice coefficient as decimal strings.
     *
     * @param map row being built; modified in place
     * @param contrastStatistics statistics to copy from
     */
    private void writeContrasts(Map<Cols, String> map, ContrastStatistics contrastStatistics) {
        TokenIntPair[] moreInA = contrastStatistics.getTopNMoreA();
        writeContrastString(map, Cols.TOP_10_MORE_IN_A, moreInA);
        TokenIntPair[] moreInB = contrastStatistics.getTopNMoreB();
        writeContrastString(map, Cols.TOP_10_MORE_IN_B, moreInB);
        TokenIntPair[] uniqueA = contrastStatistics.getTopNUniqueA();
        writeContrastString(map, Cols.TOP_10_UNIQUE_TOKEN_DIFFS_A, uniqueA);
        TokenIntPair[] uniqueB = contrastStatistics.getTopNUniqueB();
        writeContrastString(map, Cols.TOP_10_UNIQUE_TOKEN_DIFFS_B, uniqueB);

        String overlap = Double.toString(contrastStatistics.getOverlap());
        map.put(Cols.OVERLAP, overlap);
        String dice = Double.toString(contrastStatistics.getDiceCoefficient());
        map.put(Cols.DICE_COEFFICIENT, dice);
    }

    /**
     * Formats token/count pairs as {@code token: value} entries joined by {@code " | "}
     * and stores the result under the given column. An empty array yields an empty string.
     *
     * @param map row being built; modified in place
     * @param cols column to store the formatted string under
     * @param tokenIntPairArr pairs to format, in output order
     */
    private void writeContrastString(Map<Cols, String> map, Cols cols, TokenIntPair[] tokenIntPairArr) {
        StringBuilder sb = new StringBuilder();
        boolean first = true;
        for (TokenIntPair pair : tokenIntPairArr) {
            if (!first) {
                sb.append(" | ");
            }
            first = false;
            // Literal separator: the output format must not depend on a constant that
            // happens to have the same value in an unrelated library (ICU PluralRules).
            sb.append(pair.getToken()).append(": ").append(pair.getValue());
        }
        map.put(cols, sb.toString());
    }

    // Builds the command-line options for the Compare tool. Options are added in the same
    // order as before; -extractsA and -extractsB are the only required ones.
    static {
        Option extractsAOption = new Option("extractsA", true, "directory for extractsA files");
        extractsAOption.setRequired(true);
        Option extractsBOption = new Option("extractsB", true, "directory for extractsB files");
        extractsBOption.setRequired(true);

        Options options = new Options();
        options.addOption(extractsAOption);
        options.addOption(extractsBOption);
        options.addOption(new Option("inputDir", true, "optional: directory of original binary input files if it exists or can be the same as -extractsA or -extractsB. If not specified, -inputDir=-extractsA"));
        options.addOption("bc", "optional: tika-batch config file");
        options.addOption(BatchProcessBuilder.NUM_CONSUMERS_KEY, true, "optional: number of consumer threads");
        options.addOption(new Option("alterExtract", true, "for json-formatted extract files, process full metadata list ('as_is'=default), take just the first/container document ('first_only'), concatenate all content into the first metadata item ('concatenate_content')"));
        options.addOption("minExtractLength", true, "minimum extract length to process (in bytes)");
        options.addOption("maxExtractLength", true, "maximum extract length to process (in bytes)");
        options.addOption("db", true, "db file to which to write results");
        options.addOption("jdbc", true, "EXPERT: full jdbc connection string. Must specify this or -db <h2db>");
        options.addOption("jdbcDriver", true, "EXPERT: jdbc driver, or specify via -Djdbc.driver");
        options.addOption(ExtractComparerBuilder.TABLE_PREFIX_A_KEY, true, "EXPERT: optional prefix for table names for A");
        options.addOption(ExtractComparerBuilder.TABLE_PREFIX_B_KEY, true, "EXPERT: optional prefix for table names for B");
        options.addOption("drop", false, "drop tables if they exist");
        options.addOption("maxFilesToAdd", true, "maximum number of files to add to the crawler");
        options.addOption("maxTokens", true, "maximum tokens to process, default=200000");
        options.addOption("maxContentLength", true, "truncate content beyond this length for calculating 'contents' stats, default=1000000");
        options.addOption("maxContentLengthForLangId", true, "truncate content beyond this length for language id, default=50000");
        options.addOption("defaultLangCode", true, "which language to use for common words if no 'common words' file exists for the langid result");
        OPTIONS = options;
    }
}
