[jira] [Commented] (TIKA-1332) Create "eval" code

Previous Topic Next Topic
 
Classic | List | Threaded
1 message Options
Reply | Threaded
Open this post in threaded view
|

[jira] [Commented] (TIKA-1332) Create "eval" code

JIRA jira@apache.org

    [ https://issues.apache.org/jira/browse/TIKA-1332?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=15870351#comment-15870351 ]

Hudson commented on TIKA-1332:
------------------------------

UNSTABLE: Integrated in Jenkins build Tika-trunk #1198 (See [https://builds.apache.org/job/Tika-trunk/1198/])
TIKA-1332 -- initial commit for tika-eval module. More work remains. (tallison: rev aa7a0c353362d56cb1b8e77297f0807626b0246c)
* (add) tika-eval/src/test/java/org/apache/tika/eval/util/MimeUtilTest.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/tokens/ContrastStatistics.java
* (add) tika-eval/src/test/resources/single-file-profiler-crawl-input-config.xml
* (add) tika-eval/src/test/resources/test-dirs/raw_input/file3_attachBNotA.doc
* (add) tika-eval/src/test/resources/log4j.properties
* (add) tika-eval/src/test/resources/test-dirs/extractsA/file2_attachANotB.doc.json
* (add) tika-eval/pom.xml
* (add) tika-eval/src/test/resources/test-dirs/extractsB/file11_oom.txt.json
* (add) tika-eval/src/test/java/org/apache/tika/eval/AnalyzerManagerTest.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenStatistics.java
* (add) tika-eval/src/test/resources/test-dirs/extractsB/file3_attachBNotA.doc.json
* (add) tika-eval/src/main/java/org/apache/tika/eval/batch/DBConsumersManager.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/db/AbstractDBBuffer.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/db/ColInfo.java
* (add) tika-eval/src/test/resources/test-dirs/raw_input/file5_emptyA.pdf
* (add) tika-eval/src/test/java/org/apache/tika/eval/SimpleComparerTest.java
* (add) tika-eval/src/test/resources/test-dirs/raw_input/file8_IOEx.pdf
* (add) tika-eval/src/test/resources/test-dirs/extractsA/file1.pdf.json
* (add) tika-eval/src/main/java/org/apache/tika/eval/db/H2Util.java
* (add) tika-eval/src/test/resources/test-dirs/extractsB/file13_attachANotB.doc.txt
* (add) tika-eval/src/test/java/org/apache/tika/eval/reports/ResultsReporterTest.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/ExtractComparer.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/tokens/CJKBigramAwareLengthFilterFactory.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCountPriorityQueue.java
* (add) tika-eval/src/test/resources/commontokens/zh-cn
* (add) tika-eval/src/main/resources/tika-eval-comparison-config.xml
* (add) tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenCounter.java
* (add) tika-eval/src/test/resources/test-dirs/extractsA/file12_es.txt.json
* (add) tika-eval/src/test/resources/test-dirs/extractsA/file10_permahang.txt.json
* (add) tika-eval/src/main/java/org/apache/tika/eval/io/ExtractReader.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/io/XMLLogReader.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/reports/XLSXHREFFormatter.java
* (add) tika-eval/src/main/resources/META-INF/services/org.apache.lucene.analysis.util.TokenFilterFactory
* (add) tika-eval/src/test/resources/single-file-profiler-crawl-extract-config.xml
* (add) tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenCountManager.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/db/DBUtil.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenIntPair.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/tokens/AlphaIdeographFilterFactory.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/io/XMLLogMsgHandler.java
* (add) tika-eval/src/test/resources/test-dirs/extractsA/file7_badJson.pdf.json
* (add) tika-eval/src/test/resources/test-dirs/raw_input/file2_attachANotB.doc
* (add) tika-eval/src/test/java/org/apache/tika/eval/ProfilerBatchTest.java
* (add) tika-eval/src/test/resources/commontokens/en
* (add) tika-eval/src/test/resources/test-dirs/extractsA/file4_emptyB.pdf.json
* (add) tika-eval/src/test/java/org/apache/tika/eval/ComparerBatchTest.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerDeserializer.java
* (add) tika-eval/src/test/java/org/apache/tika/eval/tokens/TokenCounterTest.java
* (add) tika-eval/src/test/resources/test-dirs/batch-logs/batch-process-fatal.xml
* (add) tika-eval/src/main/resources/tika-eval-profiler-config.xml
* (add) tika-eval/src/test/resources/test-dirs/raw_input/file1.pdf
* (add) tika-eval/src/test/resources/test-dirs/raw_input/file11_oom.txt
* (add) tika-eval/src/main/java/org/apache/tika/eval/db/TableInfo.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/tokens/AnalyzerManager.java
* (add) tika-eval/src/test/resources/commontokens/zh-tw
* (add) tika-eval/src/test/resources/commontokens/es
* (add) tika-eval/src/test/resources/test-dirs/raw_input/file9_noextract.txt
* (add) tika-eval/src/main/java/org/apache/tika/eval/EvalFilePaths.java
* (add) tika-eval/src/test/java/org/apache/tika/eval/db/AbstractBufferTest.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/tokens/TokenContraster.java
* (add) tika-eval/src/main/resources/profile-reports.xml
* (add) tika-eval/src/main/java/org/apache/tika/eval/reports/ResultsReporter.java
* (add) tika-eval/src/test/resources/test-dirs/extractsA/file6_accessEx.pdf.json
* (add) tika-eval/src/test/resources/test-dirs/extractsA/file8_IOEx.pdf.json
* (add) tika-eval/src/test/java/org/apache/tika/eval/tokens/LuceneTokenCounter.java
* (edit) CHANGES.txt
* (add) tika-eval/src/test/java/org/apache/tika/eval/io/ExtractReaderTest.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/util/LanguageIDWrapper.java
* (add) tika-eval/src/test/resources/test-dirs/extractsB/file1.pdf.json
* (add) tika-eval/src/main/java/org/apache/tika/eval/db/Cols.java
* (add) tika-eval/src/test/resources/test-dirs/raw_input/file6_accessEx.pdf
* (add) tika-eval/src/main/java/org/apache/tika/eval/tokens/CommonTokenResult.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/AbstractProfiler.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumersBuilder.java
* (add) tika-eval/src/test/resources/test-dirs/extractsA/file11_oom.txt.json
* (add) tika-eval/src/main/resources/lucene-char-mapping.txt
* (add) tika-eval/src/test/resources/test-dirs/extractsA/file5_emptyA.pdf.json
* (add) tika-eval/src/main/java/org/apache/tika/eval/XMLErrorLogUpdater.java
* (add) tika-eval/src/test/resources/test-dirs/extractsB/file6_accessEx.pdf.json
* (add) tika-eval/src/test/resources/test-dirs/raw_input/file7_badJson.pdf
* (add) tika-eval/src/test/java/org/apache/tika/MockDBWriter.java
* (edit) LICENSE.txt
* (add) tika-eval/src/main/resources/comparison-reports.xml
* (add) tika-eval/src/test/resources/test-dirs/extractsB/file2_attachANotB.doc.json
* (add) tika-eval/src/main/java/org/apache/tika/eval/batch/FileComparerBuilder.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/io/DBWriter.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/db/DBBuffer.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/reports/Report.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/reports/XSLXCellFormatter.java
* (add) tika-eval/src/test/java/org/apache/tika/eval/io/FatalExceptionReaderTest.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/io/IDBWriter.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/batch/SingleFileConsumerBuilder.java
* (add) tika-eval/src/test/resources/test-dirs/extractsB/file12_es.txt.json
* (add) tika-eval/src/test/resources/test-dirs/extractsA/file3_attachBNotA.doc.json
* (add) tika-eval/src/test/resources/test-dirs/extractsB/file4_emptyB.pdf.json
* (add) tika-eval/src/main/java/org/apache/tika/eval/TikaEvalCLI.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/batch/EvalConsumerBuilder.java
* (add) tika-eval/src/test/resources/test-dirs/extractsB/file5_emptyA.pdf.json
* (edit) pom.xml
* (add) tika-eval/src/main/java/org/apache/tika/eval/reports/XLSXNumFormatter.java
* (add) tika-eval/src/main/java/org/apache/tika/eval/ExtractProfiler.java
* (add) tika-eval/src/test/resources/test-dirs/raw_input/file4_emptyB.pdf
* (add) tika-eval/src/test/resources/test-dirs/extractsA/file13_attachANotB.doc.json
* (add) tika-eval/src/main/java/org/apache/tika/eval/db/MimeBuffer.java
* (add) tika-eval/src/test/java/org/apache/tika/eval/TikaEvalCLITest.java
* (add) tika-eval/src/test/resources/log4j_process.properties
* (add) tika-eval/src/main/resources/lucene-analyzers.json
* (add) tika-eval/src/test/resources/test-dirs/extractsB/file7_badJson.pdf.json
* (add) tika-eval/src/test/resources/test-dirs/extractsB/file8_IOEx.pdf.json


> Create "eval" code
> ------------------
>
>                 Key: TIKA-1332
>                 URL: https://issues.apache.org/jira/browse/TIKA-1332
>             Project: Tika
>          Issue Type: Sub-task
>          Components: cli, general, server
>            Reporter: Tim Allison
>            Assignee: Tim Allison
>             Fix For: 2.0, 1.15
>
>         Attachments: comparison_reports.xml
>
>
> For this issue, we can start with code to gather statistics on each run (number of exceptions per file type, most common exceptions per file type, number of metadata items, total text extracted, etc.).  We should also be able to compare one run against another.  Going forward, there's plenty of room to improve.



--
This message was sent by Atlassian JIRA
(v6.3.15#6346)