This is an automated email from the ASF dual-hosted git repository.
tallison pushed a change to branch TIKA-4662
in repository https://gitbox.apache.org/repos/asf/tika.git
from ae04d333f6 TIKA-4662 -- update common tokens and rebuild model
add 0e53b3cff8 TIKA-4662 -- checkpoint
add eaa968cd29 TIKA-4662 -- checkpoint
add 7fded913f0 TIKA-4662 -- checkpoint
add c4b596f453 TIKA-4662 -- checkpoint
add f9475801d0 TIKA-4662 -- checkpoint
add feff1e1639 TIKA-4662 -- checkpoint
add 28a1e110b1 resolve stash pop conflicts
add d4d441e783 Bump org.springframework:spring-context from 7.0.4 to 7.0.5
(#2643)
add e627b4b3e6 Bump org.apache.maven.plugins:maven-surefire-plugin from
3.5.4 to 3.5.5 (#2644)
add bf4b55424f Bump com.nimbusds:nimbus-jose-jwt from 10.7 to 10.8 (#2640)
add 7568faf8b8 Bump com.fasterxml.jackson:jackson-bom from 2.21.0 to
2.21.1 (#2631)
add 7a43f5ce20 Bump twelvemonkeys.version from 3.13.0 to 3.13.1 (#2630)
add 7616e9ebd8 Bump software.amazon.awssdk:bom from 2.41.29 to 2.41.34
(#2629)
add 2e33ce5214 Bump org.jetbrains:annotations from 26.0.2-1 to 26.1.0
(#2635)
add 4db18e5341 Bump org.apache.kafka:kafka-clients from 4.1.1 to 4.2.0
(#2636)
add e1d3bfe5c2 Bump net.sourceforge.tess4j:tess4j from 5.16.0 to 5.18.0
(#2628)
add c6cd5a71bf Bump google-auth-library-oauth2-http.version from 1.42.1 to
1.43.0 (#2627)
add 372d74f646 Bump com.googlecode.plist:dd-plist from 1.28 to 1.29 (#2625)
add 9fb60a94b8 Bump com.mchange:mchange-commons-java from 0.3.2 to 0.4.0
(#2626)
add 1c62c0ab3d Bump org.apache.maven.plugins:maven-failsafe-plugin from
3.5.4 to 3.5.5 (#2641)
add 3f26230c4a Bump org.jetbrains.kotlin:kotlin-stdlib-jdk8 from 1.9.10 to
2.3.10 (#2632)
add 1f0ca1b9c6 Bump org.jetbrains.kotlin:kotlin-stdlib from 1.9.10 to
2.3.10 (#2633)
add 24daa28bc0 Bump org.codehaus.mojo:flatten-maven-plugin from 1.6.0 to
1.7.3 (#2642)
add acc65b8683 Bump commonmark.version from 0.24.0 to 0.27.1 (#2638)
add bab4893999 Bump org.jetbrains.kotlin:kotlin-stdlib-common from 1.9.10
to 2.0.21 (#2637)
add ac1836b5ef Bump org.jetbrains.kotlin:kotlin-stdlib-jdk7 from 1.9.10 to
2.3.10 (#2639)
add 2a5bb03109 upgrade okhttp (#2646)
add 52fc58d6f8 TIKA-4327: update aws
add e7238901df TIKA-4675 -- improve wide unicode detection (#2647)
add b9b6efae2b TIKA-4327: update c3p0
add 608019c068 TIKA-4327: update aws
add 3b53d0d1dd TIKA-4674 - progress timeout (#2650)
add fdeb82f179 simplify serialization, take 2 (#2651)
add 8c0329132a TIKA-4676 -- refactor inference and fix endian bug in
ESEmitter (#2653)
add 8d8f3440f7 TIKA-4327: update aws, jwarc
add e5151b1e5b TIKA-4327: update google-api
add 2fd8c0eda2 TIKA-4327: update microsoft-graph.version, maven.bundle,
aws, google cloud, junrar, mockito, error_prone_annotations
add 9451da2d56 TIKA-4606: Upgrade Apache Ignite from 2.x to 3.x (fresh)
(#2654)
add 2a9957a12b Bump org.tukaani:xz from 1.11 to 1.12 (#2670)
add 0385b58466 Bump io.swagger.core.v3:swagger-annotations from 2.2.38 to
2.2.43 (#2669)
add 1d46c8b97f Bump org.jetbrains.kotlin:kotlin-stdlib from 2.2.0 to
2.3.10 (#2663)
add aaef3ca7a3 Bump info.picocli:picocli from 4.7.5 to 4.7.7 (#2661)
add b9903d0840 Bump org.jetbrains:annotations from 26.0.2-1 to 26.1.0
(#2659)
add 4ba11a4e19 Bump org.yaml:snakeyaml from 2.4 to 2.6 (#2671)
add 4c9017fca3 Bump jakarta.inject:jakarta.inject-api from 2.0.1 to
2.0.1.MR (#2667)
add 93b5cfa96f TIKA-4488: update micronaut
add d139bfe02e TIKA-4488: add micronaut version
add a7116b05d9 TIKA-4488: add micronaut version
add 27933e64b9 TIKA-4327: add comment
add 9f94799669 TIKA-4327: update tyrus, kiota, solrj, spotless-maven-plugin
add 30e46db4fa TIKA-4606: Add e2e tests for Ignite 3.x upgrade (#2655)
add ca67465e90 TIKA-4327: update aws, swagger, jackrabbit; add comment on
solrj 10 migration
add fdac94fc18 TIKA-4682 4x tweaks (#2674)
add 599a0427a4 TIKA-4327: update aws, zookeeper, shade plugin, azure
add b70d2ba1b0 TIKA-4327: remove dependency that is in parent
add c0e0d8fccc TIKA-4327: remove dependency that is in parent
add 96002a73d4 add md summary and other cli improvements (#2676)
add ad61f26e75 TIKA-4672 - add an Elasticsearch emitter (#2622)
add 9ac817e1c0 clean up bom pom
add bb5a3f660e maybe fix flaky solr tests (#2678)
add 9627b99958 TIKA-4685 chardet (#2677)
add 364e3200e0 TIKA-4327: update aws
add 2610663401 fix tests, revert errant .local-repo setting (#2680)
add b3023c47bc TIKA-4685 - add annotation processor for jdk >23 (#2679)
add 9c69600a72 Bump joda-time:joda-time from 2.14.0 to 2.14.1 (#2689)
add 2f8b753d55 Bump org.apache.maven.plugins:maven-compiler-plugin (#2690)
add 2f8c70ad2d Bump junit6.version from 6.1.0-M1 to 6.0.3 (#2688)
add 9846674af0 Bump org.awaitility:awaitility from 4.2.0 to 4.3.0 (#2687)
add c002b1b9d4 Bump org.apache.maven.plugins:maven-surefire-plugin from
3.5.2 to 3.5.5 (#2686)
add dd4a7e0979 Bump commons-logging:commons-logging from 1.3.5 to 1.3.6
(#2685)
add adf6269947 Bump org.apache.maven:maven-model from 3.9.12 to 3.9.13
(#2684)
add 28f276cccb Bump org.projectlombok:lombok from 1.18.32 to 1.18.42
(#2682)
add ea31f4d44b Bump org.slf4j:slf4j-api from 2.0.16 to 2.0.17 (#2681)
add fe38956b28 Merge main into TIKA-4662
add 09dd09ac5e fix: remove duplicate tika-ml and stale
tika-charset-detectors from root pom.xml
add 7e6844042c fix: restore tika-encoding-detectors files from main
(correct package declarations)
add 0cec71b082 TIKA-4662 -- checkpoint
add 24c1afb8c7 TIKA-4662 -- checkpoint
add f1994788a5 TIKA-4662 -- checkpoint
add bf150690bc TIKA-4327: update pdfbox
add 62158b7e16 TIKA-4614: activate disable part now that pdfbox 3.0.7 has
been released
add ccf1aee21c TIKA-4614: fix javadoc
add b08ba012cf Merge branch 'main' into TIKA-4662
No new revisions were added by this update.
Summary of changes:
.github/workflows/main-jdk17-build.yml | 19 +
.../main-jdk17-windows-build-multi-locale.yml | 2 +-
.github/workflows/main-jdk17-windows-build.yml | 2 +-
.gitignore | 4 +-
.java-version | 18 -
.mvn/maven.config | 1 +
docs/modules/ROOT/nav.adoc | 2 +
.../pages/advanced/charset-detection-design.adoc | 456 +
.../advanced/charsoup-supported-languages.adoc | 175 +
.../advanced/lang-detection/flores-AUTOMATIC.log | 521 +
.../advanced/lang-detection/flores-SHORT_TEXT.log | 399 +
.../advanced/lang-detection/flores-STANDARD.log | 473 +
.../advanced/lang-detection/flores200-dev-eval.md | 187 +
.../lang-detection/language-drop-decisions.md | 158 +
.../short-text-language-decisions.md | 367 +
.../advanced/lang-detection/supported-languages.md | 223 +
.../pages/advanced/language-detection-build.adoc | 552 +-
.../ROOT/pages/advanced/language-detection.adoc | 264 +-
docs/modules/ROOT/pages/pipes/index.adoc | 101 +-
.../ROOT/pages/pipes/shared-server-mode.adoc | 10 +-
docs/modules/ROOT/pages/pipes/timeouts.adoc | 170 +
docs/pom.xml | 14 +
pom.xml | 10 +-
run-lang-train.sh | 30 -
tika-app/pom.xml | 14 +-
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 10 +-
.../test/resources/configs/config-template.json | 6 +-
tika-bom/pom.xml | 30 -
tika-charset-detectors/pom.xml | 43 -
.../tika/detect/encoding/BomEncodingDetector.java | 101 -
.../detect/encoding/ByteNgramFeatureExtractor.java | 155 -
.../tika/detect/encoding/FeatureExtractor.java | 40 -
.../encoding/HttpHeaderEncodingDetector.java | 83 -
.../apache/tika/detect/encoding/LinearModel.java | 372 -
.../tika/detect/encoding/MlEncodingDetector.java | 377 -
.../apache/tika/detect/encoding/Prediction.java | 94 -
.../detect/html/StandardHtmlEncodingDetector.java | 113 -
.../encoding/ByteNgramFeatureExtractorTest.java | 120 -
.../detect/encoding/CharsetConfusablesTest.java | 180 -
.../tika-charset-detectors-icu4j/pom.xml | 58 -
.../encoding/tools/EvalCharsetDetectors.java | 326 -
.../src/main/python/build_charset_training.py | 562 -
.../apache/tika/config/TikaProgressTracker.java | 82 +
.../org/apache/tika/config/TikaTaskTimeout.java | 76 -
.../java/org/apache/tika/config/TimeoutLimits.java | 112 +-
.../org/apache/tika/detect/AutoDetectReader.java | 8 +-
.../java/org/apache/tika/detect}/BOMDetector.java | 35 +-
.../tika/detect/CompositeEncodingDetector.java | 109 +-
.../tika/detect/DefaultEncodingDetector.java | 36 +-
.../org/apache/tika/detect/EncodingDetector.java | 48 +-
.../tika/detect/EncodingDetectorContext.java | 98 +-
.../org/apache/tika/detect/EncodingResult.java | 170 +
.../tika/detect/MetadataCharsetDetector.java | 132 +
.../tika/detect/OverrideEncodingDetector.java | 7 +-
.../tika/language/detect/LanguageDetector.java | 21 +-
.../apache/tika/metadata/TikaCoreProperties.java | 7 +
.../metadata/filter/CompositeMetadataFilter.java | 13 +-
.../tika/metadata/filter/MetadataFilter.java | 35 +-
.../tika/metadata/filter/MetadataFilterBase.java | 3 +-
.../apache/tika/metadata/filter/NoOpFilter.java | 3 +-
.../filter/RemoveByMimeMetadataFilter.java | 3 +-
.../tika/parser/external2/ExternalParser.java | 6 +-
.../org.apache.tika.detect.EncodingDetector | 18 +-
.../tika/config/TikaProgressTrackerTest.java | 103 +
.../org/apache/tika/detect}/BOMDetectorTest.java | 11 +-
tika-e2e-tests/README.md | 12 +-
tika-e2e-tests/pom.xml | 74 +-
tika-e2e-tests/tika-grpc/README.md | 100 +-
tika-e2e-tests/tika-grpc/pom.xml | 52 +-
.../tika/parser/ocr/TesseractOCRConfig.properties | 25 -
.../customocr/tika-config-inline.json | 26 -
.../customocr/tika-config-inline.xml | 49 -
.../customocr/tika-config-rendered.json | 28 -
.../customocr/tika-config-rendered.xml | 55 -
.../tika/parser/journal/GrobidExtractor.properties | 16 -
.../sample-configs/grobid/tika-config.json | 23 -
.../sample-configs/grobid/tika-config.xml | 41 -
.../tika-grpc/sample-configs/ignite/README.md | 117 -
.../sample-configs/ignite/tika-config-ignite.json | 2 +-
.../sample-configs/ner/run_tika_server.sh | 62 -
.../tika-grpc/sample-configs/ner/tika-config.json | 26 -
.../tika-grpc/sample-configs/ner/tika-config.xml | 45 -
.../tika-grpc/sample-configs/test-simple.json | 20 -
.../vision/inception-rest-caption.json | 18 -
.../vision/inception-rest-caption.xml | 32 -
.../vision/inception-rest-video.json | 18 -
.../sample-configs/vision/inception-rest-video.xml | 32 -
.../sample-configs/vision/inception-rest.json | 18 -
.../sample-configs/vision/inception-rest.xml | 32 -
.../org/apache/tika/pipes/ExternalTestBase.java | 285 +-
.../pipes/filesystem/FileSystemFetcherTest.java | 79 +-
.../tika/pipes/ignite/IgniteConfigStoreTest.java | 679 +-
.../java/org/apache/tika/pipes/ignite/README.md | 172 -
.../src/test/resources/docker-compose-ignite.yml | 25 -
.../src/test/resources/docker-compose.yml | 16 -
.../tika-grpc/src/test/resources/log4j2.xml | 19 -
.../src/test/resources/test-fixtures/sample.csv | 4 +
.../src/test/resources/test-fixtures/sample.html | 8 +
.../src/test/resources/test-fixtures/sample.txt | 3 +
.../src/test/resources/test-fixtures/sample.xml | 5 +
...g-ignite.json => tika-config-ignite-local.json} | 4 +-
.../src/test/resources/tika-config-ignite.json | 2 +-
.../tika-grpc/src/test/resources/tika-config.json | 49 +-
{tika-bundles => tika-encoding-detectors}/pom.xml | 45 +-
.../tika-encoding-detector-charsoup}/pom.xml | 47 +-
.../charsoup/CharSoupEncodingDetector.java | 383 +
.../org.apache.tika.detect.EncodingDetector | 8 +-
.../charsoup/CharSoupEncodingDetectorTest.java | 54 +-
.../langdetect/charsoup/TextQualityDiagTest.java | 0
.../tika-encoding-detector-html}/pom.xml | 36 +-
.../tika/parser}/html/HtmlEncodingDetector.java | 25 +-
.../html/charsetdetector}/CharsetAliases.java | 19 +-
.../charsetdetector}/CharsetDetectionResult.java | 2 +-
.../html/charsetdetector}/MetaProcessor.java | 4 +-
.../parser/html/charsetdetector}/PreScanner.java | 15 +-
.../StandardHtmlEncodingDetector.java | 138 +
.../charsets/ReplacementCharset.java | 2 +-
.../charsets/XUserDefinedCharset.java | 2 +-
.../org.apache.tika.detect.EncodingDetector | 3 +-
.../tika-encoding-detector-icu4j}/pom.xml | 58 +-
.../apache/tika/parser/txt/CharsetDetector.java | 0
.../org/apache/tika/parser/txt/CharsetMatch.java | 0
.../apache/tika/parser/txt/CharsetRecog_2022.java | 0
.../apache/tika/parser/txt/CharsetRecog_UTF8.java | 0
.../tika/parser/txt/CharsetRecog_Unicode.java | 0
.../apache/tika/parser/txt/CharsetRecog_mbcs.java | 0
.../apache/tika/parser/txt/CharsetRecog_sbcs.java | 0
.../apache/tika/parser/txt/CharsetRecognizer.java | 0
.../tika/parser/txt}/Icu4jEncodingDetector.java | 53 +-
.../tika/parser/txt/CharsetDetectorTest.java | 1 +
.../configs/tika-config-ignore-charset.json | 0
.../resources/test-documents/multi-language.txt | 0
.../src/test/resources/test-documents/resume.html | 0
.../resources/test-documents/testIgnoreCharset.txt | 0
.../resources/test-documents/testTXT_win-1252.txt | 0
.../test-documents/test_ignore_IBM420.html | Bin
.../tika-encoding-detector-mojibuster}/pom.xml | 36 +-
.../ml/chardetect/ByteNgramFeatureExtractor.java | 290 +
.../tika/ml/chardetect}/CharsetConfusables.java | 111 +-
.../tika/ml/chardetect/CjkEncodingRules.java | 400 +
.../ml/chardetect/MojibusterEncodingDetector.java | 720 +
.../ml/chardetect}/StructuralEncodingRules.java | 228 +-
.../org.apache.tika.detect.EncodingDetector | 6 +-
.../org/apache/tika/ml/chardetect/chardetect.bin | Bin 0 -> 606934 bytes
.../tika/ml/chardetect/EbcdicRoutingTest.java | 104 +
.../ml/chardetect/ZipFilenameDetectionTest.java | 132 +
.../tika-encoding-detector-universal}/pom.xml | 36 +-
.../parser/txt}/UniversalEncodingDetector.java | 53 +-
.../parser/txt}/UniversalEncodingListener.java | 2 +-
tika-eval/tika-eval-app/pom.xml | 29 +-
.../tika-eval-app}/src/main/assembly/assembly.xml | 4 -
.../java/org/apache/tika/eval/app/EvalConfig.java | 2 +-
.../tika/eval/app/ExtractComparerRunner.java | 88 +-
.../java/org/apache/tika/eval/app/TikaEvalCLI.java | 58 +
.../eval/app/reports/MarkdownSummaryWriter.java | 611 +
.../tika/eval/app/reports/ResultsReporter.java | 7 +-
.../apache/tika/eval/app/SimpleComparerTest.java | 18 +-
.../org/apache/tika/eval/app/TikaEvalCLITest.java | 13 +-
.../resources/test-dirs/extractsB/file1.pdf.json | 2 +-
.../src/main/resources/common_tokens/ace | 1578 +
.../src/main/resources/common_tokens/afr | 60006 +++++++++---------
.../src/main/resources/common_tokens/aka | 9272 +++
.../src/main/resources/common_tokens/alt | 2668 +
.../src/main/resources/common_tokens/amh | 19139 +++---
.../src/main/resources/common_tokens/ami | 2574 +
.../src/main/resources/common_tokens/ara | 60008 +++++++++----------
.../src/main/resources/common_tokens/arg | 32637 +++++++---
.../src/main/resources/common_tokens/arz | 17123 ------
.../src/main/resources/common_tokens/asm | 10793 ++--
.../src/main/resources/common_tokens/ast | 30020 ----------
.../src/main/resources/common_tokens/ava | 2453 +
.../src/main/resources/common_tokens/avk | 6710 +++
.../src/main/resources/common_tokens/azb | 26484 ++++++++
.../src/main/resources/common_tokens/aze | 60008 +++++++++----------
.../src/main/resources/common_tokens/bak | 59988 +++++++++---------
.../src/main/resources/common_tokens/bam | 1192 -
.../src/main/resources/common_tokens/ban | 12613 ++--
.../src/main/resources/common_tokens/bar | 31854 +++++-----
.../src/main/resources/common_tokens/bcl | 14470 ++++-
.../src/main/resources/common_tokens/be-x-old | 30020 ++++++++++
.../src/main/resources/common_tokens/bel | 60008 +++++++++----------
.../src/main/resources/common_tokens/ben | 44061 +++++---------
.../src/main/resources/common_tokens/bjn | 8705 +--
.../src/main/resources/common_tokens/bos | 30020 ----------
.../src/main/resources/common_tokens/bpy | 1001 -
.../src/main/resources/common_tokens/bre | 47871 +++++++++------
.../src/main/resources/common_tokens/bua | 2734 -
.../src/main/resources/common_tokens/bul | 60008 +++++++++----------
.../src/main/resources/common_tokens/bxr | 3556 ++
.../src/main/resources/common_tokens/cat | 60008 +++++++++----------
.../src/main/resources/common_tokens/cdo-x-rom | 633 +
.../src/main/resources/common_tokens/ceb | 60002 +++++++++---------
.../src/main/resources/common_tokens/ces | 60008 +++++++++----------
.../src/main/resources/common_tokens/che | 36578 +++++++++--
.../src/main/resources/common_tokens/chv | 24222 +++++---
.../src/main/resources/common_tokens/ckb | 46397 ++++++++------
.../src/main/resources/common_tokens/cnh | 17224 ++++++
.../src/main/resources/common_tokens/cor | 3558 ++
.../src/main/resources/common_tokens/cos | 10474 +++-
.../src/main/resources/common_tokens/csb | 3585 +-
.../src/main/resources/common_tokens/cym | 50185 +++++++++-------
.../src/main/resources/common_tokens/dag | 4433 ++
.../src/main/resources/common_tokens/dan | 60008 +++++++++----------
.../src/main/resources/common_tokens/deu | 60008 +++++++++----------
.../src/main/resources/common_tokens/diq | 11793 ++--
.../src/main/resources/common_tokens/div | 31671 +---------
.../src/main/resources/common_tokens/dsb | 3542 +-
.../src/main/resources/common_tokens/ell | 60008 +++++++++----------
.../src/main/resources/common_tokens/eml | 5000 --
.../src/main/resources/common_tokens/eng | 60008 +++++++++----------
.../src/main/resources/common_tokens/epo | 60008 +++++++++----------
.../src/main/resources/common_tokens/est | 60008 +++++++++----------
.../src/main/resources/common_tokens/eus | 59994 +++++++++---------
.../src/main/resources/common_tokens/ewe | 12375 +++-
.../src/main/resources/common_tokens/ext | 5545 +-
.../src/main/resources/common_tokens/fao | 39567 +++---------
.../src/main/resources/common_tokens/fas | 60008 +++++++++----------
.../src/main/resources/common_tokens/fin | 60008 +++++++++----------
.../src/main/resources/common_tokens/fra | 60008 +++++++++----------
.../src/main/resources/common_tokens/frr | 6648 +-
.../src/main/resources/common_tokens/fry | 59942 +++++++++---------
.../src/main/resources/common_tokens/gla | 6667 ++
.../src/main/resources/common_tokens/gle | 52183 ++++++++--------
.../src/main/resources/common_tokens/glg | 59850 +++++++++---------
.../src/main/resources/common_tokens/glv | 6204 +-
.../src/main/resources/common_tokens/gom | 13378 ++---
.../src/main/resources/common_tokens/grn | 7802 +--
.../src/main/resources/common_tokens/gsw | 59486 +++++++++---------
.../src/main/resources/common_tokens/guj | 24230 ++------
.../src/main/resources/common_tokens/hak-x-rom | 693 +
.../src/main/resources/common_tokens/hat | 6627 --
.../src/main/resources/common_tokens/hau | 29608 ++++++---
.../src/main/resources/common_tokens/hbs | 30020 ----------
.../src/main/resources/common_tokens/heb | 60008 +++++++++----------
.../src/main/resources/common_tokens/hif | 2137 -
.../src/main/resources/common_tokens/hil | 18496 ++++++
.../src/main/resources/common_tokens/hin | 36421 ++---------
.../src/main/resources/common_tokens/hrv | 60008 +++++++++----------
.../src/main/resources/common_tokens/hsb | 13309 ++--
.../src/main/resources/common_tokens/hun | 60008 +++++++++----------
.../src/main/resources/common_tokens/hye | 60008 +++++++++----------
.../src/main/resources/common_tokens/hyw | 19125 ++++++
.../src/main/resources/common_tokens/ibo | 19379 +++++-
.../src/main/resources/common_tokens/ido | 21997 ++++---
.../src/main/resources/common_tokens/ile | 4013 +-
.../src/main/resources/common_tokens/ilo | 10135 ++--
.../src/main/resources/common_tokens/ina | 15743 ++---
.../src/main/resources/common_tokens/ind | 60008 +++++++++----------
.../src/main/resources/common_tokens/isl | 60008 +++++++++----------
.../src/main/resources/common_tokens/ita | 60008 +++++++++----------
.../src/main/resources/common_tokens/jav | 59433 +++++++++---------
.../src/main/resources/common_tokens/jbo | 1071 +
.../src/main/resources/common_tokens/jpn | 60008 +++++++++----------
.../src/main/resources/common_tokens/kaa | 4617 ++
.../src/main/resources/common_tokens/kab | 3263 +
.../src/main/resources/common_tokens/kal | 6341 --
.../src/main/resources/common_tokens/kan | 58858 +++++++++---------
.../src/main/resources/common_tokens/kat | 60006 +++++++++---------
.../src/main/resources/common_tokens/kaz | 59998 +++++++++---------
.../src/main/resources/common_tokens/kha | 9653 +++
.../src/main/resources/common_tokens/khk | 4187 --
.../src/main/resources/common_tokens/khm | 8745 +++
.../src/main/resources/common_tokens/kin | 15225 ++---
.../src/main/resources/common_tokens/kir | 60004 +++++++++---------
.../src/main/resources/common_tokens/koi | 1373 -
.../src/main/resources/common_tokens/kom | 2382 -
.../src/main/resources/common_tokens/kor | 60008 +++++++++----------
.../src/main/resources/common_tokens/kpv | 3445 ++
.../src/main/resources/common_tokens/krc | 1974 -
.../src/main/resources/common_tokens/ksh | 5117 +-
.../src/main/resources/common_tokens/kur | 32152 ++++++----
.../src/main/resources/common_tokens/lad | 1681 -
.../src/main/resources/common_tokens/lao | 1479 +-
.../src/main/resources/common_tokens/lat | 59988 +++++++++---------
.../src/main/resources/common_tokens/lav | 60008 +++++++++----------
.../src/main/resources/common_tokens/lez | 3913 ++
.../src/main/resources/common_tokens/lfn | 5582 ++
.../src/main/resources/common_tokens/lim | 48870 ++++++---------
.../src/main/resources/common_tokens/lit | 60008 +++++++++----------
.../src/main/resources/common_tokens/lmo | 6924 ---
.../src/main/resources/common_tokens/ltz | 60006 +++++++++---------
.../src/main/resources/common_tokens/lug | 35119 ++---------
.../src/main/resources/common_tokens/lup | 905 -
.../src/main/resources/common_tokens/lus | 40004 ++++++++----
.../src/main/resources/common_tokens/mai | 755 -
.../src/main/resources/common_tokens/mal | 59240 +++++++++---------
.../src/main/resources/common_tokens/mar | 39628 +++---------
.../src/main/resources/common_tokens/mhr | 9769 +--
.../src/main/resources/common_tokens/min | 29209 +++++----
.../src/main/resources/common_tokens/mkd | 60008 +++++++++----------
.../src/main/resources/common_tokens/mlg | 38614 +++++++++---
.../src/main/resources/common_tokens/mlt | 47776 ++++++---------
.../src/main/resources/common_tokens/mon | 57279 +++++++++---------
.../src/main/resources/common_tokens/mri | 9729 ---
.../src/main/resources/common_tokens/mrj | 3902 +-
.../src/main/resources/common_tokens/msa | 60006 +++++++++---------
.../src/main/resources/common_tokens/mwl | 25861 ++++----
.../src/main/resources/common_tokens/mya | 30020 ++++++++++
.../src/main/resources/common_tokens/myv | 3154 +-
.../src/main/resources/common_tokens/mzn | 10405 ++--
.../src/main/resources/common_tokens/nan | 6673 ---
.../src/main/resources/common_tokens/nap | 2039 -
.../src/main/resources/common_tokens/nav | 533 -
.../src/main/resources/common_tokens/ndo | 3142 -
.../src/main/resources/common_tokens/nds | 54491 +++++++++--------
.../src/main/resources/common_tokens/nep | 35836 ++---------
.../src/main/resources/common_tokens/new | 2545 -
.../src/main/resources/common_tokens/nld | 60008 +++++++++----------
.../src/main/resources/common_tokens/nno | 60006 +++++++++---------
.../src/main/resources/common_tokens/nob | 60008 +++++++++----------
.../src/main/resources/common_tokens/nqo | 2779 +
.../src/main/resources/common_tokens/nso | 5150 +-
.../src/main/resources/common_tokens/nya | 30020 ++++++++++
.../src/main/resources/common_tokens/olo | 2220 +
.../src/main/resources/common_tokens/ori | 12884 ++--
.../src/main/resources/common_tokens/orm | 31991 +++++++++-
.../src/main/resources/common_tokens/oss | 7575 ++-
.../src/main/resources/common_tokens/pam | 6310 +-
.../src/main/resources/common_tokens/pan | 11564 +---
.../src/main/resources/common_tokens/pap | 12598 ++--
.../src/main/resources/common_tokens/pfl | 4325 +-
.../src/main/resources/common_tokens/pms | 6552 --
.../src/main/resources/common_tokens/pnb | 57576 +++++++++---------
.../src/main/resources/common_tokens/pol | 60008 +++++++++----------
.../src/main/resources/common_tokens/por | 60008 +++++++++----------
.../src/main/resources/common_tokens/prs | 12167 ----
.../src/main/resources/common_tokens/pus | 51255 +++++++++-------
.../src/main/resources/common_tokens/que | 2170 -
.../src/main/resources/common_tokens/roh | 33539 ++++-------
.../src/main/resources/common_tokens/ron | 60008 +++++++++----------
.../src/main/resources/common_tokens/rue | 6615 +-
.../src/main/resources/common_tokens/run | 3534 --
.../src/main/resources/common_tokens/rus | 60008 +++++++++----------
.../src/main/resources/common_tokens/sah | 31037 +++++-----
.../src/main/resources/common_tokens/san | 14998 ++---
.../src/main/resources/common_tokens/sat | 6387 ++
.../src/main/resources/common_tokens/scn | 7559 ---
.../src/main/resources/common_tokens/sco | 12070 ----
.../src/main/resources/common_tokens/sgs | 5547 +-
.../src/main/resources/common_tokens/sin | 34762 +++++------
.../src/main/resources/common_tokens/skr | 6326 ++
.../src/main/resources/common_tokens/slk | 60008 +++++++++----------
.../src/main/resources/common_tokens/slv | 60008 +++++++++----------
.../src/main/resources/common_tokens/sme | 5891 +-
.../src/main/resources/common_tokens/smi | 1676 -
.../src/main/resources/common_tokens/smn | 2934 +
.../src/main/resources/common_tokens/smo | 24490 ++++++++
.../src/main/resources/common_tokens/sna | 29768 ++-------
.../src/main/resources/common_tokens/snd | 26767 +++++----
.../src/main/resources/common_tokens/som | 32074 +++-------
.../src/main/resources/common_tokens/sot | 3535 --
.../src/main/resources/common_tokens/spa | 60008 +++++++++----------
.../src/main/resources/common_tokens/sqi | 60008 +++++++++----------
.../src/main/resources/common_tokens/srd | 3796 --
.../src/main/resources/common_tokens/srp | 60008 +++++++++----------
.../src/main/resources/common_tokens/ssw | 2035 -
.../src/main/resources/common_tokens/stq | 3423 ++
.../src/main/resources/common_tokens/sun | 45923 ++++++--------
.../src/main/resources/common_tokens/swe | 60006 +++++++++---------
.../src/main/resources/common_tokens/swh | 28444 +++++++--
.../src/main/resources/common_tokens/szl | 8763 ++-
.../src/main/resources/common_tokens/szy | 4825 ++
.../src/main/resources/common_tokens/tam | 44673 +++++---------
.../src/main/resources/common_tokens/tat | 60008 +++++++++----------
.../src/main/resources/common_tokens/tay | 1220 +
.../src/main/resources/common_tokens/tel | 44436 +++++---------
.../src/main/resources/common_tokens/tet | 24501 ++++++++
.../src/main/resources/common_tokens/tgk | 57085 +++++++++---------
.../src/main/resources/common_tokens/tgl | 59972 +++++++++---------
.../src/main/resources/common_tokens/tha | 53250 +++++++---------
.../src/main/resources/common_tokens/tir | 30020 ++++++++++
.../src/main/resources/common_tokens/trv | 3333 +
.../src/main/resources/common_tokens/tsn | 17815 ++++--
.../src/main/resources/common_tokens/tso | 15810 +++--
.../src/main/resources/common_tokens/tuk | 30868 ++++------
.../src/main/resources/common_tokens/tum | 4881 ++
.../src/main/resources/common_tokens/tur | 60008 +++++++++----------
.../src/main/resources/common_tokens/tyv | 8541 ++-
.../src/main/resources/common_tokens/udm | 31389 +++++++++-
.../src/main/resources/common_tokens/uig | 31403 +++++-----
.../src/main/resources/common_tokens/ukr | 60008 +++++++++----------
.../src/main/resources/common_tokens/urd | 60006 +++++++++---------
.../src/main/resources/common_tokens/uzb | 59970 +++++++++---------
.../src/main/resources/common_tokens/uzn | 30020 ----------
.../src/main/resources/common_tokens/ven | 2457 -
.../src/main/resources/common_tokens/vep | 7276 +++
.../src/main/resources/common_tokens/vie | 51687 +++++++---------
.../src/main/resources/common_tokens/vls | 15292 ++---
.../src/main/resources/common_tokens/vol | 7216 +--
.../src/main/resources/common_tokens/vro | 4730 +-
.../src/main/resources/common_tokens/war | 55938 ++++++++---------
.../src/main/resources/common_tokens/wln | 12282 ++--
.../src/main/resources/common_tokens/wuu | 30020 ----------
.../src/main/resources/common_tokens/xho | 51611 +++++++++-------
.../src/main/resources/common_tokens/xmf | 15787 +++--
.../src/main/resources/common_tokens/ydd | 21613 ++++---
.../src/main/resources/common_tokens/yor | 7833 ++-
.../src/main/resources/common_tokens/yue | 9176 +++
.../src/main/resources/common_tokens/zea | 2318 -
.../src/main/resources/common_tokens/zho | 60008 +++++++++----------
.../src/main/resources/common_tokens/zul | 35021 ++---------
.../core/metadata/TikaEvalMetadataFilterTest.java | 13 +-
.../tika/eval/core/textstats/TextStatsTest.java | 28 +-
.../core/tokens/tools/CommonTokenGenerator.java | 102 +-
tika-example/pom.xml | 2 +-
.../java/org/apache/tika/example/Language.java | 5 +-
.../tika/example/LanguageDetectorExample.java | 3 +-
.../java/org/apache/tika/example/MyFirstTika.java | 3 +-
.../tika/example/PipesForkParserExample.java | 10 +-
.../tika/example/LanguageDetectorExampleTest.java | 2 +-
tika-grpc/dev-tika-config.json | 3 +-
tika-grpc/pom.xml | 22 +-
tika-grpc/run-dev.sh | 15 +-
.../org/apache/tika/pipes/grpc/TikaGrpcServer.java | 7 +-
.../apache/tika/pipes/grpc/TikaGrpcServerImpl.java | 39 +-
tika-grpc/src/main/proto/tika.proto | 2 +
.../src/test/resources/tika-config-ignite.json | 2 +-
.../src/test/resources/tika-pipes-test-config.json | 14 +-
tika-integration-tests/pom.xml | 1 +
.../pom.xml | 19 +-
.../elasticsearch/tests/ElasticsearchTest.java | 740 +
.../tests/ElasticsearchTestClient.java} | 49 +-
.../elasticsearch/elasticsearch-mappings.json} | 0
.../elasticsearch-parent-child-mappings.json} | 0
.../elasticsearch-vector-mappings.json | 17 +
.../resources/elasticsearch}/plugins-template.json | 46 +-
.../resources/pipes-fork-server-custom-log4j2.xml | 0
.../src/test/resources/test-documents/fake_oom.xml | 0
.../src/test/resources/test-documents/npe.xml | 0
.../src/test/resources/test-documents/oom.xml | 0
.../test-documents}/test_recursive_embedded.docx | Bin
.../tika/pipes/kafka/tests/TikaPipesKafkaTest.java | 5 +-
.../src/test/resources/kafka/plugins-template.json | 1 -
.../resources/opensearch/plugins-template.json | 11 +-
.../opensearch/tika-config-opensearch.json | 11 +-
.../src/test/resources/s3/plugins-template.json | 1 -
.../pipes/solr/tests/TikaPipesSolrTestBase.java | 60 +-
.../src/test/resources/solr/plugins-template.json | 11 +-
.../src/test/resources/tika-config-solr-urls.json | 5 +-
tika-langdetect/pom.xml | 3 +-
.../charsoup/CharSoupFeatureExtractor.java | 12 +-
.../tika/langdetect/charsoup/CharSoupModel.java | 126 +-
.../tika/langdetect/charsoup/FeatureExtractor.java | 39 +
.../charsoup/ScriptAwareFeatureExtractor.java | 252 +-
.../tika/langdetect/charsoup/ScriptCategory.java | 13 +-
.../charsoup/ShortTextFeatureExtractor.java | 348 +
.../charsoup/langdetect-short-v1-20260310.bin | Bin 0 -> 3999308 bytes
.../langdetect/charsoup/langdetect-v7-20260306.bin | Bin 0 -> 3328628 bytes
.../apache/tika/langdetect/charsoup/langdetect.bin | Bin 1641016 -> 0 bytes
tika-langdetect/tika-langdetect-charsoup/pom.xml | 23 +-
.../charsoup/CharSoupDetectorConfig.java | 120 +
.../charsoup/CharSoupEncodingDetector.java | 208 -
.../charsoup/CharSoupLanguageDetector.java | 599 +-
.../charsoup/CharSoupMetadataFilter.java} | 36 +-
.../tika/langdetect/charsoup/ConfusableGroups.java | 72 +
.../langdetect/charsoup/LanguageConfusables.java | 195 -
.../src/main/python/extract_madlad_to_wiki.py | 182 +
.../tika/langdetect/charsoup/confusables.txt | 37 +
.../charsoup/CharSoupDetectorConfigTest.java | 125 +
.../charsoup/CharSoupFeatureExtractorTest.java | 2 +-
.../charsoup/CharSoupModelRoutingTest.java | 281 +
.../langdetect/charsoup/LangIdRegressionTest.java | 25 +-
.../tika/langdetect/charsoup/LinearModelTest.java | 2 +-
.../charsoup/ScriptAwareFeatureExtractorTest.java | 33 +-
.../langdetect/charsoup/SjisLangSignalTest.java | 253 +
.../langdetect/charsoup/tools/AblationRunner.java | 652 +-
.../charsoup/tools/BucketSaturationAnalyzer.java | 76 +-
.../charsoup/tools/CalibrateConfidence.java | 217 +
.../charsoup/tools/CompareDetectors.java | 1470 +-
.../langdetect/charsoup/tools/ConfusionDump.java | 184 +
.../langdetect/charsoup/tools/CorpusReader.java | 73 +-
.../langdetect/charsoup/tools/CrossDomainEval.java | 309 +-
.../charsoup/tools/DiagnoseUnknownScript.java | 145 +
.../charsoup/tools/KoreanFalsePositives.java | 146 +
.../langdetect/charsoup/tools/ModelQuantizer.java | 28 +-
.../langdetect/charsoup/tools/Phase2SmokeTest.java | 2 +-
.../langdetect/charsoup/tools/Phase2Trainer.java | 102 +
.../langdetect/charsoup/tools/PrepareCorpus.java | 992 +
.../langdetect/charsoup/tools/QuickF1Eval.java | 134 +-
.../charsoup/tools/ResearchFeatureExtractor.java | 457 +
.../charsoup/tools/TrainLanguageModel.java | 982 +-
.../langdetect/charsoup/tools/TrainShortModel.java | 179 +
.../src/test/python/check_script_consistency.py | 253 +
.../src/test/python/clean_madlad.py | 308 +
.../src/test/python/collect_wikipedia.py | 556 +
.../src/test/python/diagnose_kor_eng.py | 256 +
.../src/test}/python/download_madlad.py | 0
.../src/test/python/eval_fasttext.py | 290 +
.../src/test/python/filter_contamination.py | 240 +
.../src/test/python/filter_pashto.py | 66 +-
.../src/test/python/filter_uppercase.py | 130 +
.../src/test/python/summarize_wikipedia.py | 170 +
tika-langdetect/tika-langdetect-optimaize/pom.xml | 2 +-
tika-langdetect/tika-langdetect-tika/pom.xml | 75 -
.../tika/langdetect/tika/LanguageIdentifier.java | 260 -
.../tika/langdetect/tika/LanguageProfile.java | 317 -
.../langdetect/tika/LanguageProfilerBuilder.java | 767 -
.../tika/langdetect/tika/ProfilingWriter.java | 103 -
.../tika/langdetect/tika/TikaLanguageDetector.java | 92 -
...rg.apache.tika.language.detect.LanguageDetector | 16 -
.../org/apache/tika/langdetect/tika/be.ngp | 1014 -
.../org/apache/tika/langdetect/tika/ca.ngp | 1014 -
.../org/apache/tika/langdetect/tika/da.ngp | 1014 -
.../org/apache/tika/langdetect/tika/de.ngp | 1014 -
.../org/apache/tika/langdetect/tika/el.ngp | 1014 -
.../org/apache/tika/langdetect/tika/en.ngp | 1014 -
.../org/apache/tika/langdetect/tika/eo.ngp | 1014 -
.../org/apache/tika/langdetect/tika/es.ngp | 1014 -
.../org/apache/tika/langdetect/tika/et.ngp | 1014 -
.../org/apache/tika/langdetect/tika/fa.ngp | 1015 -
.../org/apache/tika/langdetect/tika/fi.ngp | 1014 -
.../org/apache/tika/langdetect/tika/fr.ngp | 1014 -
.../org/apache/tika/langdetect/tika/gl.ngp | 1014 -
.../org/apache/tika/langdetect/tika/hu.ngp | 1014 -
.../org/apache/tika/langdetect/tika/is.ngp | 1014 -
.../org/apache/tika/langdetect/tika/it.ngp | 1014 -
.../org/apache/tika/langdetect/tika/lt.ngp | 1209 -
.../org/apache/tika/langdetect/tika/nl.ngp | 1014 -
.../org/apache/tika/langdetect/tika/no.ngp | 1014 -
.../org/apache/tika/langdetect/tika/pl.ngp | 1014 -
.../org/apache/tika/langdetect/tika/pt.ngp | 1014 -
.../org/apache/tika/langdetect/tika/ro.ngp | 1014 -
.../org/apache/tika/langdetect/tika/ru.ngp | 1014 -
.../org/apache/tika/langdetect/tika/sk.ngp | 1014 -
.../org/apache/tika/langdetect/tika/sl.ngp | 1014 -
.../org/apache/tika/langdetect/tika/sv.ngp | 1014 -
.../org/apache/tika/langdetect/tika/th.ngp | 1014 -
.../tika/langdetect/tika/tika.language.properties | 56 -
.../org/apache/tika/langdetect/tika/uk.ngp | 1014 -
.../langdetect/tika/LanguageIdentifierTest.java | 185 -
.../tika/langdetect/tika/LanguageProfileTest.java | 58 -
.../tika/LanguageProfilerBuilderTest.java | 96 -
.../tika/langdetect/tika/ProfilingHandler.java | 67 -
.../tika/langdetect/tika/ProfilingWriterTest.java | 45 -
.../org/apache/tika/langdetect/tika/da.test | 108 -
.../org/apache/tika/langdetect/tika/de.test | 104 -
.../org/apache/tika/langdetect/tika/el.test | 109 -
.../org/apache/tika/langdetect/tika/en.test | 105 -
.../org/apache/tika/langdetect/tika/es.test | 107 -
.../org/apache/tika/langdetect/tika/et.test | 17 -
.../org/apache/tika/langdetect/tika/fi.test | 106 -
.../org/apache/tika/langdetect/tika/fr.test | 105 -
.../org/apache/tika/langdetect/tika/it.test | 109 -
.../langdetect/tika/langbuilder/welsh_corpus.txt | 2602 -
.../org/apache/tika/langdetect/tika/lt.test | 32 -
.../org/apache/tika/langdetect/tika/nl.test | 105 -
.../org/apache/tika/langdetect/tika/pt.test | 105 -
.../org/apache/tika/langdetect/tika/sv.test | 108 -
tika-ml/tika-ml-chardetect/README.md | 76 +
tika-ml/tika-ml-chardetect/pom.xml | 16 +
.../tika/ml/chardetect/MlEncodingDetector.java | 201 -
.../tools/BenchmarkCharsetDetectors.java | 12 +-
.../chardetect/tools/BuildCharsetTrainingData.java | 1030 +
.../ConfigurableByteNgramFeatureExtractor.java | 254 +
.../chardetect}/tools/DiagnoseCharsetDetector.java | 32 +-
.../ml/chardetect/tools/EvalCharsetDetectors.java | 424 +
.../ml/chardetect}/tools/TrainCharsetModel.java | 158 +-
.../tika-ml-chardetect/src/test/python/anneal.py | 379 +
.../charsoup/CharSoupFeatureExtractor.java | 456 -
.../charsoup/ScriptAwareFeatureExtractor.java | 399 -
.../tika/langdetect/charsoup/ScriptCategory.java | 117 -
.../tika/langdetect/charsoup/WordTokenizer.java | 225 -
.../main/java/org/apache/tika/ml/LinearModel.java | 32 +-
tika-parent/pom.xml | 155 +-
tika-parsers/pom.xml | 2 +
.../pom.xml | 46 +-
.../java/org/apache/tika/http/TikaHttpClient.java | 150 +
.../org/apache/tika/http/TikaTestHttpServer.java | 268 +
.../org/apache/tika/parser/gdal/GDALParser.java | 6 +-
tika-parsers/tika-parsers-ml/pom.xml | 2 +-
.../tika-parsers-ml/tika-inference/pom.xml | 48 +-
.../tika/inference/AbstractEmbeddingFilter.java | 28 +-
.../org/apache/tika/inference/ChunkSerializer.java | 11 +-
.../tika/inference/ImageEmbeddingConfig.java | 3 +-
.../org/apache/tika/inference/InferenceConfig.java | 23 +-
.../tika/inference/OpenAIEmbeddingFilter.java | 64 +-
.../tika/inference/OpenAIImageEmbeddingParser.java | 97 +-
.../apache/tika/inference/VectorSerializer.java | 22 +-
.../tika/inference/OpenAIEmbeddingFilterTest.java | 70 +-
.../inference/OpenAIImageEmbeddingParserTest.java | 131 +-
.../tika/inference/VectorSerializerTest.java | 13 +
.../tika-parsers-ml/tika-parser-nlp-module/pom.xml | 2 +-
.../apache/tika/parser/ner/NamedEntityParser.java | 2 +
.../src/test/resources/configs/tika-config.json | 2 +-
.../tika-parser-tess4j-module/pom.xml | 2 +-
.../tika/parser/ocr/tess4j/Tess4JParser.java | 6 +-
.../pom.xml | 53 +-
.../apache/tika/parser/vlm/AbstractVLMParser.java | 107 +-
.../apache/tika/parser/vlm/ClaudeVLMParser.java | 21 +-
.../apache/tika/parser/vlm/GeminiVLMParser.java | 14 +-
.../tika/parser/vlm/MarkdownToXHTMLEmitter.java | 0
.../apache/tika/parser/vlm/OpenAIVLMParser.java | 23 +-
.../org/apache/tika/parser/vlm/VLMOCRConfig.java | 17 +
.../tika/parser/vlm/ClaudeVLMParserTest.java | 62 +-
.../tika/parser/vlm/GeminiVLMParserTest.java | 64 +-
.../parser/vlm/MarkdownToXHTMLEmitterTest.java | 0
.../tika/parser/vlm/OpenAIVLMParserTest.java | 69 +-
.../tika-parser-html-module/pom.xml | 17 +-
.../org/apache/tika/parser/html/JSoupParser.java | 7 +-
.../tika/parser/html/HtmlEncodingDetectorTest.java | 7 +-
.../apache/tika/parser/html/HtmlParserTest.java | 36 +-
.../html/StandardHtmlEncodingDetectorTest.java | 10 +-
.../tika-parser-microsoft-module/pom.xml | 11 +-
.../tika/parser/microsoft/OutlookExtractor.java | 7 +-
.../microsoft/POIContainerExtractionTest.java | 4 +-
.../tika-parser-miscoffice-module/pom.xml | 16 +-
.../java/org/apache/tika/parser/dbf/DBFParser.java | 7 +-
.../apache/tika/parser/ocr/TesseractOCRParser.java | 6 +-
.../tika/parser/ocr/TesseractOCRParserTest.java | 4 +-
.../apache/tika/parser/pdf/AbstractPDF2XHTML.java | 7 +
.../java/org/apache/tika/parser/pkg/ZipParser.java | 18 +-
.../tika-parser-text-module/pom.xml | 18 +-
.../apache/tika/parser/strings/StringsParser.java | 11 +-
.../tika/parser/csv/TextAndCSVParserTest.java | 12 +-
.../org/apache/tika/parser/txt/TXTParserTest.java | 103 +-
.../apache/tika/parser/xmp/XMPPacketScanner.java | 7 +-
.../tika/parser/xmp/XmpboxExtractorTest.java | 2 -
.../tika-parsers-standard-package/pom.xml | 42 +-
.../tika/config/TikaEncodingDetectorTest.java | 178 +-
.../apache/tika/parser/AutoDetectParserTest.java | 6 +-
.../tika/parser/microsoft/rtf/RTFParserTest.java | 6 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 35 +-
.../apache/tika/parser/pkg/PackageParserTest.java | 3 +
...IKA-2273-exclude-encoding-detector-default.json | 2 +-
.../TIKA-2273-parameterize-encoding-detector.json | 4 +-
.../TIKA-2485-encoding-detector-mark-limits.json | 13 +-
.../tika/async/cli/FileListPipesIterator.java | 122 +
.../org/apache/tika/async/cli/PluginsWriter.java | 119 +-
.../org/apache/tika/async/cli/TikaAsyncCLI.java | 102 +-
.../apache/tika/async/cli/AsyncCliParserTest.java | 44 +-
.../tika/async/cli/FileListPipesIteratorTest.java | 103 +
.../test/resources/configs/config-template.json | 6 +-
.../org/apache/tika/client/HttpClientFactory.java | 85 +-
tika-pipes/tika-pipes-api/pom.xml | 19 +
.../java/org/apache/tika/pipes/api/ParseMode.java | 3 +
tika-pipes/tika-pipes-config-store-ignite/pom.xml | 68 +-
.../tika/pipes/ignite/ExtensionConfigDTO.java | 29 +-
.../tika/pipes/ignite/IgniteConfigStore.java | 182 +-
.../ignite/config/IgniteConfigStoreConfig.java | 59 +-
.../pipes/ignite/server/IgniteStoreServer.java | 201 +-
.../tika/pipes/ignite/IgniteConfigStoreTest.java | 119 +-
.../org/apache/tika/pipes/core/PipesClient.java | 41 +-
.../org/apache/tika/pipes/core/PipesConfig.java | 15 -
.../tika/pipes/core/async/AsyncProcessor.java | 33 +-
.../tika/pipes/core/config/ConfigMerger.java | 22 +-
.../tika/pipes/core/config/ConfigOverrides.java | 39 +-
.../tika/pipes/core/protocol/PipesMessage.java | 16 +-
.../tika/pipes/core/server/ConnectionHandler.java | 77 +-
.../apache/tika/pipes/core/server/EmitHandler.java | 2 +-
.../core/server/MetadataListAndEmbeddedBytes.java | 5 +-
.../apache/tika/pipes/core/server/PipesServer.java | 44 +-
.../tika/pipes/core/config/ConfigMergerTest.java | 11 +-
.../tika/pipes/core/protocol/PipesMessageTest.java | 2 +-
.../apache/tika/pipes/fork/PipesForkParser.java | 6 +-
.../tika/pipes/fork/PipesForkParserConfig.java | 22 +-
.../tika/pipes/fork/PipesForkParserTest.java | 52 +-
.../filter/AttachmentCountingListFilter.java | 3 +-
.../apache/tika/pipes/core/CrashingDetector.java | 2 +
.../apache/tika/pipes/core/PipesClientTest.java | 30 +-
.../resources/configs/tika-config-bad-class.json | 6 +-
.../configs/tika-config-bad-java-path.json | 6 +-
.../configs/tika-config-bad-jvm-args.json | 6 +-
.../test/resources/configs/tika-config-basic.json | 8 +-
.../configs/tika-config-crashing-detector.json | 8 +-
.../resources/configs/tika-config-emit-all.json | 6 +-
.../resources/configs/tika-config-passback.json | 8 +-
.../configs/tika-config-shared-server.json | 8 +-
.../configs/tika-config-timeout-lt-heartbeat.json | 6 +-
.../resources/configs/tika-config-truncate.json | 4 +-
.../resources/configs/tika-config-uppercasing.json | 8 +-
.../configs/tika-config-write-limiter.json | 4 +-
tika-pipes/tika-pipes-plugins/pom.xml | 1 +
.../fetcher/atlassianjwt/AtlassianJwtFetcher.java | 14 +-
.../atlassianjwt/AtlassianJwtFetcherFactory.java | 4 +-
.../config/AtlassianJwtFetcherConfig.java | 40 +-
.../pom.xml | 27 +-
.../src/main/assembly/assembly.xml | 0
.../org/apache/tika/pipes/emitter/es/ESClient.java | 395 +
.../apache/tika/pipes/emitter/es/ESEmitter.java} | 89 +-
.../tika/pipes/emitter/es/ESEmitterConfig.java | 74 +
.../tika/pipes/emitter/es/ESEmitterFactory.java} | 22 +-
.../tika/pipes/emitter/es/HttpClientConfig.java | 39 +
.../tika/pipes/emitter/es}/JsonResponse.java | 3 +-
.../tika/pipes/plugin/es/ESPipesPlugin.java} | 14 +-
.../tika/pipes/reporter/es/ESPipesReporter.java | 251 +
.../tika/pipes/reporter/es/ESReporterConfig.java} | 17 +-
.../tika/pipes/reporter/es/ESReporterFactory.java} | 24 +-
.../src/main/resources/plugin.properties | 6 +-
.../apache/tika/pipes/emitter/es/ESClientTest.java | 165 +
.../tika-pipes-google-drive/pom.xml | 6 +-
.../tika/pipes/fetcher/http/HttpFetcher.java | 14 +-
.../pipes/fetcher/http/HttpFetcherFactory.java | 4 +-
.../fetcher/http/config/HttpFetcherConfig.java | 40 +-
.../tika/pipes/fetcher/http/HttpFetcherTest.java | 8 +-
.../tika-pipes-microsoft-graph/pom.xml | 4 +-
.../pipes/emitter/opensearch/HttpClientConfig.java | 2 +-
.../pipes/emitter/opensearch/OpenSearchClient.java | 43 +-
.../emitter/opensearch/OpenSearchEmitter.java | 4 +-
.../reporter/opensearch/HttpClientConfig.java | 2 +-
.../opensearch/OpenSearchPipesReporter.java | 4 +-
.../tika/pipes/emitter/solr/SolrEmitter.java | 8 +-
.../tika/pipes/emitter/solr/SolrEmitterConfig.java | 12 +-
.../pipes/iterator/solr/SolrPipesIterator.java | 8 +-
.../iterator/solr/SolrPipesIteratorConfig.java | 20 +-
.../tika/config/loader/ComponentInstantiator.java | 157 +-
.../config/loader/TikaObjectMapperFactory.java | 29 +
.../tika/serialization/ComponentNameResolver.java | 104 +-
.../tika/serialization/JsonMetadataList.java | 2 +-
.../tika/serialization/ParseContextUtils.java | 36 +-
.../org/apache/tika/serialization/TikaModule.java | 238 +-
.../serdes/ParseContextDeserializer.java | 47 +-
.../serdes/ParseContextSerializer.java | 42 +-
.../java/org/apache/tika/config/AllLimitsTest.java | 11 +-
.../org/apache/tika/config/TimeoutLimitsTest.java | 55 +-
.../tika/config/loader/ConfigLoaderTest.java | 24 +-
.../filter/AttachmentCountingListFilter.java | 3 +-
.../CustomClassSerializationTest.java | 2 +-
.../serialization/RoundTripSerializationTest.java | 38 +-
.../TestParseContextSerialization.java | 49 +-
.../test/resources/configs/all-limits-test.json | 3 +-
.../test/resources/configs/test-config-loader.json | 2 +-
.../resources/configs/test-partial-config.json | 2 +-
.../resources/configs/timeout-limits-test.json | 3 +-
tika-server/tika-server-core/pom.xml | 2 +-
.../apache/tika/server/core/TikaServerProcess.java | 33 +-
.../server/core/resource/LanguageResource.java | 24 +-
.../tika/server/core/resource/PipesResource.java | 4 +-
.../server/core/resource/TranslateResource.java | 4 +-
.../org/apache/tika/server/core/CXFTestBase.java | 8 +-
.../tika/server/core/LanguageResourceTest.java | 10 +-
.../core/TikaServerPipesIntegrationTest.java | 2 +-
.../resources/configs/cxf-test-base-template.json | 8 +-
.../configs/cxf-unpack-test-template.json | 4 +-
.../configs/tika-config-server-basic.json | 6 +-
.../configs/tika-config-server-emitter.json | 6 +-
.../tika-config-server-fetcher-template.json | 6 +-
.../tika-config-server-fetchers-emitters.json | 6 +-
.../configs/tika-config-server-pipes-basic.json | 6 +-
.../tika-config-server-tls-one-way-template.json | 6 +-
.../tika-config-server-tls-two-way-template.json | 6 +-
.../resources/configs/tika-config-server-tls.json | 6 +-
.../test/resources/configs/tika-config-server.json | 6 +-
.../configs/tika-config-timeout-100ms.json | 4 +-
.../configs/tika-config-with-timeout.json | 4 +-
.../src/test/resources/test-documents/english.txt | 2 +-
tika-server/tika-server-standard/pom.xml | 18 +
.../resources/configs/cxf-test-base-template.json | 8 +-
.../configs/tika-config-for-server-tests.json | 6 +-
.../test/resources/configs/tika-config-json.json | 6 +-
.../tika-config-langdetect-opennlp-filter.json | 6 +-
.../tika-config-langdetect-optimaize-filter.json | 6 +-
tika-translate/pom.xml | 4 +-
.../translate/impl/AbstractTranslator.java | 24 +-
.../translate/impl/JoshuaNetworkTranslator.java | 2 +-
754 files changed, 3495444 insertions(+), 3453047 deletions(-)
delete mode 100644 .java-version
create mode 100644
docs/modules/ROOT/pages/advanced/charset-detection-design.adoc
create mode 100644
docs/modules/ROOT/pages/advanced/charsoup-supported-languages.adoc
create mode 100644
docs/modules/ROOT/pages/advanced/lang-detection/flores-AUTOMATIC.log
create mode 100644
docs/modules/ROOT/pages/advanced/lang-detection/flores-SHORT_TEXT.log
create mode 100644
docs/modules/ROOT/pages/advanced/lang-detection/flores-STANDARD.log
create mode 100644
docs/modules/ROOT/pages/advanced/lang-detection/flores200-dev-eval.md
create mode 100644
docs/modules/ROOT/pages/advanced/lang-detection/language-drop-decisions.md
create mode 100644
docs/modules/ROOT/pages/advanced/lang-detection/short-text-language-decisions.md
create mode 100644
docs/modules/ROOT/pages/advanced/lang-detection/supported-languages.md
create mode 100644 docs/modules/ROOT/pages/pipes/timeouts.adoc
delete mode 100755 run-lang-train.sh
delete mode 100644 tika-charset-detectors/pom.xml
delete mode 100644
tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/encoding/BomEncodingDetector.java
delete mode 100644
tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/encoding/ByteNgramFeatureExtractor.java
delete mode 100644
tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/encoding/FeatureExtractor.java
delete mode 100644
tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/encoding/HttpHeaderEncodingDetector.java
delete mode 100644
tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/encoding/LinearModel.java
delete mode 100644
tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/encoding/MlEncodingDetector.java
delete mode 100644
tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/encoding/Prediction.java
delete mode 100644
tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/html/StandardHtmlEncodingDetector.java
delete mode 100644
tika-charset-detectors/tika-charset-detectors-core/src/test/java/org/apache/tika/detect/encoding/ByteNgramFeatureExtractorTest.java
delete mode 100644
tika-charset-detectors/tika-charset-detectors-core/src/test/java/org/apache/tika/detect/encoding/CharsetConfusablesTest.java
delete mode 100644 tika-charset-detectors/tika-charset-detectors-icu4j/pom.xml
delete mode 100644
tika-charset-detectors/tika-charset-detectors-tools/src/main/java/org/apache/tika/detect/encoding/tools/EvalCharsetDetectors.java
delete mode 100644
tika-charset-detectors/tika-charset-detectors-tools/src/main/python/build_charset_training.py
create mode 100644
tika-core/src/main/java/org/apache/tika/config/TikaProgressTracker.java
delete mode 100644
tika-core/src/main/java/org/apache/tika/config/TikaTaskTimeout.java
rename
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt
=> tika-core/src/main/java/org/apache/tika/detect}/BOMDetector.java (69%)
create mode 100644
tika-core/src/main/java/org/apache/tika/detect/EncodingResult.java
create mode 100644
tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java
rename {tika-charset-detectors/tika-charset-detectors-core =>
tika-core}/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
(60%)
create mode 100644
tika-core/src/test/java/org/apache/tika/config/TikaProgressTrackerTest.java
rename
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt
=> tika-core/src/test/java/org/apache/tika/detect}/BOMDetectorTest.java (91%)
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/customocr/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/customocr/tika-config-inline.json
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/customocr/tika-config-inline.xml
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/customocr/tika-config-rendered.json
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/customocr/tika-config-rendered.xml
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/grobid/org/apache/tika/parser/journal/GrobidExtractor.properties
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/grobid/tika-config.json
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/grobid/tika-config.xml
delete mode 100644 tika-e2e-tests/tika-grpc/sample-configs/ignite/README.md
delete mode 100755
tika-e2e-tests/tika-grpc/sample-configs/ner/run_tika_server.sh
delete mode 100644 tika-e2e-tests/tika-grpc/sample-configs/ner/tika-config.json
delete mode 100644 tika-e2e-tests/tika-grpc/sample-configs/ner/tika-config.xml
delete mode 100644 tika-e2e-tests/tika-grpc/sample-configs/test-simple.json
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/vision/inception-rest-caption.json
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/vision/inception-rest-caption.xml
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/vision/inception-rest-video.json
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/vision/inception-rest-video.xml
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/vision/inception-rest.json
delete mode 100644
tika-e2e-tests/tika-grpc/sample-configs/vision/inception-rest.xml
delete mode 100644
tika-e2e-tests/tika-grpc/src/test/java/org/apache/tika/pipes/ignite/README.md
delete mode 100644
tika-e2e-tests/tika-grpc/src/test/resources/docker-compose-ignite.yml
delete mode 100644
tika-e2e-tests/tika-grpc/src/test/resources/docker-compose.yml
delete mode 100644 tika-e2e-tests/tika-grpc/src/test/resources/log4j2.xml
create mode 100644
tika-e2e-tests/tika-grpc/src/test/resources/test-fixtures/sample.csv
create mode 100644
tika-e2e-tests/tika-grpc/src/test/resources/test-fixtures/sample.html
create mode 100644
tika-e2e-tests/tika-grpc/src/test/resources/test-fixtures/sample.txt
create mode 100644
tika-e2e-tests/tika-grpc/src/test/resources/test-fixtures/sample.xml
copy tika-e2e-tests/tika-grpc/src/test/resources/{tika-config-ignite.json =>
tika-config-ignite-local.json} (90%)
copy {tika-bundles => tika-encoding-detectors}/pom.xml (60%)
rename {tika-charset-detectors/tika-charset-detectors-tools =>
tika-encoding-detectors/tika-encoding-detector-charsoup}/pom.xml (60%)
create mode 100644
tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
copy
tika-annotation-processor/src/main/resources/META-INF/services/javax.annotation.processing.Processor
=>
tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
(64%)
rename {tika-langdetect/tika-langdetect-charsoup =>
tika-encoding-detectors/tika-encoding-detector-charsoup}/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java
(79%)
rename {tika-langdetect/tika-langdetect-charsoup =>
tika-encoding-detectors/tika-encoding-detector-charsoup}/src/test/java/org/apache/tika/langdetect/charsoup/TextQualityDiagTest.java
(100%)
rename {tika-charset-detectors/tika-charset-detectors-core =>
tika-encoding-detectors/tika-encoding-detector-html}/pom.xml (65%)
rename
{tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect
=>
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser}/html/HtmlEncodingDetector.java
(92%)
rename
{tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/html
=>
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector}/CharsetAliases.java
(94%)
rename
{tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/html
=>
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector}/CharsetDetectionResult.java
(97%)
rename
{tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/html
=>
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector}/MetaProcessor.java
(95%)
rename
{tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/html
=>
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector}/PreScanner.java
(93%)
create mode 100644
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java
rename
{tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/html
=>
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector}/charsets/ReplacementCharset.java
(97%)
rename
{tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/html
=>
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector}/charsets/XUserDefinedCharset.java
(97%)
copy
tika-core/src/main/resources/META-INF/services/org.apache.tika.metadata.filter.MetadataFilter
=>
tika-encoding-detectors/tika-encoding-detector-html/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
(91%)
copy
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
=> tika-encoding-detectors/tika-encoding-detector-icu4j}/pom.xml (62%)
rename
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
=>
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
(100%)
rename
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
=>
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
(100%)
rename
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
=>
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
(100%)
rename
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
=>
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
(100%)
rename
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
=>
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
(100%)
rename
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
=>
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
(100%)
rename
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
=>
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
(100%)
rename
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
=>
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
(100%)
rename
{tika-charset-detectors/tika-charset-detectors-icu4j/src/main/java/org/apache/tika/detect/encoding
=>
tika-encoding-detectors/tika-encoding-detector-icu4j/src/main/java/org/apache/tika/parser/txt}/Icu4jEncodingDetector.java
(77%)
rename
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
=>
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
(98%)
copy
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
=>
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/test/resources/configs/tika-config-ignore-charset.json
(100%)
copy
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
=>
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/test/resources/test-documents/multi-language.txt
(100%)
copy
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
=>
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/test/resources/test-documents/resume.html
(100%)
copy
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
=>
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/test/resources/test-documents/testIgnoreCharset.txt
(100%)
copy
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
=>
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/test/resources/test-documents/testTXT_win-1252.txt
(100%)
copy
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
=>
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/test/resources/test-documents/test_ignore_IBM420.html
(100%)
copy {tika-ml/tika-ml-chardetect =>
tika-encoding-detectors/tika-encoding-detector-mojibuster}/pom.xml (70%)
create mode 100644
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractor.java
rename
{tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/encoding
=>
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect}/CharsetConfusables.java
(69%)
create mode 100644
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkEncodingRules.java
create mode 100644
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
rename
{tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/encoding
=>
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect}/StructuralEncodingRules.java
(62%)
copy
tika-app/src/main/resources/META-INF/services/org.apache.tika.parser.Parser =>
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
(71%)
create mode 100644
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect.bin
create mode 100644
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/EbcdicRoutingTest.java
create mode 100644
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java
rename {tika-charset-detectors/tika-charset-detectors-universal =>
tika-encoding-detectors/tika-encoding-detector-universal}/pom.xml (64%)
rename
{tika-charset-detectors/tika-charset-detectors-universal/src/main/java/org/apache/tika/detect/encoding
=>
tika-encoding-detectors/tika-encoding-detector-universal/src/main/java/org/apache/tika/parser/txt}/UniversalEncodingDetector.java
(53%)
rename
{tika-charset-detectors/tika-charset-detectors-universal/src/main/java/org/apache/tika/detect/encoding
=>
tika-encoding-detectors/tika-encoding-detector-universal/src/main/java/org/apache/tika/parser/txt}/UniversalEncodingListener.java
(99%)
copy {tika-pipes/tika-pipes-fork-parser =>
tika-eval/tika-eval-app}/src/main/assembly/assembly.xml (92%)
create mode 100644
tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/MarkdownSummaryWriter.java
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/ace
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/aka
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/alt
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/ami
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/arz
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/ast
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/ava
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/avk
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/azb
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/bam
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/be-x-old
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/bos
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/bpy
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/bua
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/bxr
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/cdo-x-rom
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/cnh
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/cor
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/dag
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/eml
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/gla
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/hak-x-rom
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/hat
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/hbs
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/hif
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/hil
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/hyw
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/jbo
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/kaa
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/kab
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/kal
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/kha
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/khk
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/khm
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/koi
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/kom
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/kpv
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/krc
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/lad
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/lez
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/lfn
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/lmo
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/lup
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/mai
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/mri
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/mya
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/nan
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/nap
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/nav
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/ndo
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/new
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/nqo
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/nya
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/olo
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/pms
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/prs
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/que
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/run
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/sat
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/scn
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/sco
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/skr
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/smi
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/smn
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/smo
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/sot
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/srd
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/ssw
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/stq
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/szy
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/tay
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/tet
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/tir
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/trv
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/tum
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/uzn
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/ven
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/vep
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/wuu
create mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/yue
delete mode 100644
tika-eval/tika-eval-core/src/main/resources/common_tokens/zea
copy tika-integration-tests/{tika-pipes-opensearch-integration-tests =>
tika-pipes-es-integration-tests}/pom.xml (85%)
create mode 100644
tika-integration-tests/tika-pipes-es-integration-tests/src/test/java/org/apache/tika/pipes/elasticsearch/tests/ElasticsearchTest.java
copy
tika-integration-tests/{tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpensearchTestClient.java
=>
tika-pipes-es-integration-tests/src/test/java/org/apache/tika/pipes/elasticsearch/tests/ElasticsearchTestClient.java}
(72%)
copy
tika-integration-tests/{tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/opensearch-mappings.json
=>
tika-pipes-es-integration-tests/src/test/resources/elasticsearch/elasticsearch-mappings.json}
(100%)
copy
tika-integration-tests/{tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/opensearch-parent-child-mappings.json
=>
tika-pipes-es-integration-tests/src/test/resources/elasticsearch/elasticsearch-parent-child-mappings.json}
(100%)
create mode 100644
tika-integration-tests/tika-pipes-es-integration-tests/src/test/resources/elasticsearch/elasticsearch-vector-mappings.json
copy
tika-integration-tests/{tika-pipes-opensearch-integration-tests/src/test/resources/opensearch
=>
tika-pipes-es-integration-tests/src/test/resources/elasticsearch}/plugins-template.json
(78%)
copy tika-integration-tests/{tika-pipes-opensearch-integration-tests =>
tika-pipes-es-integration-tests}/src/test/resources/pipes-fork-server-custom-log4j2.xml
(100%)
copy tika-integration-tests/{tika-pipes-opensearch-integration-tests =>
tika-pipes-es-integration-tests}/src/test/resources/test-documents/fake_oom.xml
(100%)
copy tika-integration-tests/{tika-pipes-opensearch-integration-tests =>
tika-pipes-es-integration-tests}/src/test/resources/test-documents/npe.xml
(100%)
copy tika-integration-tests/{tika-pipes-opensearch-integration-tests =>
tika-pipes-es-integration-tests}/src/test/resources/test-documents/oom.xml
(100%)
copy {tika-app/src/test/resources/test-data =>
tika-integration-tests/tika-pipes-es-integration-tests/src/test/resources/test-documents}/test_recursive_embedded.docx
(100%)
create mode 100644
tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ShortTextFeatureExtractor.java
create mode 100644
tika-langdetect/tika-langdetect-charsoup-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect-short-v1-20260310.bin
create mode 100644
tika-langdetect/tika-langdetect-charsoup-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect-v7-20260306.bin
delete mode 100644
tika-langdetect/tika-langdetect-charsoup-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect.bin
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupDetectorConfig.java
delete mode 100644
tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
copy
tika-langdetect/{tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/metadatafilter/OptimaizeMetadataFilter.java
=>
tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupMetadataFilter.java}
(59%)
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/ConfusableGroups.java
delete mode 100644
tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/LanguageConfusables.java
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/main/python/extract_madlad_to_wiki.py
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/main/resources/org/apache/tika/langdetect/charsoup/confusables.txt
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupDetectorConfigTest.java
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupModelRoutingTest.java
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/SjisLangSignalTest.java
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CalibrateConfidence.java
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/ConfusionDump.java
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/DiagnoseUnknownScript.java
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/KoreanFalsePositives.java
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/PrepareCorpus.java
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/ResearchFeatureExtractor.java
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/TrainShortModel.java
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/python/check_script_consistency.py
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/python/clean_madlad.py
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/python/collect_wikipedia.py
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/python/diagnose_kor_eng.py
rename {tika-charset-detectors/tika-charset-detectors-tools/src/main =>
tika-langdetect/tika-langdetect-charsoup/src/test}/python/download_madlad.py
(100%)
create mode 100755
tika-langdetect/tika-langdetect-charsoup/src/test/python/eval_fasttext.py
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/python/filter_contamination.py
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/python/filter_uppercase.py
create mode 100644
tika-langdetect/tika-langdetect-charsoup/src/test/python/summarize_wikipedia.py
delete mode 100644 tika-langdetect/tika-langdetect-tika/pom.xml
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/LanguageIdentifier.java
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/LanguageProfile.java
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/LanguageProfilerBuilder.java
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/ProfilingWriter.java
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/TikaLanguageDetector.java
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/META-INF/services/org.apache.tika.language.detect.LanguageDetector
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/be.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/ca.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/da.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/de.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/el.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/en.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/eo.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/es.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/et.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/fa.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/fi.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/fr.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/gl.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/hu.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/is.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/it.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/lt.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/nl.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/no.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/pl.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/pt.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/ro.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/ru.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/sk.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/sl.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/sv.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/th.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/tika.language.properties
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/uk.ngp
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/LanguageIdentifierTest.java
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/LanguageProfileTest.java
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/LanguageProfilerBuilderTest.java
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/ProfilingHandler.java
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/ProfilingWriterTest.java
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/da.test
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/de.test
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/el.test
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/en.test
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/es.test
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/et.test
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/fi.test
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/fr.test
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/it.test
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/langbuilder/welsh_corpus.txt
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/lt.test
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/nl.test
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/pt.test
delete mode 100644
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/sv.test
create mode 100644 tika-ml/tika-ml-chardetect/README.md
delete mode 100644
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/MlEncodingDetector.java
rename
{tika-charset-detectors/tika-charset-detectors-tools/src/main/java/org/apache/tika/detect/encoding
=>
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect}/tools/BenchmarkCharsetDetectors.java
(94%)
create mode 100644
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BuildCharsetTrainingData.java
create mode 100644
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java
rename
{tika-charset-detectors/tika-charset-detectors-tools/src/main/java/org/apache/tika/detect/encoding
=>
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect}/tools/DiagnoseCharsetDetector.java
(87%)
create mode 100644
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/EvalCharsetDetectors.java
rename
{tika-charset-detectors/tika-charset-detectors-tools/src/main/java/org/apache/tika/detect/encoding
=>
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect}/tools/TrainCharsetModel.java
(65%)
create mode 100644 tika-ml/tika-ml-chardetect/src/test/python/anneal.py
delete mode 100644
tika-ml/tika-ml-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java
delete mode 100644
tika-ml/tika-ml-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java
delete mode 100644
tika-ml/tika-ml-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptCategory.java
delete mode 100644
tika-ml/tika-ml-core/src/main/java/org/apache/tika/langdetect/charsoup/WordTokenizer.java
copy tika-parsers/{tika-parsers-ml/tika-parser-tess4j-module =>
tika-http-jdk}/pom.xml (71%)
create mode 100644
tika-parsers/tika-http-jdk/src/main/java/org/apache/tika/http/TikaHttpClient.java
create mode 100644
tika-parsers/tika-http-jdk/src/test/java/org/apache/tika/http/TikaTestHttpServer.java
rename tika-parsers/tika-parsers-ml/{tika-parser-vlm-ocr-module =>
tika-vlm}/pom.xml (67%)
rename tika-parsers/tika-parsers-ml/{tika-parser-vlm-ocr-module =>
tika-vlm}/src/main/java/org/apache/tika/parser/vlm/AbstractVLMParser.java (80%)
rename tika-parsers/tika-parsers-ml/{tika-parser-vlm-ocr-module =>
tika-vlm}/src/main/java/org/apache/tika/parser/vlm/ClaudeVLMParser.java (92%)
rename tika-parsers/tika-parsers-ml/{tika-parser-vlm-ocr-module =>
tika-vlm}/src/main/java/org/apache/tika/parser/vlm/GeminiVLMParser.java (94%)
rename tika-parsers/tika-parsers-ml/{tika-parser-vlm-ocr-module =>
tika-vlm}/src/main/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitter.java
(100%)
rename tika-parsers/tika-parsers-ml/{tika-parser-vlm-ocr-module =>
tika-vlm}/src/main/java/org/apache/tika/parser/vlm/OpenAIVLMParser.java (93%)
rename tika-parsers/tika-parsers-ml/{tika-parser-vlm-ocr-module =>
tika-vlm}/src/main/java/org/apache/tika/parser/vlm/VLMOCRConfig.java (94%)
rename tika-parsers/tika-parsers-ml/{tika-parser-vlm-ocr-module =>
tika-vlm}/src/test/java/org/apache/tika/parser/vlm/ClaudeVLMParserTest.java
(82%)
rename tika-parsers/tika-parsers-ml/{tika-parser-vlm-ocr-module =>
tika-vlm}/src/test/java/org/apache/tika/parser/vlm/GeminiVLMParserTest.java
(81%)
rename tika-parsers/tika-parsers-ml/{tika-parser-vlm-ocr-module =>
tika-vlm}/src/test/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitterTest.java
(100%)
rename tika-parsers/tika-parsers-ml/{tika-parser-vlm-ocr-module =>
tika-vlm}/src/test/java/org/apache/tika/parser/vlm/OpenAIVLMParserTest.java
(82%)
create mode 100644
tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/FileListPipesIterator.java
create mode 100644
tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/FileListPipesIteratorTest.java
copy tika-pipes/tika-pipes-plugins/{tika-pipes-opensearch =>
tika-pipes-es}/pom.xml (87%)
copy tika-pipes/tika-pipes-plugins/{tika-pipes-atlassian-jwt =>
tika-pipes-es}/src/main/assembly/assembly.xml (100%)
create mode 100644
tika-pipes/tika-pipes-plugins/tika-pipes-es/src/main/java/org/apache/tika/pipes/emitter/es/ESClient.java
copy
tika-pipes/tika-pipes-plugins/{tika-pipes-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java
=>
tika-pipes-es/src/main/java/org/apache/tika/pipes/emitter/es/ESEmitter.java}
(52%)
create mode 100644
tika-pipes/tika-pipes-plugins/tika-pipes-es/src/main/java/org/apache/tika/pipes/emitter/es/ESEmitterConfig.java
copy
tika-pipes/tika-pipes-plugins/{tika-pipes-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitterFactory.java
=>
tika-pipes-es/src/main/java/org/apache/tika/pipes/emitter/es/ESEmitterFactory.java}
(74%)
create mode 100644
tika-pipes/tika-pipes-plugins/tika-pipes-es/src/main/java/org/apache/tika/pipes/emitter/es/HttpClientConfig.java
copy
tika-pipes/tika-pipes-plugins/{tika-pipes-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch
=>
tika-pipes-es/src/main/java/org/apache/tika/pipes/emitter/es}/JsonResponse.java
(96%)
copy
tika-pipes/tika-pipes-plugins/{tika-pipes-s3/src/main/java/org/apache/tika/pipes/plugin/s3/S3PipesPlugin.java
=>
tika-pipes-es/src/main/java/org/apache/tika/pipes/plugin/es/ESPipesPlugin.java}
(77%)
create mode 100644
tika-pipes/tika-pipes-plugins/tika-pipes-es/src/main/java/org/apache/tika/pipes/reporter/es/ESPipesReporter.java
copy
tika-pipes/tika-pipes-plugins/{tika-pipes-opensearch/src/main/java/org/apache/tika/pipes/reporter/opensearch/OpenSearchReporterConfig.java
=>
tika-pipes-es/src/main/java/org/apache/tika/pipes/reporter/es/ESReporterConfig.java}
(67%)
copy
tika-pipes/tika-pipes-plugins/{tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/reporter/jdbc/JDBCPipesReporterFactory.java
=>
tika-pipes-es/src/main/java/org/apache/tika/pipes/reporter/es/ESReporterFactory.java}
(68%)
copy tika-pipes/tika-pipes-plugins/{tika-pipes-az-blob =>
tika-pipes-es}/src/main/resources/plugin.properties (83%)
create mode 100644
tika-pipes/tika-pipes-plugins/tika-pipes-es/src/test/java/org/apache/tika/pipes/emitter/es/ESClientTest.java