This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-4662
in repository https://gitbox.apache.org/repos/asf/tika.git


    from ae04d333f6 TIKA-4662 -- update common tokens and rebuild model
     add 0e53b3cff8 TIKA-4662 -- checkpoint
     add eaa968cd29 TIKA-4662 -- checkpoint
     add 7fded913f0 TIKA-4662 -- checkpoint
     add c4b596f453 TIKA-4662 -- checkpoint
     add f9475801d0 TIKA-4662 -- checkpoint
     add feff1e1639 TIKA-4662 -- checkpoint
     add 28a1e110b1 resolve stash pop conflicts
     add d4d441e783 Bump org.springframework:spring-context from 7.0.4 to 7.0.5 
(#2643)
     add e627b4b3e6 Bump org.apache.maven.plugins:maven-surefire-plugin from 
3.5.4 to 3.5.5 (#2644)
     add bf4b55424f Bump com.nimbusds:nimbus-jose-jwt from 10.7 to 10.8 (#2640)
     add 7568faf8b8 Bump com.fasterxml.jackson:jackson-bom from 2.21.0 to 
2.21.1 (#2631)
     add 7a43f5ce20 Bump twelvemonkeys.version from 3.13.0 to 3.13.1 (#2630)
     add 7616e9ebd8 Bump software.amazon.awssdk:bom from 2.41.29 to 2.41.34 
(#2629)
     add 2e33ce5214 Bump org.jetbrains:annotations from 26.0.2-1 to 26.1.0 
(#2635)
     add 4db18e5341 Bump org.apache.kafka:kafka-clients from 4.1.1 to 4.2.0 
(#2636)
     add e1d3bfe5c2 Bump net.sourceforge.tess4j:tess4j from 5.16.0 to 5.18.0 
(#2628)
     add c6cd5a71bf Bump google-auth-library-oauth2-http.version from 1.42.1 to 
1.43.0 (#2627)
     add 372d74f646 Bump com.googlecode.plist:dd-plist from 1.28 to 1.29 (#2625)
     add 9fb60a94b8 Bump com.mchange:mchange-commons-java from 0.3.2 to 0.4.0 
(#2626)
     add 1c62c0ab3d Bump org.apache.maven.plugins:maven-failsafe-plugin from 
3.5.4 to 3.5.5 (#2641)
     add 3f26230c4a Bump org.jetbrains.kotlin:kotlin-stdlib-jdk8 from 1.9.10 to 
2.3.10 (#2632)
     add 1f0ca1b9c6 Bump org.jetbrains.kotlin:kotlin-stdlib from 1.9.10 to 
2.3.10 (#2633)
     add 24daa28bc0 Bump org.codehaus.mojo:flatten-maven-plugin from 1.6.0 to 
1.7.3 (#2642)
     add acc65b8683 Bump commonmark.version from 0.24.0 to 0.27.1 (#2638)
     add bab4893999 Bump org.jetbrains.kotlin:kotlin-stdlib-common from 1.9.10 
to 2.0.21 (#2637)
     add ac1836b5ef Bump org.jetbrains.kotlin:kotlin-stdlib-jdk7 from 1.9.10 to 
2.3.10 (#2639)
     add 2a5bb03109 upgrade okhttp (#2646)
     add 52fc58d6f8 TIKA-4327: update aws
     add e7238901df TIKA-4675 -- improve wide unicode detection (#2647)
     add b9b6efae2b TIKA-4327: update c3p0
     add 608019c068 TIKA-4327: update aws
     add 3b53d0d1dd TIKA-4674 - progress timeout (#2650)
     add fdeb82f179 simplify serialization, take 2 (#2651)
     add 8c0329132a TIKA-4676 -- refactor inference and fix endian bug in 
ESEmitter (#2653)
     add 8d8f3440f7 TIKA-4327: update aws, jwarc
     add e5151b1e5b TIKA-4327: update google-api
     add 2fd8c0eda2 TIKA-4327: update microsoft-graph.version, maven.bundle, 
aws, google cloud, junrar, mockito, error_prone_annotations
     add 9451da2d56 TIKA-4606: Upgrade Apache Ignite from 2.x to 3.x (fresh) 
(#2654)
     add 2a9957a12b Bump org.tukaani:xz from 1.11 to 1.12 (#2670)
     add 0385b58466 Bump io.swagger.core.v3:swagger-annotations from 2.2.38 to 
2.2.43 (#2669)
     add 1d46c8b97f Bump org.jetbrains.kotlin:kotlin-stdlib from 2.2.0 to 
2.3.10 (#2663)
     add aaef3ca7a3 Bump info.picocli:picocli from 4.7.5 to 4.7.7 (#2661)
     add b9903d0840 Bump org.jetbrains:annotations from 26.0.2-1 to 26.1.0 
(#2659)
     add 4ba11a4e19 Bump org.yaml:snakeyaml from 2.4 to 2.6 (#2671)
     add 4c9017fca3 Bump jakarta.inject:jakarta.inject-api from 2.0.1 to 
2.0.1.MR (#2667)
     add 93b5cfa96f TIKA-4488: update micronaut
     add d139bfe02e TIKA-4488: add micronaut version
     add a7116b05d9 TIKA-4488: add micronaut version
     add 27933e64b9 TIKA-4327: add comment
     add 9f94799669 TIKA-4327: update tyrus, kiota, solrj, spotless-maven-plugin
     add 30e46db4fa TIKA-4606: Add e2e tests for Ignite 3.x upgrade (#2655)
     add ca67465e90 TIKA-4327: update aws, swagger, jackrabbit; add comment on 
solrj 10 migration
     add fdac94fc18 TIKA-4682 4x tweaks (#2674)
     add 599a0427a4 TIKA-4327: update aws, zookeeper, shade plugin, azure
     add b70d2ba1b0 TIKA-4327: remove dependency that is in parent
     add c0e0d8fccc TIKA-4327: remove dependency that is in parent
     add 96002a73d4 add md summary and other cli improvements (#2676)
     add ad61f26e75 TIKA-4672 - add an Elasticsearch emitter (#2622)
     add 9ac817e1c0 clean up bom pom
     add bb5a3f660e maybe fix flaky solr tests (#2678)
     add 9627b99958 TIKA-4685 chardet (#2677)
     add 364e3200e0 TIKA-4327: update aws
     add 2610663401 fix tests, revert errant .local-repo setting (#2680)
     add b3023c47bc TIKA-4685 - add annotation processor for jdk >23 (#2679)
     add 9c69600a72 Bump joda-time:joda-time from 2.14.0 to 2.14.1 (#2689)
     add 2f8b753d55 Bump org.apache.maven.plugins:maven-compiler-plugin (#2690)
     add 2f8c70ad2d Bump junit6.version from 6.1.0-M1 to 6.0.3 (#2688)
     add 9846674af0 Bump org.awaitility:awaitility from 4.2.0 to 4.3.0 (#2687)
     add c002b1b9d4 Bump org.apache.maven.plugins:maven-surefire-plugin from 
3.5.2 to 3.5.5 (#2686)
     add dd4a7e0979 Bump commons-logging:commons-logging from 1.3.5 to 1.3.6 
(#2685)
     add adf6269947 Bump org.apache.maven:maven-model from 3.9.12 to 3.9.13 
(#2684)
     add 28f276cccb Bump org.projectlombok:lombok from 1.18.32 to 1.18.42 
(#2682)
     add ea31f4d44b Bump org.slf4j:slf4j-api from 2.0.16 to 2.0.17 (#2681)
     add fe38956b28 Merge main into TIKA-4662
     add 09dd09ac5e fix: remove duplicate tika-ml and stale 
tika-charset-detectors from root pom.xml
     add 7e6844042c fix: restore tika-encoding-detectors files from main 
(correct package declarations)
     add 0cec71b082 TIKA-4662 -- checkpoint
     add 24c1afb8c7 TIKA-4662 -- checkpoint
     add f1994788a5 TIKA-4662 -- checkpoint
     add bf150690bc TIKA-4327: update pdfbox
     add 62158b7e16 TIKA-4614: activate disable part now that pdfbox 3.0.7 has 
been released
     add ccf1aee21c TIKA-4614: fix javadoc
     add b08ba012cf Merge branch 'main' into TIKA-4662

No new revisions were added by this update.

Summary of changes:
 .github/workflows/main-jdk17-build.yml             |    19 +
 .../main-jdk17-windows-build-multi-locale.yml      |     2 +-
 .github/workflows/main-jdk17-windows-build.yml     |     2 +-
 .gitignore                                         |     4 +-
 .java-version                                      |    18 -
 .mvn/maven.config                                  |     1 +
 docs/modules/ROOT/nav.adoc                         |     2 +
 .../pages/advanced/charset-detection-design.adoc   |   456 +
 .../advanced/charsoup-supported-languages.adoc     |   175 +
 .../advanced/lang-detection/flores-AUTOMATIC.log   |   521 +
 .../advanced/lang-detection/flores-SHORT_TEXT.log  |   399 +
 .../advanced/lang-detection/flores-STANDARD.log    |   473 +
 .../advanced/lang-detection/flores200-dev-eval.md  |   187 +
 .../lang-detection/language-drop-decisions.md      |   158 +
 .../short-text-language-decisions.md               |   367 +
 .../advanced/lang-detection/supported-languages.md |   223 +
 .../pages/advanced/language-detection-build.adoc   |   552 +-
 .../ROOT/pages/advanced/language-detection.adoc    |   264 +-
 docs/modules/ROOT/pages/pipes/index.adoc           |   101 +-
 .../ROOT/pages/pipes/shared-server-mode.adoc       |    10 +-
 docs/modules/ROOT/pages/pipes/timeouts.adoc        |   170 +
 docs/pom.xml                                       |    14 +
 pom.xml                                            |    10 +-
 run-lang-train.sh                                  |    30 -
 tika-app/pom.xml                                   |    14 +-
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |    10 +-
 .../test/resources/configs/config-template.json    |     6 +-
 tika-bom/pom.xml                                   |    30 -
 tika-charset-detectors/pom.xml                     |    43 -
 .../tika/detect/encoding/BomEncodingDetector.java  |   101 -
 .../detect/encoding/ByteNgramFeatureExtractor.java |   155 -
 .../tika/detect/encoding/FeatureExtractor.java     |    40 -
 .../encoding/HttpHeaderEncodingDetector.java       |    83 -
 .../apache/tika/detect/encoding/LinearModel.java   |   372 -
 .../tika/detect/encoding/MlEncodingDetector.java   |   377 -
 .../apache/tika/detect/encoding/Prediction.java    |    94 -
 .../detect/html/StandardHtmlEncodingDetector.java  |   113 -
 .../encoding/ByteNgramFeatureExtractorTest.java    |   120 -
 .../detect/encoding/CharsetConfusablesTest.java    |   180 -
 .../tika-charset-detectors-icu4j/pom.xml           |    58 -
 .../encoding/tools/EvalCharsetDetectors.java       |   326 -
 .../src/main/python/build_charset_training.py      |   562 -
 .../apache/tika/config/TikaProgressTracker.java    |    82 +
 .../org/apache/tika/config/TikaTaskTimeout.java    |    76 -
 .../java/org/apache/tika/config/TimeoutLimits.java |   112 +-
 .../org/apache/tika/detect/AutoDetectReader.java   |     8 +-
 .../java/org/apache/tika/detect}/BOMDetector.java  |    35 +-
 .../tika/detect/CompositeEncodingDetector.java     |   109 +-
 .../tika/detect/DefaultEncodingDetector.java       |    36 +-
 .../org/apache/tika/detect/EncodingDetector.java   |    48 +-
 .../tika/detect/EncodingDetectorContext.java       |    98 +-
 .../org/apache/tika/detect/EncodingResult.java     |   170 +
 .../tika/detect/MetadataCharsetDetector.java       |   132 +
 .../tika/detect/OverrideEncodingDetector.java      |     7 +-
 .../tika/language/detect/LanguageDetector.java     |    21 +-
 .../apache/tika/metadata/TikaCoreProperties.java   |     7 +
 .../metadata/filter/CompositeMetadataFilter.java   |    13 +-
 .../tika/metadata/filter/MetadataFilter.java       |    35 +-
 .../tika/metadata/filter/MetadataFilterBase.java   |     3 +-
 .../apache/tika/metadata/filter/NoOpFilter.java    |     3 +-
 .../filter/RemoveByMimeMetadataFilter.java         |     3 +-
 .../tika/parser/external2/ExternalParser.java      |     6 +-
 .../org.apache.tika.detect.EncodingDetector        |    18 +-
 .../tika/config/TikaProgressTrackerTest.java       |   103 +
 .../org/apache/tika/detect}/BOMDetectorTest.java   |    11 +-
 tika-e2e-tests/README.md                           |    12 +-
 tika-e2e-tests/pom.xml                             |    74 +-
 tika-e2e-tests/tika-grpc/README.md                 |   100 +-
 tika-e2e-tests/tika-grpc/pom.xml                   |    52 +-
 .../tika/parser/ocr/TesseractOCRConfig.properties  |    25 -
 .../customocr/tika-config-inline.json              |    26 -
 .../customocr/tika-config-inline.xml               |    49 -
 .../customocr/tika-config-rendered.json            |    28 -
 .../customocr/tika-config-rendered.xml             |    55 -
 .../tika/parser/journal/GrobidExtractor.properties |    16 -
 .../sample-configs/grobid/tika-config.json         |    23 -
 .../sample-configs/grobid/tika-config.xml          |    41 -
 .../tika-grpc/sample-configs/ignite/README.md      |   117 -
 .../sample-configs/ignite/tika-config-ignite.json  |     2 +-
 .../sample-configs/ner/run_tika_server.sh          |    62 -
 .../tika-grpc/sample-configs/ner/tika-config.json  |    26 -
 .../tika-grpc/sample-configs/ner/tika-config.xml   |    45 -
 .../tika-grpc/sample-configs/test-simple.json      |    20 -
 .../vision/inception-rest-caption.json             |    18 -
 .../vision/inception-rest-caption.xml              |    32 -
 .../vision/inception-rest-video.json               |    18 -
 .../sample-configs/vision/inception-rest-video.xml |    32 -
 .../sample-configs/vision/inception-rest.json      |    18 -
 .../sample-configs/vision/inception-rest.xml       |    32 -
 .../org/apache/tika/pipes/ExternalTestBase.java    |   285 +-
 .../pipes/filesystem/FileSystemFetcherTest.java    |    79 +-
 .../tika/pipes/ignite/IgniteConfigStoreTest.java   |   679 +-
 .../java/org/apache/tika/pipes/ignite/README.md    |   172 -
 .../src/test/resources/docker-compose-ignite.yml   |    25 -
 .../src/test/resources/docker-compose.yml          |    16 -
 .../tika-grpc/src/test/resources/log4j2.xml        |    19 -
 .../src/test/resources/test-fixtures/sample.csv    |     4 +
 .../src/test/resources/test-fixtures/sample.html   |     8 +
 .../src/test/resources/test-fixtures/sample.txt    |     3 +
 .../src/test/resources/test-fixtures/sample.xml    |     5 +
 ...g-ignite.json => tika-config-ignite-local.json} |     4 +-
 .../src/test/resources/tika-config-ignite.json     |     2 +-
 .../tika-grpc/src/test/resources/tika-config.json  |    49 +-
 {tika-bundles => tika-encoding-detectors}/pom.xml  |    45 +-
 .../tika-encoding-detector-charsoup}/pom.xml       |    47 +-
 .../charsoup/CharSoupEncodingDetector.java         |   383 +
 .../org.apache.tika.detect.EncodingDetector        |     8 +-
 .../charsoup/CharSoupEncodingDetectorTest.java     |    54 +-
 .../langdetect/charsoup/TextQualityDiagTest.java   |     0
 .../tika-encoding-detector-html}/pom.xml           |    36 +-
 .../tika/parser}/html/HtmlEncodingDetector.java    |    25 +-
 .../html/charsetdetector}/CharsetAliases.java      |    19 +-
 .../charsetdetector}/CharsetDetectionResult.java   |     2 +-
 .../html/charsetdetector}/MetaProcessor.java       |     4 +-
 .../parser/html/charsetdetector}/PreScanner.java   |    15 +-
 .../StandardHtmlEncodingDetector.java              |   138 +
 .../charsets/ReplacementCharset.java               |     2 +-
 .../charsets/XUserDefinedCharset.java              |     2 +-
 .../org.apache.tika.detect.EncodingDetector        |     3 +-
 .../tika-encoding-detector-icu4j}/pom.xml          |    58 +-
 .../apache/tika/parser/txt/CharsetDetector.java    |     0
 .../org/apache/tika/parser/txt/CharsetMatch.java   |     0
 .../apache/tika/parser/txt/CharsetRecog_2022.java  |     0
 .../apache/tika/parser/txt/CharsetRecog_UTF8.java  |     0
 .../tika/parser/txt/CharsetRecog_Unicode.java      |     0
 .../apache/tika/parser/txt/CharsetRecog_mbcs.java  |     0
 .../apache/tika/parser/txt/CharsetRecog_sbcs.java  |     0
 .../apache/tika/parser/txt/CharsetRecognizer.java  |     0
 .../tika/parser/txt}/Icu4jEncodingDetector.java    |    53 +-
 .../tika/parser/txt/CharsetDetectorTest.java       |     1 +
 .../configs/tika-config-ignore-charset.json        |     0
 .../resources/test-documents/multi-language.txt    |     0
 .../src/test/resources/test-documents/resume.html  |     0
 .../resources/test-documents/testIgnoreCharset.txt |     0
 .../resources/test-documents/testTXT_win-1252.txt  |     0
 .../test-documents/test_ignore_IBM420.html         |   Bin
 .../tika-encoding-detector-mojibuster}/pom.xml     |    36 +-
 .../ml/chardetect/ByteNgramFeatureExtractor.java   |   290 +
 .../tika/ml/chardetect}/CharsetConfusables.java    |   111 +-
 .../tika/ml/chardetect/CjkEncodingRules.java       |   400 +
 .../ml/chardetect/MojibusterEncodingDetector.java  |   720 +
 .../ml/chardetect}/StructuralEncodingRules.java    |   228 +-
 .../org.apache.tika.detect.EncodingDetector        |     6 +-
 .../org/apache/tika/ml/chardetect/chardetect.bin   |   Bin 0 -> 606934 bytes
 .../tika/ml/chardetect/EbcdicRoutingTest.java      |   104 +
 .../ml/chardetect/ZipFilenameDetectionTest.java    |   132 +
 .../tika-encoding-detector-universal}/pom.xml      |    36 +-
 .../parser/txt}/UniversalEncodingDetector.java     |    53 +-
 .../parser/txt}/UniversalEncodingListener.java     |     2 +-
 tika-eval/tika-eval-app/pom.xml                    |    29 +-
 .../tika-eval-app}/src/main/assembly/assembly.xml  |     4 -
 .../java/org/apache/tika/eval/app/EvalConfig.java  |     2 +-
 .../tika/eval/app/ExtractComparerRunner.java       |    88 +-
 .../java/org/apache/tika/eval/app/TikaEvalCLI.java |    58 +
 .../eval/app/reports/MarkdownSummaryWriter.java    |   611 +
 .../tika/eval/app/reports/ResultsReporter.java     |     7 +-
 .../apache/tika/eval/app/SimpleComparerTest.java   |    18 +-
 .../org/apache/tika/eval/app/TikaEvalCLITest.java  |    13 +-
 .../resources/test-dirs/extractsB/file1.pdf.json   |     2 +-
 .../src/main/resources/common_tokens/ace           |  1578 +
 .../src/main/resources/common_tokens/afr           | 60006 +++++++++---------
 .../src/main/resources/common_tokens/aka           |  9272 +++
 .../src/main/resources/common_tokens/alt           |  2668 +
 .../src/main/resources/common_tokens/amh           | 19139 +++---
 .../src/main/resources/common_tokens/ami           |  2574 +
 .../src/main/resources/common_tokens/ara           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/arg           | 32637 +++++++---
 .../src/main/resources/common_tokens/arz           | 17123 ------
 .../src/main/resources/common_tokens/asm           | 10793 ++--
 .../src/main/resources/common_tokens/ast           | 30020 ----------
 .../src/main/resources/common_tokens/ava           |  2453 +
 .../src/main/resources/common_tokens/avk           |  6710 +++
 .../src/main/resources/common_tokens/azb           | 26484 ++++++++
 .../src/main/resources/common_tokens/aze           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/bak           | 59988 +++++++++---------
 .../src/main/resources/common_tokens/bam           |  1192 -
 .../src/main/resources/common_tokens/ban           | 12613 ++--
 .../src/main/resources/common_tokens/bar           | 31854 +++++-----
 .../src/main/resources/common_tokens/bcl           | 14470 ++++-
 .../src/main/resources/common_tokens/be-x-old      | 30020 ++++++++++
 .../src/main/resources/common_tokens/bel           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/ben           | 44061 +++++---------
 .../src/main/resources/common_tokens/bjn           |  8705 +--
 .../src/main/resources/common_tokens/bos           | 30020 ----------
 .../src/main/resources/common_tokens/bpy           |  1001 -
 .../src/main/resources/common_tokens/bre           | 47871 +++++++++------
 .../src/main/resources/common_tokens/bua           |  2734 -
 .../src/main/resources/common_tokens/bul           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/bxr           |  3556 ++
 .../src/main/resources/common_tokens/cat           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/cdo-x-rom     |   633 +
 .../src/main/resources/common_tokens/ceb           | 60002 +++++++++---------
 .../src/main/resources/common_tokens/ces           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/che           | 36578 +++++++++--
 .../src/main/resources/common_tokens/chv           | 24222 +++++---
 .../src/main/resources/common_tokens/ckb           | 46397 ++++++++------
 .../src/main/resources/common_tokens/cnh           | 17224 ++++++
 .../src/main/resources/common_tokens/cor           |  3558 ++
 .../src/main/resources/common_tokens/cos           | 10474 +++-
 .../src/main/resources/common_tokens/csb           |  3585 +-
 .../src/main/resources/common_tokens/cym           | 50185 +++++++++-------
 .../src/main/resources/common_tokens/dag           |  4433 ++
 .../src/main/resources/common_tokens/dan           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/deu           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/diq           | 11793 ++--
 .../src/main/resources/common_tokens/div           | 31671 +---------
 .../src/main/resources/common_tokens/dsb           |  3542 +-
 .../src/main/resources/common_tokens/ell           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/eml           |  5000 --
 .../src/main/resources/common_tokens/eng           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/epo           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/est           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/eus           | 59994 +++++++++---------
 .../src/main/resources/common_tokens/ewe           | 12375 +++-
 .../src/main/resources/common_tokens/ext           |  5545 +-
 .../src/main/resources/common_tokens/fao           | 39567 +++---------
 .../src/main/resources/common_tokens/fas           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/fin           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/fra           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/frr           |  6648 +-
 .../src/main/resources/common_tokens/fry           | 59942 +++++++++---------
 .../src/main/resources/common_tokens/gla           |  6667 ++
 .../src/main/resources/common_tokens/gle           | 52183 ++++++++--------
 .../src/main/resources/common_tokens/glg           | 59850 +++++++++---------
 .../src/main/resources/common_tokens/glv           |  6204 +-
 .../src/main/resources/common_tokens/gom           | 13378 ++---
 .../src/main/resources/common_tokens/grn           |  7802 +--
 .../src/main/resources/common_tokens/gsw           | 59486 +++++++++---------
 .../src/main/resources/common_tokens/guj           | 24230 ++------
 .../src/main/resources/common_tokens/hak-x-rom     |   693 +
 .../src/main/resources/common_tokens/hat           |  6627 --
 .../src/main/resources/common_tokens/hau           | 29608 ++++++---
 .../src/main/resources/common_tokens/hbs           | 30020 ----------
 .../src/main/resources/common_tokens/heb           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/hif           |  2137 -
 .../src/main/resources/common_tokens/hil           | 18496 ++++++
 .../src/main/resources/common_tokens/hin           | 36421 ++---------
 .../src/main/resources/common_tokens/hrv           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/hsb           | 13309 ++--
 .../src/main/resources/common_tokens/hun           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/hye           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/hyw           | 19125 ++++++
 .../src/main/resources/common_tokens/ibo           | 19379 +++++-
 .../src/main/resources/common_tokens/ido           | 21997 ++++---
 .../src/main/resources/common_tokens/ile           |  4013 +-
 .../src/main/resources/common_tokens/ilo           | 10135 ++--
 .../src/main/resources/common_tokens/ina           | 15743 ++---
 .../src/main/resources/common_tokens/ind           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/isl           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/ita           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/jav           | 59433 +++++++++---------
 .../src/main/resources/common_tokens/jbo           |  1071 +
 .../src/main/resources/common_tokens/jpn           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/kaa           |  4617 ++
 .../src/main/resources/common_tokens/kab           |  3263 +
 .../src/main/resources/common_tokens/kal           |  6341 --
 .../src/main/resources/common_tokens/kan           | 58858 +++++++++---------
 .../src/main/resources/common_tokens/kat           | 60006 +++++++++---------
 .../src/main/resources/common_tokens/kaz           | 59998 +++++++++---------
 .../src/main/resources/common_tokens/kha           |  9653 +++
 .../src/main/resources/common_tokens/khk           |  4187 --
 .../src/main/resources/common_tokens/khm           |  8745 +++
 .../src/main/resources/common_tokens/kin           | 15225 ++---
 .../src/main/resources/common_tokens/kir           | 60004 +++++++++---------
 .../src/main/resources/common_tokens/koi           |  1373 -
 .../src/main/resources/common_tokens/kom           |  2382 -
 .../src/main/resources/common_tokens/kor           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/kpv           |  3445 ++
 .../src/main/resources/common_tokens/krc           |  1974 -
 .../src/main/resources/common_tokens/ksh           |  5117 +-
 .../src/main/resources/common_tokens/kur           | 32152 ++++++----
 .../src/main/resources/common_tokens/lad           |  1681 -
 .../src/main/resources/common_tokens/lao           |  1479 +-
 .../src/main/resources/common_tokens/lat           | 59988 +++++++++---------
 .../src/main/resources/common_tokens/lav           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/lez           |  3913 ++
 .../src/main/resources/common_tokens/lfn           |  5582 ++
 .../src/main/resources/common_tokens/lim           | 48870 ++++++---------
 .../src/main/resources/common_tokens/lit           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/lmo           |  6924 ---
 .../src/main/resources/common_tokens/ltz           | 60006 +++++++++---------
 .../src/main/resources/common_tokens/lug           | 35119 ++---------
 .../src/main/resources/common_tokens/lup           |   905 -
 .../src/main/resources/common_tokens/lus           | 40004 ++++++++----
 .../src/main/resources/common_tokens/mai           |   755 -
 .../src/main/resources/common_tokens/mal           | 59240 +++++++++---------
 .../src/main/resources/common_tokens/mar           | 39628 +++---------
 .../src/main/resources/common_tokens/mhr           |  9769 +--
 .../src/main/resources/common_tokens/min           | 29209 +++++----
 .../src/main/resources/common_tokens/mkd           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/mlg           | 38614 +++++++++---
 .../src/main/resources/common_tokens/mlt           | 47776 ++++++---------
 .../src/main/resources/common_tokens/mon           | 57279 +++++++++---------
 .../src/main/resources/common_tokens/mri           |  9729 ---
 .../src/main/resources/common_tokens/mrj           |  3902 +-
 .../src/main/resources/common_tokens/msa           | 60006 +++++++++---------
 .../src/main/resources/common_tokens/mwl           | 25861 ++++----
 .../src/main/resources/common_tokens/mya           | 30020 ++++++++++
 .../src/main/resources/common_tokens/myv           |  3154 +-
 .../src/main/resources/common_tokens/mzn           | 10405 ++--
 .../src/main/resources/common_tokens/nan           |  6673 ---
 .../src/main/resources/common_tokens/nap           |  2039 -
 .../src/main/resources/common_tokens/nav           |   533 -
 .../src/main/resources/common_tokens/ndo           |  3142 -
 .../src/main/resources/common_tokens/nds           | 54491 +++++++++--------
 .../src/main/resources/common_tokens/nep           | 35836 ++---------
 .../src/main/resources/common_tokens/new           |  2545 -
 .../src/main/resources/common_tokens/nld           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/nno           | 60006 +++++++++---------
 .../src/main/resources/common_tokens/nob           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/nqo           |  2779 +
 .../src/main/resources/common_tokens/nso           |  5150 +-
 .../src/main/resources/common_tokens/nya           | 30020 ++++++++++
 .../src/main/resources/common_tokens/olo           |  2220 +
 .../src/main/resources/common_tokens/ori           | 12884 ++--
 .../src/main/resources/common_tokens/orm           | 31991 +++++++++-
 .../src/main/resources/common_tokens/oss           |  7575 ++-
 .../src/main/resources/common_tokens/pam           |  6310 +-
 .../src/main/resources/common_tokens/pan           | 11564 +---
 .../src/main/resources/common_tokens/pap           | 12598 ++--
 .../src/main/resources/common_tokens/pfl           |  4325 +-
 .../src/main/resources/common_tokens/pms           |  6552 --
 .../src/main/resources/common_tokens/pnb           | 57576 +++++++++---------
 .../src/main/resources/common_tokens/pol           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/por           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/prs           | 12167 ----
 .../src/main/resources/common_tokens/pus           | 51255 +++++++++-------
 .../src/main/resources/common_tokens/que           |  2170 -
 .../src/main/resources/common_tokens/roh           | 33539 ++++-------
 .../src/main/resources/common_tokens/ron           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/rue           |  6615 +-
 .../src/main/resources/common_tokens/run           |  3534 --
 .../src/main/resources/common_tokens/rus           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/sah           | 31037 +++++-----
 .../src/main/resources/common_tokens/san           | 14998 ++---
 .../src/main/resources/common_tokens/sat           |  6387 ++
 .../src/main/resources/common_tokens/scn           |  7559 ---
 .../src/main/resources/common_tokens/sco           | 12070 ----
 .../src/main/resources/common_tokens/sgs           |  5547 +-
 .../src/main/resources/common_tokens/sin           | 34762 +++++------
 .../src/main/resources/common_tokens/skr           |  6326 ++
 .../src/main/resources/common_tokens/slk           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/slv           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/sme           |  5891 +-
 .../src/main/resources/common_tokens/smi           |  1676 -
 .../src/main/resources/common_tokens/smn           |  2934 +
 .../src/main/resources/common_tokens/smo           | 24490 ++++++++
 .../src/main/resources/common_tokens/sna           | 29768 ++-------
 .../src/main/resources/common_tokens/snd           | 26767 +++++----
 .../src/main/resources/common_tokens/som           | 32074 +++-------
 .../src/main/resources/common_tokens/sot           |  3535 --
 .../src/main/resources/common_tokens/spa           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/sqi           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/srd           |  3796 --
 .../src/main/resources/common_tokens/srp           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/ssw           |  2035 -
 .../src/main/resources/common_tokens/stq           |  3423 ++
 .../src/main/resources/common_tokens/sun           | 45923 ++++++--------
 .../src/main/resources/common_tokens/swe           | 60006 +++++++++---------
 .../src/main/resources/common_tokens/swh           | 28444 +++++++--
 .../src/main/resources/common_tokens/szl           |  8763 ++-
 .../src/main/resources/common_tokens/szy           |  4825 ++
 .../src/main/resources/common_tokens/tam           | 44673 +++++---------
 .../src/main/resources/common_tokens/tat           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/tay           |  1220 +
 .../src/main/resources/common_tokens/tel           | 44436 +++++---------
 .../src/main/resources/common_tokens/tet           | 24501 ++++++++
 .../src/main/resources/common_tokens/tgk           | 57085 +++++++++---------
 .../src/main/resources/common_tokens/tgl           | 59972 +++++++++---------
 .../src/main/resources/common_tokens/tha           | 53250 +++++++---------
 .../src/main/resources/common_tokens/tir           | 30020 ++++++++++
 .../src/main/resources/common_tokens/trv           |  3333 +
 .../src/main/resources/common_tokens/tsn           | 17815 ++++--
 .../src/main/resources/common_tokens/tso           | 15810 +++--
 .../src/main/resources/common_tokens/tuk           | 30868 ++++------
 .../src/main/resources/common_tokens/tum           |  4881 ++
 .../src/main/resources/common_tokens/tur           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/tyv           |  8541 ++-
 .../src/main/resources/common_tokens/udm           | 31389 +++++++++-
 .../src/main/resources/common_tokens/uig           | 31403 +++++-----
 .../src/main/resources/common_tokens/ukr           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/urd           | 60006 +++++++++---------
 .../src/main/resources/common_tokens/uzb           | 59970 +++++++++---------
 .../src/main/resources/common_tokens/uzn           | 30020 ----------
 .../src/main/resources/common_tokens/ven           |  2457 -
 .../src/main/resources/common_tokens/vep           |  7276 +++
 .../src/main/resources/common_tokens/vie           | 51687 +++++++---------
 .../src/main/resources/common_tokens/vls           | 15292 ++---
 .../src/main/resources/common_tokens/vol           |  7216 +--
 .../src/main/resources/common_tokens/vro           |  4730 +-
 .../src/main/resources/common_tokens/war           | 55938 ++++++++---------
 .../src/main/resources/common_tokens/wln           | 12282 ++--
 .../src/main/resources/common_tokens/wuu           | 30020 ----------
 .../src/main/resources/common_tokens/xho           | 51611 +++++++++-------
 .../src/main/resources/common_tokens/xmf           | 15787 +++--
 .../src/main/resources/common_tokens/ydd           | 21613 ++++---
 .../src/main/resources/common_tokens/yor           |  7833 ++-
 .../src/main/resources/common_tokens/yue           |  9176 +++
 .../src/main/resources/common_tokens/zea           |  2318 -
 .../src/main/resources/common_tokens/zho           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/zul           | 35021 ++---------
 .../core/metadata/TikaEvalMetadataFilterTest.java  |    13 +-
 .../tika/eval/core/textstats/TextStatsTest.java    |    28 +-
 .../core/tokens/tools/CommonTokenGenerator.java    |   102 +-
 tika-example/pom.xml                               |     2 +-
 .../java/org/apache/tika/example/Language.java     |     5 +-
 .../tika/example/LanguageDetectorExample.java      |     3 +-
 .../java/org/apache/tika/example/MyFirstTika.java  |     3 +-
 .../tika/example/PipesForkParserExample.java       |    10 +-
 .../tika/example/LanguageDetectorExampleTest.java  |     2 +-
 tika-grpc/dev-tika-config.json                     |     3 +-
 tika-grpc/pom.xml                                  |    22 +-
 tika-grpc/run-dev.sh                               |    15 +-
 .../org/apache/tika/pipes/grpc/TikaGrpcServer.java |     7 +-
 .../apache/tika/pipes/grpc/TikaGrpcServerImpl.java |    39 +-
 tika-grpc/src/main/proto/tika.proto                |     2 +
 .../src/test/resources/tika-config-ignite.json     |     2 +-
 .../src/test/resources/tika-pipes-test-config.json |    14 +-
 tika-integration-tests/pom.xml                     |     1 +
 .../pom.xml                                        |    19 +-
 .../elasticsearch/tests/ElasticsearchTest.java     |   740 +
 .../tests/ElasticsearchTestClient.java}            |    49 +-
 .../elasticsearch/elasticsearch-mappings.json}     |     0
 .../elasticsearch-parent-child-mappings.json}      |     0
 .../elasticsearch-vector-mappings.json             |    17 +
 .../resources/elasticsearch}/plugins-template.json |    46 +-
 .../resources/pipes-fork-server-custom-log4j2.xml  |     0
 .../src/test/resources/test-documents/fake_oom.xml |     0
 .../src/test/resources/test-documents/npe.xml      |     0
 .../src/test/resources/test-documents/oom.xml      |     0
 .../test-documents}/test_recursive_embedded.docx   |   Bin
 .../tika/pipes/kafka/tests/TikaPipesKafkaTest.java |     5 +-
 .../src/test/resources/kafka/plugins-template.json |     1 -
 .../resources/opensearch/plugins-template.json     |    11 +-
 .../opensearch/tika-config-opensearch.json         |    11 +-
 .../src/test/resources/s3/plugins-template.json    |     1 -
 .../pipes/solr/tests/TikaPipesSolrTestBase.java    |    60 +-
 .../src/test/resources/solr/plugins-template.json  |    11 +-
 .../src/test/resources/tika-config-solr-urls.json  |     5 +-
 tika-langdetect/pom.xml                            |     3 +-
 .../charsoup/CharSoupFeatureExtractor.java         |    12 +-
 .../tika/langdetect/charsoup/CharSoupModel.java    |   126 +-
 .../tika/langdetect/charsoup/FeatureExtractor.java |    39 +
 .../charsoup/ScriptAwareFeatureExtractor.java      |   252 +-
 .../tika/langdetect/charsoup/ScriptCategory.java   |    13 +-
 .../charsoup/ShortTextFeatureExtractor.java        |   348 +
 .../charsoup/langdetect-short-v1-20260310.bin      |   Bin 0 -> 3999308 bytes
 .../langdetect/charsoup/langdetect-v7-20260306.bin |   Bin 0 -> 3328628 bytes
 .../apache/tika/langdetect/charsoup/langdetect.bin |   Bin 1641016 -> 0 bytes
 tika-langdetect/tika-langdetect-charsoup/pom.xml   |    23 +-
 .../charsoup/CharSoupDetectorConfig.java           |   120 +
 .../charsoup/CharSoupEncodingDetector.java         |   208 -
 .../charsoup/CharSoupLanguageDetector.java         |   599 +-
 .../charsoup/CharSoupMetadataFilter.java}          |    36 +-
 .../tika/langdetect/charsoup/ConfusableGroups.java |    72 +
 .../langdetect/charsoup/LanguageConfusables.java   |   195 -
 .../src/main/python/extract_madlad_to_wiki.py      |   182 +
 .../tika/langdetect/charsoup/confusables.txt       |    37 +
 .../charsoup/CharSoupDetectorConfigTest.java       |   125 +
 .../charsoup/CharSoupFeatureExtractorTest.java     |     2 +-
 .../charsoup/CharSoupModelRoutingTest.java         |   281 +
 .../langdetect/charsoup/LangIdRegressionTest.java  |    25 +-
 .../tika/langdetect/charsoup/LinearModelTest.java  |     2 +-
 .../charsoup/ScriptAwareFeatureExtractorTest.java  |    33 +-
 .../langdetect/charsoup/SjisLangSignalTest.java    |   253 +
 .../langdetect/charsoup/tools/AblationRunner.java  |   652 +-
 .../charsoup/tools/BucketSaturationAnalyzer.java   |    76 +-
 .../charsoup/tools/CalibrateConfidence.java        |   217 +
 .../charsoup/tools/CompareDetectors.java           |  1470 +-
 .../langdetect/charsoup/tools/ConfusionDump.java   |   184 +
 .../langdetect/charsoup/tools/CorpusReader.java    |    73 +-
 .../langdetect/charsoup/tools/CrossDomainEval.java |   309 +-
 .../charsoup/tools/DiagnoseUnknownScript.java      |   145 +
 .../charsoup/tools/KoreanFalsePositives.java       |   146 +
 .../langdetect/charsoup/tools/ModelQuantizer.java  |    28 +-
 .../langdetect/charsoup/tools/Phase2SmokeTest.java |     2 +-
 .../langdetect/charsoup/tools/Phase2Trainer.java   |   102 +
 .../langdetect/charsoup/tools/PrepareCorpus.java   |   992 +
 .../langdetect/charsoup/tools/QuickF1Eval.java     |   134 +-
 .../charsoup/tools/ResearchFeatureExtractor.java   |   457 +
 .../charsoup/tools/TrainLanguageModel.java         |   982 +-
 .../langdetect/charsoup/tools/TrainShortModel.java |   179 +
 .../src/test/python/check_script_consistency.py    |   253 +
 .../src/test/python/clean_madlad.py                |   308 +
 .../src/test/python/collect_wikipedia.py           |   556 +
 .../src/test/python/diagnose_kor_eng.py            |   256 +
 .../src/test}/python/download_madlad.py            |     0
 .../src/test/python/eval_fasttext.py               |   290 +
 .../src/test/python/filter_contamination.py        |   240 +
 .../src/test/python/filter_pashto.py               |    66 +-
 .../src/test/python/filter_uppercase.py            |   130 +
 .../src/test/python/summarize_wikipedia.py         |   170 +
 tika-langdetect/tika-langdetect-optimaize/pom.xml  |     2 +-
 tika-langdetect/tika-langdetect-tika/pom.xml       |    75 -
 .../tika/langdetect/tika/LanguageIdentifier.java   |   260 -
 .../tika/langdetect/tika/LanguageProfile.java      |   317 -
 .../langdetect/tika/LanguageProfilerBuilder.java   |   767 -
 .../tika/langdetect/tika/ProfilingWriter.java      |   103 -
 .../tika/langdetect/tika/TikaLanguageDetector.java |    92 -
 ...rg.apache.tika.language.detect.LanguageDetector |    16 -
 .../org/apache/tika/langdetect/tika/be.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/ca.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/da.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/de.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/el.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/en.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/eo.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/es.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/et.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/fa.ngp         |  1015 -
 .../org/apache/tika/langdetect/tika/fi.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/fr.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/gl.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/hu.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/is.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/it.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/lt.ngp         |  1209 -
 .../org/apache/tika/langdetect/tika/nl.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/no.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/pl.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/pt.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/ro.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/ru.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/sk.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/sl.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/sv.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/th.ngp         |  1014 -
 .../tika/langdetect/tika/tika.language.properties  |    56 -
 .../org/apache/tika/langdetect/tika/uk.ngp         |  1014 -
 .../langdetect/tika/LanguageIdentifierTest.java    |   185 -
 .../tika/langdetect/tika/LanguageProfileTest.java  |    58 -
 .../tika/LanguageProfilerBuilderTest.java          |    96 -
 .../tika/langdetect/tika/ProfilingHandler.java     |    67 -
 .../tika/langdetect/tika/ProfilingWriterTest.java  |    45 -
 .../org/apache/tika/langdetect/tika/da.test        |   108 -
 .../org/apache/tika/langdetect/tika/de.test        |   104 -
 .../org/apache/tika/langdetect/tika/el.test        |   109 -
 .../org/apache/tika/langdetect/tika/en.test        |   105 -
 .../org/apache/tika/langdetect/tika/es.test        |   107 -
 .../org/apache/tika/langdetect/tika/et.test        |    17 -
 .../org/apache/tika/langdetect/tika/fi.test        |   106 -
 .../org/apache/tika/langdetect/tika/fr.test        |   105 -
 .../org/apache/tika/langdetect/tika/it.test        |   109 -
 .../langdetect/tika/langbuilder/welsh_corpus.txt   |  2602 -
 .../org/apache/tika/langdetect/tika/lt.test        |    32 -
 .../org/apache/tika/langdetect/tika/nl.test        |   105 -
 .../org/apache/tika/langdetect/tika/pt.test        |   105 -
 .../org/apache/tika/langdetect/tika/sv.test        |   108 -
 tika-ml/tika-ml-chardetect/README.md               |    76 +
 tika-ml/tika-ml-chardetect/pom.xml                 |    16 +
 .../tika/ml/chardetect/MlEncodingDetector.java     |   201 -
 .../tools/BenchmarkCharsetDetectors.java           |    12 +-
 .../chardetect/tools/BuildCharsetTrainingData.java |  1030 +
 .../ConfigurableByteNgramFeatureExtractor.java     |   254 +
 .../chardetect}/tools/DiagnoseCharsetDetector.java |    32 +-
 .../ml/chardetect/tools/EvalCharsetDetectors.java  |   424 +
 .../ml/chardetect}/tools/TrainCharsetModel.java    |   158 +-
 .../tika-ml-chardetect/src/test/python/anneal.py   |   379 +
 .../charsoup/CharSoupFeatureExtractor.java         |   456 -
 .../charsoup/ScriptAwareFeatureExtractor.java      |   399 -
 .../tika/langdetect/charsoup/ScriptCategory.java   |   117 -
 .../tika/langdetect/charsoup/WordTokenizer.java    |   225 -
 .../main/java/org/apache/tika/ml/LinearModel.java  |    32 +-
 tika-parent/pom.xml                                |   155 +-
 tika-parsers/pom.xml                               |     2 +
 .../pom.xml                                        |    46 +-
 .../java/org/apache/tika/http/TikaHttpClient.java  |   150 +
 .../org/apache/tika/http/TikaTestHttpServer.java   |   268 +
 .../org/apache/tika/parser/gdal/GDALParser.java    |     6 +-
 tika-parsers/tika-parsers-ml/pom.xml               |     2 +-
 .../tika-parsers-ml/tika-inference/pom.xml         |    48 +-
 .../tika/inference/AbstractEmbeddingFilter.java    |    28 +-
 .../org/apache/tika/inference/ChunkSerializer.java |    11 +-
 .../tika/inference/ImageEmbeddingConfig.java       |     3 +-
 .../org/apache/tika/inference/InferenceConfig.java |    23 +-
 .../tika/inference/OpenAIEmbeddingFilter.java      |    64 +-
 .../tika/inference/OpenAIImageEmbeddingParser.java |    97 +-
 .../apache/tika/inference/VectorSerializer.java    |    22 +-
 .../tika/inference/OpenAIEmbeddingFilterTest.java  |    70 +-
 .../inference/OpenAIImageEmbeddingParserTest.java  |   131 +-
 .../tika/inference/VectorSerializerTest.java       |    13 +
 .../tika-parsers-ml/tika-parser-nlp-module/pom.xml |     2 +-
 .../apache/tika/parser/ner/NamedEntityParser.java  |     2 +
 .../src/test/resources/configs/tika-config.json    |     2 +-
 .../tika-parser-tess4j-module/pom.xml              |     2 +-
 .../tika/parser/ocr/tess4j/Tess4JParser.java       |     6 +-
 .../pom.xml                                        |    53 +-
 .../apache/tika/parser/vlm/AbstractVLMParser.java  |   107 +-
 .../apache/tika/parser/vlm/ClaudeVLMParser.java    |    21 +-
 .../apache/tika/parser/vlm/GeminiVLMParser.java    |    14 +-
 .../tika/parser/vlm/MarkdownToXHTMLEmitter.java    |     0
 .../apache/tika/parser/vlm/OpenAIVLMParser.java    |    23 +-
 .../org/apache/tika/parser/vlm/VLMOCRConfig.java   |    17 +
 .../tika/parser/vlm/ClaudeVLMParserTest.java       |    62 +-
 .../tika/parser/vlm/GeminiVLMParserTest.java       |    64 +-
 .../parser/vlm/MarkdownToXHTMLEmitterTest.java     |     0
 .../tika/parser/vlm/OpenAIVLMParserTest.java       |    69 +-
 .../tika-parser-html-module/pom.xml                |    17 +-
 .../org/apache/tika/parser/html/JSoupParser.java   |     7 +-
 .../tika/parser/html/HtmlEncodingDetectorTest.java |     7 +-
 .../apache/tika/parser/html/HtmlParserTest.java    |    36 +-
 .../html/StandardHtmlEncodingDetectorTest.java     |    10 +-
 .../tika-parser-microsoft-module/pom.xml           |    11 +-
 .../tika/parser/microsoft/OutlookExtractor.java    |     7 +-
 .../microsoft/POIContainerExtractionTest.java      |     4 +-
 .../tika-parser-miscoffice-module/pom.xml          |    16 +-
 .../java/org/apache/tika/parser/dbf/DBFParser.java |     7 +-
 .../apache/tika/parser/ocr/TesseractOCRParser.java |     6 +-
 .../tika/parser/ocr/TesseractOCRParserTest.java    |     4 +-
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  |     7 +
 .../java/org/apache/tika/parser/pkg/ZipParser.java |    18 +-
 .../tika-parser-text-module/pom.xml                |    18 +-
 .../apache/tika/parser/strings/StringsParser.java  |    11 +-
 .../tika/parser/csv/TextAndCSVParserTest.java      |    12 +-
 .../org/apache/tika/parser/txt/TXTParserTest.java  |   103 +-
 .../apache/tika/parser/xmp/XMPPacketScanner.java   |     7 +-
 .../tika/parser/xmp/XmpboxExtractorTest.java       |     2 -
 .../tika-parsers-standard-package/pom.xml          |    42 +-
 .../tika/config/TikaEncodingDetectorTest.java      |   178 +-
 .../apache/tika/parser/AutoDetectParserTest.java   |     6 +-
 .../tika/parser/microsoft/rtf/RTFParserTest.java   |     6 +-
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |    35 +-
 .../apache/tika/parser/pkg/PackageParserTest.java  |     3 +
 ...IKA-2273-exclude-encoding-detector-default.json |     2 +-
 .../TIKA-2273-parameterize-encoding-detector.json  |     4 +-
 .../TIKA-2485-encoding-detector-mark-limits.json   |    13 +-
 .../tika/async/cli/FileListPipesIterator.java      |   122 +
 .../org/apache/tika/async/cli/PluginsWriter.java   |   119 +-
 .../org/apache/tika/async/cli/TikaAsyncCLI.java    |   102 +-
 .../apache/tika/async/cli/AsyncCliParserTest.java  |    44 +-
 .../tika/async/cli/FileListPipesIteratorTest.java  |   103 +
 .../test/resources/configs/config-template.json    |     6 +-
 .../org/apache/tika/client/HttpClientFactory.java  |    85 +-
 tika-pipes/tika-pipes-api/pom.xml                  |    19 +
 .../java/org/apache/tika/pipes/api/ParseMode.java  |     3 +
 tika-pipes/tika-pipes-config-store-ignite/pom.xml  |    68 +-
 .../tika/pipes/ignite/ExtensionConfigDTO.java      |    29 +-
 .../tika/pipes/ignite/IgniteConfigStore.java       |   182 +-
 .../ignite/config/IgniteConfigStoreConfig.java     |    59 +-
 .../pipes/ignite/server/IgniteStoreServer.java     |   201 +-
 .../tika/pipes/ignite/IgniteConfigStoreTest.java   |   119 +-
 .../org/apache/tika/pipes/core/PipesClient.java    |    41 +-
 .../org/apache/tika/pipes/core/PipesConfig.java    |    15 -
 .../tika/pipes/core/async/AsyncProcessor.java      |    33 +-
 .../tika/pipes/core/config/ConfigMerger.java       |    22 +-
 .../tika/pipes/core/config/ConfigOverrides.java    |    39 +-
 .../tika/pipes/core/protocol/PipesMessage.java     |    16 +-
 .../tika/pipes/core/server/ConnectionHandler.java  |    77 +-
 .../apache/tika/pipes/core/server/EmitHandler.java |     2 +-
 .../core/server/MetadataListAndEmbeddedBytes.java  |     5 +-
 .../apache/tika/pipes/core/server/PipesServer.java |    44 +-
 .../tika/pipes/core/config/ConfigMergerTest.java   |    11 +-
 .../tika/pipes/core/protocol/PipesMessageTest.java |     2 +-
 .../apache/tika/pipes/fork/PipesForkParser.java    |     6 +-
 .../tika/pipes/fork/PipesForkParserConfig.java     |    22 +-
 .../tika/pipes/fork/PipesForkParserTest.java       |    52 +-
 .../filter/AttachmentCountingListFilter.java       |     3 +-
 .../apache/tika/pipes/core/CrashingDetector.java   |     2 +
 .../apache/tika/pipes/core/PipesClientTest.java    |    30 +-
 .../resources/configs/tika-config-bad-class.json   |     6 +-
 .../configs/tika-config-bad-java-path.json         |     6 +-
 .../configs/tika-config-bad-jvm-args.json          |     6 +-
 .../test/resources/configs/tika-config-basic.json  |     8 +-
 .../configs/tika-config-crashing-detector.json     |     8 +-
 .../resources/configs/tika-config-emit-all.json    |     6 +-
 .../resources/configs/tika-config-passback.json    |     8 +-
 .../configs/tika-config-shared-server.json         |     8 +-
 .../configs/tika-config-timeout-lt-heartbeat.json  |     6 +-
 .../resources/configs/tika-config-truncate.json    |     4 +-
 .../resources/configs/tika-config-uppercasing.json |     8 +-
 .../configs/tika-config-write-limiter.json         |     4 +-
 tika-pipes/tika-pipes-plugins/pom.xml              |     1 +
 .../fetcher/atlassianjwt/AtlassianJwtFetcher.java  |    14 +-
 .../atlassianjwt/AtlassianJwtFetcherFactory.java   |     4 +-
 .../config/AtlassianJwtFetcherConfig.java          |    40 +-
 .../pom.xml                                        |    27 +-
 .../src/main/assembly/assembly.xml                 |     0
 .../org/apache/tika/pipes/emitter/es/ESClient.java |   395 +
 .../apache/tika/pipes/emitter/es/ESEmitter.java}   |    89 +-
 .../tika/pipes/emitter/es/ESEmitterConfig.java     |    74 +
 .../tika/pipes/emitter/es/ESEmitterFactory.java}   |    22 +-
 .../tika/pipes/emitter/es/HttpClientConfig.java    |    39 +
 .../tika/pipes/emitter/es}/JsonResponse.java       |     3 +-
 .../tika/pipes/plugin/es/ESPipesPlugin.java}       |    14 +-
 .../tika/pipes/reporter/es/ESPipesReporter.java    |   251 +
 .../tika/pipes/reporter/es/ESReporterConfig.java}  |    17 +-
 .../tika/pipes/reporter/es/ESReporterFactory.java} |    24 +-
 .../src/main/resources/plugin.properties           |     6 +-
 .../apache/tika/pipes/emitter/es/ESClientTest.java |   165 +
 .../tika-pipes-google-drive/pom.xml                |     6 +-
 .../tika/pipes/fetcher/http/HttpFetcher.java       |    14 +-
 .../pipes/fetcher/http/HttpFetcherFactory.java     |     4 +-
 .../fetcher/http/config/HttpFetcherConfig.java     |    40 +-
 .../tika/pipes/fetcher/http/HttpFetcherTest.java   |     8 +-
 .../tika-pipes-microsoft-graph/pom.xml             |     4 +-
 .../pipes/emitter/opensearch/HttpClientConfig.java |     2 +-
 .../pipes/emitter/opensearch/OpenSearchClient.java |    43 +-
 .../emitter/opensearch/OpenSearchEmitter.java      |     4 +-
 .../reporter/opensearch/HttpClientConfig.java      |     2 +-
 .../opensearch/OpenSearchPipesReporter.java        |     4 +-
 .../tika/pipes/emitter/solr/SolrEmitter.java       |     8 +-
 .../tika/pipes/emitter/solr/SolrEmitterConfig.java |    12 +-
 .../pipes/iterator/solr/SolrPipesIterator.java     |     8 +-
 .../iterator/solr/SolrPipesIteratorConfig.java     |    20 +-
 .../tika/config/loader/ComponentInstantiator.java  |   157 +-
 .../config/loader/TikaObjectMapperFactory.java     |    29 +
 .../tika/serialization/ComponentNameResolver.java  |   104 +-
 .../tika/serialization/JsonMetadataList.java       |     2 +-
 .../tika/serialization/ParseContextUtils.java      |    36 +-
 .../org/apache/tika/serialization/TikaModule.java  |   238 +-
 .../serdes/ParseContextDeserializer.java           |    47 +-
 .../serdes/ParseContextSerializer.java             |    42 +-
 .../java/org/apache/tika/config/AllLimitsTest.java |    11 +-
 .../org/apache/tika/config/TimeoutLimitsTest.java  |    55 +-
 .../tika/config/loader/ConfigLoaderTest.java       |    24 +-
 .../filter/AttachmentCountingListFilter.java       |     3 +-
 .../CustomClassSerializationTest.java              |     2 +-
 .../serialization/RoundTripSerializationTest.java  |    38 +-
 .../TestParseContextSerialization.java             |    49 +-
 .../test/resources/configs/all-limits-test.json    |     3 +-
 .../test/resources/configs/test-config-loader.json |     2 +-
 .../resources/configs/test-partial-config.json     |     2 +-
 .../resources/configs/timeout-limits-test.json     |     3 +-
 tika-server/tika-server-core/pom.xml               |     2 +-
 .../apache/tika/server/core/TikaServerProcess.java |    33 +-
 .../server/core/resource/LanguageResource.java     |    24 +-
 .../tika/server/core/resource/PipesResource.java   |     4 +-
 .../server/core/resource/TranslateResource.java    |     4 +-
 .../org/apache/tika/server/core/CXFTestBase.java   |     8 +-
 .../tika/server/core/LanguageResourceTest.java     |    10 +-
 .../core/TikaServerPipesIntegrationTest.java       |     2 +-
 .../resources/configs/cxf-test-base-template.json  |     8 +-
 .../configs/cxf-unpack-test-template.json          |     4 +-
 .../configs/tika-config-server-basic.json          |     6 +-
 .../configs/tika-config-server-emitter.json        |     6 +-
 .../tika-config-server-fetcher-template.json       |     6 +-
 .../tika-config-server-fetchers-emitters.json      |     6 +-
 .../configs/tika-config-server-pipes-basic.json    |     6 +-
 .../tika-config-server-tls-one-way-template.json   |     6 +-
 .../tika-config-server-tls-two-way-template.json   |     6 +-
 .../resources/configs/tika-config-server-tls.json  |     6 +-
 .../test/resources/configs/tika-config-server.json |     6 +-
 .../configs/tika-config-timeout-100ms.json         |     4 +-
 .../configs/tika-config-with-timeout.json          |     4 +-
 .../src/test/resources/test-documents/english.txt  |     2 +-
 tika-server/tika-server-standard/pom.xml           |    18 +
 .../resources/configs/cxf-test-base-template.json  |     8 +-
 .../configs/tika-config-for-server-tests.json      |     6 +-
 .../test/resources/configs/tika-config-json.json   |     6 +-
 .../tika-config-langdetect-opennlp-filter.json     |     6 +-
 .../tika-config-langdetect-optimaize-filter.json   |     6 +-
 tika-translate/pom.xml                             |     4 +-
 .../translate/impl/AbstractTranslator.java         |    24 +-
 .../translate/impl/JoshuaNetworkTranslator.java    |     2 +-
 754 files changed, 3495444 insertions(+), 3453047 deletions(-)
 delete mode 100644 .java-version
 create mode 100644 
docs/modules/ROOT/pages/advanced/charset-detection-design.adoc
 create mode 100644 
docs/modules/ROOT/pages/advanced/charsoup-supported-languages.adoc
 create mode 100644 
docs/modules/ROOT/pages/advanced/lang-detection/flores-AUTOMATIC.log
 create mode 100644 
docs/modules/ROOT/pages/advanced/lang-detection/flores-SHORT_TEXT.log
 create mode 100644 
docs/modules/ROOT/pages/advanced/lang-detection/flores-STANDARD.log
 create mode 100644 
docs/modules/ROOT/pages/advanced/lang-detection/flores200-dev-eval.md
 create mode 100644 
docs/modules/ROOT/pages/advanced/lang-detection/language-drop-decisions.md
 create mode 100644 
docs/modules/ROOT/pages/advanced/lang-detection/short-text-language-decisions.md
 create mode 100644 
docs/modules/ROOT/pages/advanced/lang-detection/supported-languages.md
 create mode 100644 docs/modules/ROOT/pages/pipes/timeouts.adoc
 delete mode 100755 run-lang-train.sh
 delete mode 100644 tika-charset-detectors/pom.xml
 delete mode 100644 
tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/encoding/BomEncodingDetector.java
 delete mode 100644 
tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/encoding/ByteNgramFeatureExtractor.java
 delete mode 100644 
tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/encoding/FeatureExtractor.java
 delete mode 100644 
tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/encoding/HttpHeaderEncodingDetector.java
 delete mode 100644 
tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/encoding/LinearModel.java
 delete mode 100644 
tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/encoding/MlEncodingDetector.java
 delete mode 100644 
tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/encoding/Prediction.java
 delete mode 100644 
tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/html/StandardHtmlEncodingDetector.java
 delete mode 100644 
tika-charset-detectors/tika-charset-detectors-core/src/test/java/org/apache/tika/detect/encoding/ByteNgramFeatureExtractorTest.java
 delete mode 100644 
tika-charset-detectors/tika-charset-detectors-core/src/test/java/org/apache/tika/detect/encoding/CharsetConfusablesTest.java
 delete mode 100644 tika-charset-detectors/tika-charset-detectors-icu4j/pom.xml
 delete mode 100644 
tika-charset-detectors/tika-charset-detectors-tools/src/main/java/org/apache/tika/detect/encoding/tools/EvalCharsetDetectors.java
 delete mode 100644 
tika-charset-detectors/tika-charset-detectors-tools/src/main/python/build_charset_training.py
 create mode 100644 
tika-core/src/main/java/org/apache/tika/config/TikaProgressTracker.java
 delete mode 100644 
tika-core/src/main/java/org/apache/tika/config/TikaTaskTimeout.java
 rename 
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/txt
 => tika-core/src/main/java/org/apache/tika/detect}/BOMDetector.java (69%)
 create mode 100644 
tika-core/src/main/java/org/apache/tika/detect/EncodingResult.java
 create mode 100644 
tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java
 rename {tika-charset-detectors/tika-charset-detectors-core => 
tika-core}/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
 (60%)
 create mode 100644 
tika-core/src/test/java/org/apache/tika/config/TikaProgressTrackerTest.java
 rename 
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt
 => tika-core/src/test/java/org/apache/tika/detect}/BOMDetectorTest.java (91%)
 delete mode 100644 
tika-e2e-tests/tika-grpc/sample-configs/customocr/org/apache/tika/parser/ocr/TesseractOCRConfig.properties
 delete mode 100644 
tika-e2e-tests/tika-grpc/sample-configs/customocr/tika-config-inline.json
 delete mode 100644 
tika-e2e-tests/tika-grpc/sample-configs/customocr/tika-config-inline.xml
 delete mode 100644 
tika-e2e-tests/tika-grpc/sample-configs/customocr/tika-config-rendered.json
 delete mode 100644 
tika-e2e-tests/tika-grpc/sample-configs/customocr/tika-config-rendered.xml
 delete mode 100644 
tika-e2e-tests/tika-grpc/sample-configs/grobid/org/apache/tika/parser/journal/GrobidExtractor.properties
 delete mode 100644 
tika-e2e-tests/tika-grpc/sample-configs/grobid/tika-config.json
 delete mode 100644 
tika-e2e-tests/tika-grpc/sample-configs/grobid/tika-config.xml
 delete mode 100644 tika-e2e-tests/tika-grpc/sample-configs/ignite/README.md
 delete mode 100755 
tika-e2e-tests/tika-grpc/sample-configs/ner/run_tika_server.sh
 delete mode 100644 tika-e2e-tests/tika-grpc/sample-configs/ner/tika-config.json
 delete mode 100644 tika-e2e-tests/tika-grpc/sample-configs/ner/tika-config.xml
 delete mode 100644 tika-e2e-tests/tika-grpc/sample-configs/test-simple.json
 delete mode 100644 
tika-e2e-tests/tika-grpc/sample-configs/vision/inception-rest-caption.json
 delete mode 100644 
tika-e2e-tests/tika-grpc/sample-configs/vision/inception-rest-caption.xml
 delete mode 100644 
tika-e2e-tests/tika-grpc/sample-configs/vision/inception-rest-video.json
 delete mode 100644 
tika-e2e-tests/tika-grpc/sample-configs/vision/inception-rest-video.xml
 delete mode 100644 
tika-e2e-tests/tika-grpc/sample-configs/vision/inception-rest.json
 delete mode 100644 
tika-e2e-tests/tika-grpc/sample-configs/vision/inception-rest.xml
 delete mode 100644 
tika-e2e-tests/tika-grpc/src/test/java/org/apache/tika/pipes/ignite/README.md
 delete mode 100644 
tika-e2e-tests/tika-grpc/src/test/resources/docker-compose-ignite.yml
 delete mode 100644 
tika-e2e-tests/tika-grpc/src/test/resources/docker-compose.yml
 delete mode 100644 tika-e2e-tests/tika-grpc/src/test/resources/log4j2.xml
 create mode 100644 
tika-e2e-tests/tika-grpc/src/test/resources/test-fixtures/sample.csv
 create mode 100644 
tika-e2e-tests/tika-grpc/src/test/resources/test-fixtures/sample.html
 create mode 100644 
tika-e2e-tests/tika-grpc/src/test/resources/test-fixtures/sample.txt
 create mode 100644 
tika-e2e-tests/tika-grpc/src/test/resources/test-fixtures/sample.xml
 copy tika-e2e-tests/tika-grpc/src/test/resources/{tika-config-ignite.json => 
tika-config-ignite-local.json} (90%)
 copy {tika-bundles => tika-encoding-detectors}/pom.xml (60%)
 rename {tika-charset-detectors/tika-charset-detectors-tools => 
tika-encoding-detectors/tika-encoding-detector-charsoup}/pom.xml (60%)
 create mode 100644 
tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
 copy 
tika-annotation-processor/src/main/resources/META-INF/services/javax.annotation.processing.Processor
 => 
tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
 (64%)
 rename {tika-langdetect/tika-langdetect-charsoup => 
tika-encoding-detectors/tika-encoding-detector-charsoup}/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java
 (79%)
 rename {tika-langdetect/tika-langdetect-charsoup => 
tika-encoding-detectors/tika-encoding-detector-charsoup}/src/test/java/org/apache/tika/langdetect/charsoup/TextQualityDiagTest.java
 (100%)
 rename {tika-charset-detectors/tika-charset-detectors-core => 
tika-encoding-detectors/tika-encoding-detector-html}/pom.xml (65%)
 rename 
{tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect
 => 
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser}/html/HtmlEncodingDetector.java
 (92%)
 rename 
{tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/html
 => 
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector}/CharsetAliases.java
 (94%)
 rename 
{tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/html
 => 
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector}/CharsetDetectionResult.java
 (97%)
 rename 
{tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/html
 => 
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector}/MetaProcessor.java
 (95%)
 rename 
{tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/html
 => 
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector}/PreScanner.java
 (93%)
 create mode 100644 
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java
 rename 
{tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/html
 => 
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector}/charsets/ReplacementCharset.java
 (97%)
 rename 
{tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/html
 => 
tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector}/charsets/XUserDefinedCharset.java
 (97%)
 copy 
tika-core/src/main/resources/META-INF/services/org.apache.tika.metadata.filter.MetadataFilter
 => 
tika-encoding-detectors/tika-encoding-detector-html/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
 (91%)
 copy 
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
 => tika-encoding-detectors/tika-encoding-detector-icu4j}/pom.xml (62%)
 rename 
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
 => 
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/main/java/org/apache/tika/parser/txt/CharsetDetector.java
 (100%)
 rename 
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
 => 
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/main/java/org/apache/tika/parser/txt/CharsetMatch.java
 (100%)
 rename 
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
 => 
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/main/java/org/apache/tika/parser/txt/CharsetRecog_2022.java
 (100%)
 rename 
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
 => 
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/main/java/org/apache/tika/parser/txt/CharsetRecog_UTF8.java
 (100%)
 rename 
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
 => 
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/main/java/org/apache/tika/parser/txt/CharsetRecog_Unicode.java
 (100%)
 rename 
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
 => 
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/main/java/org/apache/tika/parser/txt/CharsetRecog_mbcs.java
 (100%)
 rename 
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
 => 
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/main/java/org/apache/tika/parser/txt/CharsetRecog_sbcs.java
 (100%)
 rename 
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
 => 
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/main/java/org/apache/tika/parser/txt/CharsetRecognizer.java
 (100%)
 rename 
{tika-charset-detectors/tika-charset-detectors-icu4j/src/main/java/org/apache/tika/detect/encoding
 => 
tika-encoding-detectors/tika-encoding-detector-icu4j/src/main/java/org/apache/tika/parser/txt}/Icu4jEncodingDetector.java
 (77%)
 rename 
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
 => 
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java
 (98%)
 copy 
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
 => 
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/test/resources/configs/tika-config-ignore-charset.json
 (100%)
 copy 
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
 => 
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/test/resources/test-documents/multi-language.txt
 (100%)
 copy 
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
 => 
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/test/resources/test-documents/resume.html
 (100%)
 copy 
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
 => 
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/test/resources/test-documents/testIgnoreCharset.txt
 (100%)
 copy 
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
 => 
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/test/resources/test-documents/testTXT_win-1252.txt
 (100%)
 copy 
{tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module
 => 
tika-encoding-detectors/tika-encoding-detector-icu4j}/src/test/resources/test-documents/test_ignore_IBM420.html
 (100%)
 copy {tika-ml/tika-ml-chardetect => 
tika-encoding-detectors/tika-encoding-detector-mojibuster}/pom.xml (70%)
 create mode 100644 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractor.java
 rename 
{tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/encoding
 => 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect}/CharsetConfusables.java
 (69%)
 create mode 100644 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/CjkEncodingRules.java
 create mode 100644 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
 rename 
{tika-charset-detectors/tika-charset-detectors-core/src/main/java/org/apache/tika/detect/encoding
 => 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect}/StructuralEncodingRules.java
 (62%)
 copy 
tika-app/src/main/resources/META-INF/services/org.apache.tika.parser.Parser => 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector
 (71%)
 create mode 100644 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect.bin
 create mode 100644 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/EbcdicRoutingTest.java
 create mode 100644 
tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java
 rename {tika-charset-detectors/tika-charset-detectors-universal => 
tika-encoding-detectors/tika-encoding-detector-universal}/pom.xml (64%)
 rename 
{tika-charset-detectors/tika-charset-detectors-universal/src/main/java/org/apache/tika/detect/encoding
 => 
tika-encoding-detectors/tika-encoding-detector-universal/src/main/java/org/apache/tika/parser/txt}/UniversalEncodingDetector.java
 (53%)
 rename 
{tika-charset-detectors/tika-charset-detectors-universal/src/main/java/org/apache/tika/detect/encoding
 => 
tika-encoding-detectors/tika-encoding-detector-universal/src/main/java/org/apache/tika/parser/txt}/UniversalEncodingListener.java
 (99%)
 copy {tika-pipes/tika-pipes-fork-parser => 
tika-eval/tika-eval-app}/src/main/assembly/assembly.xml (92%)
 create mode 100644 
tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/MarkdownSummaryWriter.java
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/ace
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/aka
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/alt
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/ami
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/arz
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/ast
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/ava
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/avk
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/azb
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/bam
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/be-x-old
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/bos
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/bpy
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/bua
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/bxr
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/cdo-x-rom
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/cnh
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/cor
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/dag
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/eml
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/gla
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/hak-x-rom
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/hat
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/hbs
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/hif
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/hil
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/hyw
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/jbo
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/kaa
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/kab
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/kal
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/kha
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/khk
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/khm
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/koi
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/kom
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/kpv
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/krc
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/lad
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/lez
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/lfn
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/lmo
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/lup
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/mai
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/mri
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/mya
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/nan
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/nap
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/nav
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/ndo
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/new
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/nqo
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/nya
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/olo
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/pms
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/prs
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/que
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/run
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/sat
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/scn
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/sco
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/skr
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/smi
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/smn
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/smo
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/sot
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/srd
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/ssw
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/stq
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/szy
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/tay
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/tet
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/tir
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/trv
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/tum
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/uzn
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/ven
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/vep
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/wuu
 create mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/yue
 delete mode 100644 
tika-eval/tika-eval-core/src/main/resources/common_tokens/zea
 copy tika-integration-tests/{tika-pipes-opensearch-integration-tests => 
tika-pipes-es-integration-tests}/pom.xml (85%)
 create mode 100644 
tika-integration-tests/tika-pipes-es-integration-tests/src/test/java/org/apache/tika/pipes/elasticsearch/tests/ElasticsearchTest.java
 copy 
tika-integration-tests/{tika-pipes-opensearch-integration-tests/src/test/java/org/apache/tika/pipes/opensearch/tests/OpensearchTestClient.java
 => 
tika-pipes-es-integration-tests/src/test/java/org/apache/tika/pipes/elasticsearch/tests/ElasticsearchTestClient.java}
 (72%)
 copy 
tika-integration-tests/{tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/opensearch-mappings.json
 => 
tika-pipes-es-integration-tests/src/test/resources/elasticsearch/elasticsearch-mappings.json}
 (100%)
 copy 
tika-integration-tests/{tika-pipes-opensearch-integration-tests/src/test/resources/opensearch/opensearch-parent-child-mappings.json
 => 
tika-pipes-es-integration-tests/src/test/resources/elasticsearch/elasticsearch-parent-child-mappings.json}
 (100%)
 create mode 100644 
tika-integration-tests/tika-pipes-es-integration-tests/src/test/resources/elasticsearch/elasticsearch-vector-mappings.json
 copy 
tika-integration-tests/{tika-pipes-opensearch-integration-tests/src/test/resources/opensearch
 => 
tika-pipes-es-integration-tests/src/test/resources/elasticsearch}/plugins-template.json
 (78%)
 copy tika-integration-tests/{tika-pipes-opensearch-integration-tests => 
tika-pipes-es-integration-tests}/src/test/resources/pipes-fork-server-custom-log4j2.xml
 (100%)
 copy tika-integration-tests/{tika-pipes-opensearch-integration-tests => 
tika-pipes-es-integration-tests}/src/test/resources/test-documents/fake_oom.xml 
(100%)
 copy tika-integration-tests/{tika-pipes-opensearch-integration-tests => 
tika-pipes-es-integration-tests}/src/test/resources/test-documents/npe.xml 
(100%)
 copy tika-integration-tests/{tika-pipes-opensearch-integration-tests => 
tika-pipes-es-integration-tests}/src/test/resources/test-documents/oom.xml 
(100%)
 copy {tika-app/src/test/resources/test-data => 
tika-integration-tests/tika-pipes-es-integration-tests/src/test/resources/test-documents}/test_recursive_embedded.docx
 (100%)
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ShortTextFeatureExtractor.java
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect-short-v1-20260310.bin
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect-v7-20260306.bin
 delete mode 100644 
tika-langdetect/tika-langdetect-charsoup-core/src/main/resources/org/apache/tika/langdetect/charsoup/langdetect.bin
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupDetectorConfig.java
 delete mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java
 copy 
tika-langdetect/{tika-langdetect-optimaize/src/main/java/org/apache/tika/langdetect/optimaize/metadatafilter/OptimaizeMetadataFilter.java
 => 
tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupMetadataFilter.java}
 (59%)
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/ConfusableGroups.java
 delete mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/LanguageConfusables.java
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/main/python/extract_madlad_to_wiki.py
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/main/resources/org/apache/tika/langdetect/charsoup/confusables.txt
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupDetectorConfigTest.java
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupModelRoutingTest.java
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/SjisLangSignalTest.java
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CalibrateConfidence.java
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/ConfusionDump.java
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/DiagnoseUnknownScript.java
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/KoreanFalsePositives.java
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/PrepareCorpus.java
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/ResearchFeatureExtractor.java
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/TrainShortModel.java
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/test/python/check_script_consistency.py
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/test/python/clean_madlad.py
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/test/python/collect_wikipedia.py
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/test/python/diagnose_kor_eng.py
 rename {tika-charset-detectors/tika-charset-detectors-tools/src/main => 
tika-langdetect/tika-langdetect-charsoup/src/test}/python/download_madlad.py 
(100%)
 create mode 100755 
tika-langdetect/tika-langdetect-charsoup/src/test/python/eval_fasttext.py
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/test/python/filter_contamination.py
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/test/python/filter_uppercase.py
 create mode 100644 
tika-langdetect/tika-langdetect-charsoup/src/test/python/summarize_wikipedia.py
 delete mode 100644 tika-langdetect/tika-langdetect-tika/pom.xml
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/LanguageIdentifier.java
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/LanguageProfile.java
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/LanguageProfilerBuilder.java
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/ProfilingWriter.java
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/java/org/apache/tika/langdetect/tika/TikaLanguageDetector.java
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/META-INF/services/org.apache.tika.language.detect.LanguageDetector
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/be.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/ca.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/da.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/de.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/el.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/en.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/eo.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/es.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/et.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/fa.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/fi.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/fr.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/gl.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/hu.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/is.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/it.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/lt.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/nl.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/no.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/pl.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/pt.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/ro.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/ru.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/sk.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/sl.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/sv.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/th.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/tika.language.properties
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/main/resources/org/apache/tika/langdetect/tika/uk.ngp
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/LanguageIdentifierTest.java
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/LanguageProfileTest.java
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/LanguageProfilerBuilderTest.java
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/ProfilingHandler.java
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/test/java/org/apache/tika/langdetect/tika/ProfilingWriterTest.java
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/da.test
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/de.test
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/el.test
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/en.test
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/es.test
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/et.test
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/fi.test
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/fr.test
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/it.test
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/langbuilder/welsh_corpus.txt
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/lt.test
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/nl.test
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/pt.test
 delete mode 100644 
tika-langdetect/tika-langdetect-tika/src/test/resources/org/apache/tika/langdetect/tika/sv.test
 create mode 100644 tika-ml/tika-ml-chardetect/README.md
 delete mode 100644 
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/MlEncodingDetector.java
 rename 
{tika-charset-detectors/tika-charset-detectors-tools/src/main/java/org/apache/tika/detect/encoding
 => 
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect}/tools/BenchmarkCharsetDetectors.java
 (94%)
 create mode 100644 
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BuildCharsetTrainingData.java
 create mode 100644 
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java
 rename 
{tika-charset-detectors/tika-charset-detectors-tools/src/main/java/org/apache/tika/detect/encoding
 => 
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect}/tools/DiagnoseCharsetDetector.java
 (87%)
 create mode 100644 
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/EvalCharsetDetectors.java
 rename 
{tika-charset-detectors/tika-charset-detectors-tools/src/main/java/org/apache/tika/detect/encoding
 => 
tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect}/tools/TrainCharsetModel.java
 (65%)
 create mode 100644 tika-ml/tika-ml-chardetect/src/test/python/anneal.py
 delete mode 100644 
tika-ml/tika-ml-core/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupFeatureExtractor.java
 delete mode 100644 
tika-ml/tika-ml-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java
 delete mode 100644 
tika-ml/tika-ml-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptCategory.java
 delete mode 100644 
tika-ml/tika-ml-core/src/main/java/org/apache/tika/langdetect/charsoup/WordTokenizer.java
 copy tika-parsers/{tika-parsers-ml/tika-parser-tess4j-module => 
tika-http-jdk}/pom.xml (71%)
 create mode 100644 
tika-parsers/tika-http-jdk/src/main/java/org/apache/tika/http/TikaHttpClient.java
 create mode 100644 
tika-parsers/tika-http-jdk/src/test/java/org/apache/tika/http/TikaTestHttpServer.java
 rename tika-parsers/tika-parsers-ml/{tika-parser-vlm-ocr-module => 
tika-vlm}/pom.xml (67%)
 rename tika-parsers/tika-parsers-ml/{tika-parser-vlm-ocr-module => 
tika-vlm}/src/main/java/org/apache/tika/parser/vlm/AbstractVLMParser.java (80%)
 rename tika-parsers/tika-parsers-ml/{tika-parser-vlm-ocr-module => 
tika-vlm}/src/main/java/org/apache/tika/parser/vlm/ClaudeVLMParser.java (92%)
 rename tika-parsers/tika-parsers-ml/{tika-parser-vlm-ocr-module => 
tika-vlm}/src/main/java/org/apache/tika/parser/vlm/GeminiVLMParser.java (94%)
 rename tika-parsers/tika-parsers-ml/{tika-parser-vlm-ocr-module => 
tika-vlm}/src/main/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitter.java 
(100%)
 rename tika-parsers/tika-parsers-ml/{tika-parser-vlm-ocr-module => 
tika-vlm}/src/main/java/org/apache/tika/parser/vlm/OpenAIVLMParser.java (93%)
 rename tika-parsers/tika-parsers-ml/{tika-parser-vlm-ocr-module => 
tika-vlm}/src/main/java/org/apache/tika/parser/vlm/VLMOCRConfig.java (94%)
 rename tika-parsers/tika-parsers-ml/{tika-parser-vlm-ocr-module => 
tika-vlm}/src/test/java/org/apache/tika/parser/vlm/ClaudeVLMParserTest.java 
(82%)
 rename tika-parsers/tika-parsers-ml/{tika-parser-vlm-ocr-module => 
tika-vlm}/src/test/java/org/apache/tika/parser/vlm/GeminiVLMParserTest.java 
(81%)
 rename tika-parsers/tika-parsers-ml/{tika-parser-vlm-ocr-module => 
tika-vlm}/src/test/java/org/apache/tika/parser/vlm/MarkdownToXHTMLEmitterTest.java
 (100%)
 rename tika-parsers/tika-parsers-ml/{tika-parser-vlm-ocr-module => 
tika-vlm}/src/test/java/org/apache/tika/parser/vlm/OpenAIVLMParserTest.java 
(82%)
 create mode 100644 
tika-pipes/tika-async-cli/src/main/java/org/apache/tika/async/cli/FileListPipesIterator.java
 create mode 100644 
tika-pipes/tika-async-cli/src/test/java/org/apache/tika/async/cli/FileListPipesIteratorTest.java
 copy tika-pipes/tika-pipes-plugins/{tika-pipes-opensearch => 
tika-pipes-es}/pom.xml (87%)
 copy tika-pipes/tika-pipes-plugins/{tika-pipes-atlassian-jwt => 
tika-pipes-es}/src/main/assembly/assembly.xml (100%)
 create mode 100644 
tika-pipes/tika-pipes-plugins/tika-pipes-es/src/main/java/org/apache/tika/pipes/emitter/es/ESClient.java
 copy 
tika-pipes/tika-pipes-plugins/{tika-pipes-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitter.java
 => 
tika-pipes-es/src/main/java/org/apache/tika/pipes/emitter/es/ESEmitter.java} 
(52%)
 create mode 100644 
tika-pipes/tika-pipes-plugins/tika-pipes-es/src/main/java/org/apache/tika/pipes/emitter/es/ESEmitterConfig.java
 copy 
tika-pipes/tika-pipes-plugins/{tika-pipes-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch/OpenSearchEmitterFactory.java
 => 
tika-pipes-es/src/main/java/org/apache/tika/pipes/emitter/es/ESEmitterFactory.java}
 (74%)
 create mode 100644 
tika-pipes/tika-pipes-plugins/tika-pipes-es/src/main/java/org/apache/tika/pipes/emitter/es/HttpClientConfig.java
 copy 
tika-pipes/tika-pipes-plugins/{tika-pipes-opensearch/src/main/java/org/apache/tika/pipes/emitter/opensearch
 => 
tika-pipes-es/src/main/java/org/apache/tika/pipes/emitter/es}/JsonResponse.java 
(96%)
 copy 
tika-pipes/tika-pipes-plugins/{tika-pipes-s3/src/main/java/org/apache/tika/pipes/plugin/s3/S3PipesPlugin.java
 => 
tika-pipes-es/src/main/java/org/apache/tika/pipes/plugin/es/ESPipesPlugin.java} 
(77%)
 create mode 100644 
tika-pipes/tika-pipes-plugins/tika-pipes-es/src/main/java/org/apache/tika/pipes/reporter/es/ESPipesReporter.java
 copy 
tika-pipes/tika-pipes-plugins/{tika-pipes-opensearch/src/main/java/org/apache/tika/pipes/reporter/opensearch/OpenSearchReporterConfig.java
 => 
tika-pipes-es/src/main/java/org/apache/tika/pipes/reporter/es/ESReporterConfig.java}
 (67%)
 copy 
tika-pipes/tika-pipes-plugins/{tika-pipes-jdbc/src/main/java/org/apache/tika/pipes/reporter/jdbc/JDBCPipesReporterFactory.java
 => 
tika-pipes-es/src/main/java/org/apache/tika/pipes/reporter/es/ESReporterFactory.java}
 (68%)
 copy tika-pipes/tika-pipes-plugins/{tika-pipes-az-blob => 
tika-pipes-es}/src/main/resources/plugin.properties (83%)
 create mode 100644 
tika-pipes/tika-pipes-plugins/tika-pipes-es/src/test/java/org/apache/tika/pipes/emitter/es/ESClientTest.java


Reply via email to