This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch chardet-work in repository https://gitbox.apache.org/repos/asf/tika.git
commit 5ba43d32cddedc63f78031edfc5ea5b92da1ae9a Merge: fd8561403a 8d8f3440f7 Author: tballison <[email protected]> AuthorDate: Fri Feb 27 08:45:30 2026 -0500 Merge origin/main Made-with: Cursor .github/workflows/main-jdk17-build.yml | 1 + .../main-jdk17-windows-build-multi-locale.yml | 1 + .github/workflows/main-jdk17-windows-build.yml | 1 + .github/workflows/main-jdk21-build.yml | 1 + .github/workflows/main-jdk25-build.yml | 1 + docs/modules/ROOT/examples/tess4j-basic.json | 10 + docs/modules/ROOT/examples/tess4j-full.json | 18 + docs/modules/ROOT/nav.adoc | 4 + .../pages/configuration/parsers/tess4j-parser.adoc | 282 +++++++++++ .../pages/migration-to-4x/chunk-strategies.adoc | 257 ++++++++++ .../pages/migration-to-4x/design-notes-4x.adoc | 2 +- .../inference-handler-requirements.adoc | 282 +++++++++++ docs/modules/ROOT/pages/pipes/index.adoc | 1 + .../ROOT/pages/pipes/shared-server-mode.adoc | 10 +- docs/modules/ROOT/pages/pipes/timeouts.adoc | 170 +++++++ tika-app/pom.xml | 12 + .../src/main/java/org/apache/tika/cli/TikaCLI.java | 4 +- .../test/resources/configs/config-template.json | 6 +- .../apache/tika/config/TikaProgressTracker.java | 82 ++++ .../org/apache/tika/config/TikaTaskTimeout.java | 76 --- .../java/org/apache/tika/config/TimeoutLimits.java | 112 +++-- .../tika/detect/CompositeEncodingDetector.java | 155 +++++- .../tika/detect/DefaultEncodingDetector.java | 27 +- .../tika/detect/EncodingDetectorContext.java | 105 +++++ .../apache/tika/detect/MetaEncodingDetector.java | 39 ++ .../apache/tika/detect/WideUnicodeDetector.java | 490 +++++++++++++++++++ .../tika/language/detect/LanguageResult.java | 28 ++ .../apache/tika/metadata/TikaCoreProperties.java | 20 +- .../java/org/apache/tika/parser/ParseContext.java | 15 +- .../tika/parser/external2/ExternalParser.java | 6 +- .../tika/sax/BasicContentHandlerFactory.java | 7 +- .../org/apache/tika/sax/ContentHandlerFactory.java | 15 + .../tika/sax/RecursiveParserWrapperHandler.java | 2 + .../tika/config/TikaProgressTrackerTest.java | 103 ++++ .../tika/detect/WideUnicodeDetectorTest.java | 452 ++++++++++++++++++ .../org/apache/tika/parser/ParseContextTest.java | 104 +++++ .../customocr/tika-config-inline.json | 1 - .../customocr/tika-config-rendered.json | 1 - .../sample-configs/grobid/tika-config.json | 1 - .../tika-grpc/sample-configs/ner/tika-config.json | 16 +- .../tika/example/PipesForkParserExample.java | 10 +- .../src/test/resources/tika-pipes-test-config.json | 6 +- .../src/test/resources/kafka/plugins-template.json | 1 - .../resources/opensearch/plugins-template.json | 3 +- .../opensearch/tika-config-opensearch.json | 3 +- .../src/test/resources/s3/plugins-template.json | 1 - .../src/test/resources/solr/plugins-template.json | 3 +- .../src/test/resources/tika-config-solr-urls.json | 1 - .../tika/langdetect/charsoup/CharSoupModel.java | 18 +- .../charsoup/CharSoupEncodingDetector.java | 186 ++++++++ .../charsoup/CharSoupLanguageDetector.java | 155 +++++- .../charsoup/CharSoupEncodingDetectorTest.java | 206 ++++++++ .../langdetect/charsoup/TextQualityDiagTest.java | 141 ++++++ tika-langdetect/tika-langdetect-optimaize/pom.xml | 2 +- tika-parent/pom.xml | 26 +- tika-parsers/pom.xml | 2 + .../tika-http-jdk}/pom.xml | 58 ++- .../java/org/apache/tika/http/TikaHttpClient.java | 150 ++++++ .../org/apache/tika/http/TikaTestHttpServer.java | 268 +++++++++++ .../org/apache/tika/parser/gdal/GDALParser.java | 6 +- tika-parsers/tika-parsers-ml/pom.xml | 4 +- .../tika-parsers-ml/tika-inference}/pom.xml | 60 ++- .../tika/inference/AbstractEmbeddingFilter.java | 234 ++++++++++ .../main/java/org/apache/tika/inference/Chunk.java | 88 ++++ .../org/apache/tika/inference/ChunkSerializer.java | 229 +++++++++ .../tika/inference/ImageEmbeddingConfig.java | 149 ++++++ .../org/apache/tika/inference/InferenceConfig.java | 260 +++++++++++ .../org/apache/tika/inference/MarkdownChunker.java | 210 +++++++++ .../tika/inference/OpenAIEmbeddingFilter.java | 185 ++++++++ .../tika/inference/OpenAIImageEmbeddingParser.java | 396 ++++++++++++++++ .../apache/tika/inference/VectorSerializer.java | 52 +++ .../apache/tika/inference/locator/Locators.java | 121 +++++ .../tika/inference/locator/PaginatedLocator.java | 56 +++ .../tika/inference/locator/SpatialLocator.java | 54 +++ .../tika/inference/locator/TemporalLocator.java | 34 +- .../apache/tika/inference/locator/TextLocator.java | 33 +- .../apache/tika/inference/ChunkSerializerTest.java | 183 ++++++++ .../apache/tika/inference/MarkdownChunkerTest.java | 138 ++++++ .../tika/inference/OpenAIEmbeddingFilterTest.java | 256 ++++++++++ .../inference/OpenAIImageEmbeddingParserTest.java | 306 ++++++++++++ .../tika/inference/VectorSerializerTest.java | 58 +++ .../apache/tika/parser/ner/NamedEntityParser.java | 2 + .../src/test/resources/configs/tika-config.json | 2 +- .../tika-parser-tess4j-module}/pom.xml | 52 ++- .../tika/parser/ocr/tess4j/Tess4JConfig.java | 355 ++++++++++++++ .../tika/parser/ocr/tess4j/Tess4JParser.java | 518 +++++++++++++++++++++ .../tika/parser/ocr/tess4j/Tess4JConfigTest.java | 140 ++++++ .../tika/parser/ocr/tess4j/Tess4JParserTest.java | 255 ++++++++++ .../src/test/resources/test-documents/testOCR.jpg | Bin 0 -> 3408 bytes .../pom.xml | 53 +-- .../apache/tika/parser/vlm/AbstractVLMParser.java | 107 ++--- .../apache/tika/parser/vlm/ClaudeVLMParser.java | 21 +- .../apache/tika/parser/vlm/GeminiVLMParser.java | 14 +- .../tika/parser/vlm/MarkdownToXHTMLEmitter.java | 0 .../apache/tika/parser/vlm/OpenAIVLMParser.java | 19 +- .../org/apache/tika/parser/vlm/VLMOCRConfig.java | 0 .../tika/parser/vlm/ClaudeVLMParserTest.java | 62 ++- .../tika/parser/vlm/GeminiVLMParserTest.java | 64 ++- .../parser/vlm/MarkdownToXHTMLEmitterTest.java | 0 .../tika/parser/vlm/OpenAIVLMParserTest.java | 65 ++- .../apache/tika/parser/ocr/TesseractOCRParser.java | 6 +- .../tika/parser/ocr/TesseractOCRParserTest.java | 4 +- .../apache/tika/parser/strings/StringsParser.java | 11 +- .../tika-parsers-standard-package/pom.xml | 6 + .../tika/config/TikaEncodingDetectorTest.java | 49 +- .../org/apache/tika/parser/pdf/PDFParserTest.java | 32 +- ...KA-4671-exclude-charsoup-encoding-detector.json | 11 + .../testArabicMisleadingCharset.html | 11 + .../org/apache/tika/async/cli/PluginsWriter.java | 15 +- .../apache/tika/async/cli/AsyncProcessorTest.java | 4 + .../test/resources/configs/config-template.json | 6 +- tika-pipes/tika-pipes-api/pom.xml | 19 + .../java/org/apache/tika/pipes/api/ParseMode.java | 3 + .../tika/pipes/core/PerClientServerManager.java | 20 +- .../org/apache/tika/pipes/core/PipesClient.java | 194 +++----- .../org/apache/tika/pipes/core/PipesConfig.java | 15 - .../tika/pipes/core/config/ConfigMerger.java | 22 +- .../tika/pipes/core/config/ConfigOverrides.java | 39 +- .../tika/pipes/core/protocol/PipesMessage.java | 173 +++++++ .../tika/pipes/core/protocol/PipesMessageType.java | 96 ++++ .../core/protocol/ProtocolDesyncException.java} | 20 +- .../core/protocol/ShutDownReceivedException.java} | 24 +- .../serialization/FetchEmitTupleDeserializer.java | 8 - .../tika/pipes/core/server/ConnectionHandler.java | 299 +++++------- .../tika/pipes/core/server/ParseHandler.java | 2 + .../apache/tika/pipes/core/server/PipesServer.java | 389 ++++++---------- .../tika/pipes/core/server/ServerProtocolIO.java | 133 ++++++ .../tika/pipes/core/config/ConfigMergerTest.java | 11 +- .../tika/pipes/core/protocol/PipesMessageTest.java | 202 ++++++++ .../apache/tika/pipes/fork/PipesForkParser.java | 6 +- .../tika/pipes/fork/PipesForkParserConfig.java | 22 +- .../tika/pipes/fork/PipesForkParserTest.java | 52 +-- .../apache/tika/pipes/core/CrashingDetector.java | 2 + .../apache/tika/pipes/core/PipesClientTest.java | 91 +++- .../resources/configs/tika-config-bad-class.json | 6 +- .../configs/tika-config-bad-java-path.json | 6 +- .../configs/tika-config-bad-jvm-args.json | 6 +- .../test/resources/configs/tika-config-basic.json | 8 +- .../configs/tika-config-crashing-detector.json | 8 +- .../resources/configs/tika-config-emit-all.json | 6 +- .../resources/configs/tika-config-passback.json | 8 +- .../configs/tika-config-shared-server.json | 8 +- .../configs/tika-config-timeout-lt-heartbeat.json | 6 +- .../resources/configs/tika-config-truncate.json | 4 +- .../resources/configs/tika-config-uppercasing.json | 8 +- .../configs/tika-config-write-limiter.json | 4 +- .../tika-pipes-google-drive/pom.xml | 2 +- .../tika-pipes-plugins/tika-pipes-http/pom.xml | 69 ++- .../test/resources/configs/tika-config-http.json | 2 +- .../tika/config/loader/ComponentInstantiator.java | 157 ++++++- .../org/apache/tika/config/loader/TikaLoader.java | 8 +- .../config/loader/TikaObjectMapperFactory.java | 29 ++ .../tika/serialization/ComponentNameResolver.java | 104 ++++- .../tika/serialization/ParseContextUtils.java | 36 +- .../org/apache/tika/serialization/TikaModule.java | 238 +--------- .../serdes/ParseContextDeserializer.java | 47 +- .../serdes/ParseContextSerializer.java | 42 +- .../java/org/apache/tika/config/AllLimitsTest.java | 11 +- .../org/apache/tika/config/TimeoutLimitsTest.java | 55 ++- .../tika/config/loader/ConfigLoaderTest.java | 24 +- .../serialization/RoundTripSerializationTest.java | 38 +- .../TestParseContextSerialization.java | 49 +- .../test/resources/configs/all-limits-test.json | 3 +- .../test/resources/configs/test-config-loader.json | 2 +- .../resources/configs/test-partial-config.json | 2 +- .../resources/configs/timeout-limits-test.json | 3 +- .../apache/tika/server/core/TikaServerProcess.java | 4 +- .../org/apache/tika/server/core/CXFTestBase.java | 8 +- .../core/TikaServerPipesIntegrationTest.java | 2 +- .../resources/configs/cxf-test-base-template.json | 8 +- .../configs/cxf-unpack-test-template.json | 4 +- .../configs/tika-config-server-basic.json | 6 +- .../configs/tika-config-server-emitter.json | 6 +- .../tika-config-server-fetcher-template.json | 6 +- .../tika-config-server-fetchers-emitters.json | 6 +- .../configs/tika-config-server-pipes-basic.json | 6 +- .../tika-config-server-tls-one-way-template.json | 6 +- .../tika-config-server-tls-two-way-template.json | 6 +- .../resources/configs/tika-config-server-tls.json | 6 +- .../test/resources/configs/tika-config-server.json | 6 +- .../configs/tika-config-timeout-100ms.json | 4 +- .../configs/tika-config-with-timeout.json | 4 +- tika-server/tika-server-standard/pom.xml | 12 + .../resources/configs/cxf-test-base-template.json | 8 +- .../configs/tika-config-for-server-tests.json | 6 +- .../test/resources/configs/tika-config-json.json | 6 +- .../tika-config-langdetect-opennlp-filter.json | 6 +- .../tika-config-langdetect-optimaize-filter.json | 6 +- 188 files changed, 10386 insertions(+), 1743 deletions(-)
