This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch chardet-work
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 5ba43d32cddedc63f78031edfc5ea5b92da1ae9a
Merge: fd8561403a 8d8f3440f7
Author: tballison <[email protected]>
AuthorDate: Fri Feb 27 08:45:30 2026 -0500

    Merge origin/main
    
    Made-with: Cursor

 .github/workflows/main-jdk17-build.yml             |   1 +
 .../main-jdk17-windows-build-multi-locale.yml      |   1 +
 .github/workflows/main-jdk17-windows-build.yml     |   1 +
 .github/workflows/main-jdk21-build.yml             |   1 +
 .github/workflows/main-jdk25-build.yml             |   1 +
 docs/modules/ROOT/examples/tess4j-basic.json       |  10 +
 docs/modules/ROOT/examples/tess4j-full.json        |  18 +
 docs/modules/ROOT/nav.adoc                         |   4 +
 .../pages/configuration/parsers/tess4j-parser.adoc | 282 +++++++++++
 .../pages/migration-to-4x/chunk-strategies.adoc    | 257 ++++++++++
 .../pages/migration-to-4x/design-notes-4x.adoc     |   2 +-
 .../inference-handler-requirements.adoc            | 282 +++++++++++
 docs/modules/ROOT/pages/pipes/index.adoc           |   1 +
 .../ROOT/pages/pipes/shared-server-mode.adoc       |  10 +-
 docs/modules/ROOT/pages/pipes/timeouts.adoc        | 170 +++++++
 tika-app/pom.xml                                   |  12 +
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |   4 +-
 .../test/resources/configs/config-template.json    |   6 +-
 .../apache/tika/config/TikaProgressTracker.java    |  82 ++++
 .../org/apache/tika/config/TikaTaskTimeout.java    |  76 ---
 .../java/org/apache/tika/config/TimeoutLimits.java | 112 +++--
 .../tika/detect/CompositeEncodingDetector.java     | 155 +++++-
 .../tika/detect/DefaultEncodingDetector.java       |  27 +-
 .../tika/detect/EncodingDetectorContext.java       | 105 +++++
 .../apache/tika/detect/MetaEncodingDetector.java   |  39 ++
 .../apache/tika/detect/WideUnicodeDetector.java    | 490 +++++++++++++++++++
 .../tika/language/detect/LanguageResult.java       |  28 ++
 .../apache/tika/metadata/TikaCoreProperties.java   |  20 +-
 .../java/org/apache/tika/parser/ParseContext.java  |  15 +-
 .../tika/parser/external2/ExternalParser.java      |   6 +-
 .../tika/sax/BasicContentHandlerFactory.java       |   7 +-
 .../org/apache/tika/sax/ContentHandlerFactory.java |  15 +
 .../tika/sax/RecursiveParserWrapperHandler.java    |   2 +
 .../tika/config/TikaProgressTrackerTest.java       | 103 ++++
 .../tika/detect/WideUnicodeDetectorTest.java       | 452 ++++++++++++++++++
 .../org/apache/tika/parser/ParseContextTest.java   | 104 +++++
 .../customocr/tika-config-inline.json              |   1 -
 .../customocr/tika-config-rendered.json            |   1 -
 .../sample-configs/grobid/tika-config.json         |   1 -
 .../tika-grpc/sample-configs/ner/tika-config.json  |  16 +-
 .../tika/example/PipesForkParserExample.java       |  10 +-
 .../src/test/resources/tika-pipes-test-config.json |   6 +-
 .../src/test/resources/kafka/plugins-template.json |   1 -
 .../resources/opensearch/plugins-template.json     |   3 +-
 .../opensearch/tika-config-opensearch.json         |   3 +-
 .../src/test/resources/s3/plugins-template.json    |   1 -
 .../src/test/resources/solr/plugins-template.json  |   3 +-
 .../src/test/resources/tika-config-solr-urls.json  |   1 -
 .../tika/langdetect/charsoup/CharSoupModel.java    |  18 +-
 .../charsoup/CharSoupEncodingDetector.java         | 186 ++++++++
 .../charsoup/CharSoupLanguageDetector.java         | 155 +++++-
 .../charsoup/CharSoupEncodingDetectorTest.java     | 206 ++++++++
 .../langdetect/charsoup/TextQualityDiagTest.java   | 141 ++++++
 tika-langdetect/tika-langdetect-optimaize/pom.xml  |   2 +-
 tika-parent/pom.xml                                |  26 +-
 tika-parsers/pom.xml                               |   2 +
 .../tika-http-jdk}/pom.xml                         |  58 ++-
 .../java/org/apache/tika/http/TikaHttpClient.java  | 150 ++++++
 .../org/apache/tika/http/TikaTestHttpServer.java   | 268 +++++++++++
 .../org/apache/tika/parser/gdal/GDALParser.java    |   6 +-
 tika-parsers/tika-parsers-ml/pom.xml               |   4 +-
 .../tika-parsers-ml/tika-inference}/pom.xml        |  60 ++-
 .../tika/inference/AbstractEmbeddingFilter.java    | 234 ++++++++++
 .../main/java/org/apache/tika/inference/Chunk.java |  88 ++++
 .../org/apache/tika/inference/ChunkSerializer.java | 229 +++++++++
 .../tika/inference/ImageEmbeddingConfig.java       | 149 ++++++
 .../org/apache/tika/inference/InferenceConfig.java | 260 +++++++++++
 .../org/apache/tika/inference/MarkdownChunker.java | 210 +++++++++
 .../tika/inference/OpenAIEmbeddingFilter.java      | 185 ++++++++
 .../tika/inference/OpenAIImageEmbeddingParser.java | 396 ++++++++++++++++
 .../apache/tika/inference/VectorSerializer.java    |  52 +++
 .../apache/tika/inference/locator/Locators.java    | 121 +++++
 .../tika/inference/locator/PaginatedLocator.java   |  56 +++
 .../tika/inference/locator/SpatialLocator.java     |  54 +++
 .../tika/inference/locator/TemporalLocator.java    |  34 +-
 .../apache/tika/inference/locator/TextLocator.java |  33 +-
 .../apache/tika/inference/ChunkSerializerTest.java | 183 ++++++++
 .../apache/tika/inference/MarkdownChunkerTest.java | 138 ++++++
 .../tika/inference/OpenAIEmbeddingFilterTest.java  | 256 ++++++++++
 .../inference/OpenAIImageEmbeddingParserTest.java  | 306 ++++++++++++
 .../tika/inference/VectorSerializerTest.java       |  58 +++
 .../apache/tika/parser/ner/NamedEntityParser.java  |   2 +
 .../src/test/resources/configs/tika-config.json    |   2 +-
 .../tika-parser-tess4j-module}/pom.xml             |  52 ++-
 .../tika/parser/ocr/tess4j/Tess4JConfig.java       | 355 ++++++++++++++
 .../tika/parser/ocr/tess4j/Tess4JParser.java       | 518 +++++++++++++++++++++
 .../tika/parser/ocr/tess4j/Tess4JConfigTest.java   | 140 ++++++
 .../tika/parser/ocr/tess4j/Tess4JParserTest.java   | 255 ++++++++++
 .../src/test/resources/test-documents/testOCR.jpg  | Bin 0 -> 3408 bytes
 .../pom.xml                                        |  53 +--
 .../apache/tika/parser/vlm/AbstractVLMParser.java  | 107 ++---
 .../apache/tika/parser/vlm/ClaudeVLMParser.java    |  21 +-
 .../apache/tika/parser/vlm/GeminiVLMParser.java    |  14 +-
 .../tika/parser/vlm/MarkdownToXHTMLEmitter.java    |   0
 .../apache/tika/parser/vlm/OpenAIVLMParser.java    |  19 +-
 .../org/apache/tika/parser/vlm/VLMOCRConfig.java   |   0
 .../tika/parser/vlm/ClaudeVLMParserTest.java       |  62 ++-
 .../tika/parser/vlm/GeminiVLMParserTest.java       |  64 ++-
 .../parser/vlm/MarkdownToXHTMLEmitterTest.java     |   0
 .../tika/parser/vlm/OpenAIVLMParserTest.java       |  65 ++-
 .../apache/tika/parser/ocr/TesseractOCRParser.java |   6 +-
 .../tika/parser/ocr/TesseractOCRParserTest.java    |   4 +-
 .../apache/tika/parser/strings/StringsParser.java  |  11 +-
 .../tika-parsers-standard-package/pom.xml          |   6 +
 .../tika/config/TikaEncodingDetectorTest.java      |  49 +-
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  32 +-
 ...KA-4671-exclude-charsoup-encoding-detector.json |  11 +
 .../testArabicMisleadingCharset.html               |  11 +
 .../org/apache/tika/async/cli/PluginsWriter.java   |  15 +-
 .../apache/tika/async/cli/AsyncProcessorTest.java  |   4 +
 .../test/resources/configs/config-template.json    |   6 +-
 tika-pipes/tika-pipes-api/pom.xml                  |  19 +
 .../java/org/apache/tika/pipes/api/ParseMode.java  |   3 +
 .../tika/pipes/core/PerClientServerManager.java    |  20 +-
 .../org/apache/tika/pipes/core/PipesClient.java    | 194 +++-----
 .../org/apache/tika/pipes/core/PipesConfig.java    |  15 -
 .../tika/pipes/core/config/ConfigMerger.java       |  22 +-
 .../tika/pipes/core/config/ConfigOverrides.java    |  39 +-
 .../tika/pipes/core/protocol/PipesMessage.java     | 173 +++++++
 .../tika/pipes/core/protocol/PipesMessageType.java |  96 ++++
 .../core/protocol/ProtocolDesyncException.java}    |  20 +-
 .../core/protocol/ShutDownReceivedException.java}  |  24 +-
 .../serialization/FetchEmitTupleDeserializer.java  |   8 -
 .../tika/pipes/core/server/ConnectionHandler.java  | 299 +++++-------
 .../tika/pipes/core/server/ParseHandler.java       |   2 +
 .../apache/tika/pipes/core/server/PipesServer.java | 389 ++++++----------
 .../tika/pipes/core/server/ServerProtocolIO.java   | 133 ++++++
 .../tika/pipes/core/config/ConfigMergerTest.java   |  11 +-
 .../tika/pipes/core/protocol/PipesMessageTest.java | 202 ++++++++
 .../apache/tika/pipes/fork/PipesForkParser.java    |   6 +-
 .../tika/pipes/fork/PipesForkParserConfig.java     |  22 +-
 .../tika/pipes/fork/PipesForkParserTest.java       |  52 +--
 .../apache/tika/pipes/core/CrashingDetector.java   |   2 +
 .../apache/tika/pipes/core/PipesClientTest.java    |  91 +++-
 .../resources/configs/tika-config-bad-class.json   |   6 +-
 .../configs/tika-config-bad-java-path.json         |   6 +-
 .../configs/tika-config-bad-jvm-args.json          |   6 +-
 .../test/resources/configs/tika-config-basic.json  |   8 +-
 .../configs/tika-config-crashing-detector.json     |   8 +-
 .../resources/configs/tika-config-emit-all.json    |   6 +-
 .../resources/configs/tika-config-passback.json    |   8 +-
 .../configs/tika-config-shared-server.json         |   8 +-
 .../configs/tika-config-timeout-lt-heartbeat.json  |   6 +-
 .../resources/configs/tika-config-truncate.json    |   4 +-
 .../resources/configs/tika-config-uppercasing.json |   8 +-
 .../configs/tika-config-write-limiter.json         |   4 +-
 .../tika-pipes-google-drive/pom.xml                |   2 +-
 .../tika-pipes-plugins/tika-pipes-http/pom.xml     |  69 ++-
 .../test/resources/configs/tika-config-http.json   |   2 +-
 .../tika/config/loader/ComponentInstantiator.java  | 157 ++++++-
 .../org/apache/tika/config/loader/TikaLoader.java  |   8 +-
 .../config/loader/TikaObjectMapperFactory.java     |  29 ++
 .../tika/serialization/ComponentNameResolver.java  | 104 ++++-
 .../tika/serialization/ParseContextUtils.java      |  36 +-
 .../org/apache/tika/serialization/TikaModule.java  | 238 +---------
 .../serdes/ParseContextDeserializer.java           |  47 +-
 .../serdes/ParseContextSerializer.java             |  42 +-
 .../java/org/apache/tika/config/AllLimitsTest.java |  11 +-
 .../org/apache/tika/config/TimeoutLimitsTest.java  |  55 ++-
 .../tika/config/loader/ConfigLoaderTest.java       |  24 +-
 .../serialization/RoundTripSerializationTest.java  |  38 +-
 .../TestParseContextSerialization.java             |  49 +-
 .../test/resources/configs/all-limits-test.json    |   3 +-
 .../test/resources/configs/test-config-loader.json |   2 +-
 .../resources/configs/test-partial-config.json     |   2 +-
 .../resources/configs/timeout-limits-test.json     |   3 +-
 .../apache/tika/server/core/TikaServerProcess.java |   4 +-
 .../org/apache/tika/server/core/CXFTestBase.java   |   8 +-
 .../core/TikaServerPipesIntegrationTest.java       |   2 +-
 .../resources/configs/cxf-test-base-template.json  |   8 +-
 .../configs/cxf-unpack-test-template.json          |   4 +-
 .../configs/tika-config-server-basic.json          |   6 +-
 .../configs/tika-config-server-emitter.json        |   6 +-
 .../tika-config-server-fetcher-template.json       |   6 +-
 .../tika-config-server-fetchers-emitters.json      |   6 +-
 .../configs/tika-config-server-pipes-basic.json    |   6 +-
 .../tika-config-server-tls-one-way-template.json   |   6 +-
 .../tika-config-server-tls-two-way-template.json   |   6 +-
 .../resources/configs/tika-config-server-tls.json  |   6 +-
 .../test/resources/configs/tika-config-server.json |   6 +-
 .../configs/tika-config-timeout-100ms.json         |   4 +-
 .../configs/tika-config-with-timeout.json          |   4 +-
 tika-server/tika-server-standard/pom.xml           |  12 +
 .../resources/configs/cxf-test-base-template.json  |   8 +-
 .../configs/tika-config-for-server-tests.json      |   6 +-
 .../test/resources/configs/tika-config-json.json   |   6 +-
 .../tika-config-langdetect-opennlp-filter.json     |   6 +-
 .../tika-config-langdetect-optimaize-filter.json   |   6 +-
 188 files changed, 10386 insertions(+), 1743 deletions(-)

Reply via email to