This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d370f5424711096db2eaa7ad91355fe4f9499ee3
Author: tallison <[email protected]>
AuthorDate: Sun Apr 12 16:26:14 2026 -0400

    bump limit to something realistic
---
 .../org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
index 9cc1f7cf32..a0c56393c8 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
@@ -169,8 +169,12 @@ public class MojibusterEncodingDetector implements EncodingDetector {
         LABEL_TO_JAVA_NAME = Collections.unmodifiableMap(m);
     }
 
-    /** Default number of bytes read from the stream for detection. */
-    public static final int MAX_PROBE_BYTES = 4096;
+    /**
+     * Default number of bytes read from the stream for detection.
+     * Set generously so HTML/XML probes reach body text past
+     * ASCII-heavy head / script sections.
+     */
+    public static final int MAX_PROBE_BYTES = 32768;
 
     /**
      * JSON-deserializable configuration for {@link MojibusterEncodingDetector}.

Reply via email to