This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit d370f5424711096db2eaa7ad91355fe4f9499ee3
Author: tallison <[email protected]>
AuthorDate: Sun Apr 12 16:26:14 2026 -0400

    bump limit to something realistic
---
 .../org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
index 9cc1f7cf32..a0c56393c8 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
@@ -169,8 +169,12 @@ public class MojibusterEncodingDetector implements EncodingDetector {
         LABEL_TO_JAVA_NAME = Collections.unmodifiableMap(m);
     }
 
-    /** Default number of bytes read from the stream for detection. */
-    public static final int MAX_PROBE_BYTES = 4096;
+    /**
+     * Default number of bytes read from the stream for detection.
+     * Set generously so HTML/XML probes reach body text past
+     * ASCII-heavy head / script sections.
+     */
+    public static final int MAX_PROBE_BYTES = 32768;
 
     /**
      * JSON-deserializable configuration for {@link MojibusterEncodingDetector}.
