(tika) 05/06: gate UTF-16 model output on 2-byte column-diversity asymmetry

tallison Mon, 13 Apr 2026 10:28:29 -0700

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


commit d28c9b1f7107192beb3f6ec2fcf0f5bbf5a03502
Author: tallison <[email protected]>
AuthorDate: Sun Apr 12 16:36:48 2026 -0400

    gate UTF-16 model output on 2-byte column-diversity asymmetry
---
 .../ml/chardetect/MojibusterEncodingDetector.java  | 13 +++-
 .../ml/chardetect/StructuralEncodingRules.java     | 78 ++++++++++++++++++++++
 2 files changed, 90 insertions(+), 1 deletion(-)

diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
index a0c56393c8..3a253f42d3 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
@@ -430,8 +430,19 @@ public class MojibusterEncodingDetector implements EncodingDetector {
         boolean excludeUtf8 = gates
                 && StructuralEncodingRules.checkUtf8(probe) == StructuralEncodingRules.Utf8Result.NOT_UTF8;
 
+        // UTF-16 structural gate: stride-2 bigram features can misfire on
+        // non-UTF-16 probes with scattered nulls (e.g. Greek plaintext with
+        // 0.3% nulls scoring as UTF-16-LE). Real UTF-16 of any script has a
+        // concentrated byte column paired with a diverse one; scattered nulls
+        // produce ~balanced column diversity. Mask UTF-16 labels when the
+        // column-asymmetry test fails.
+        boolean utf16Plausible = !gates
+                || StructuralEncodingRules.has2ByteColumnAsymmetry(probe);
+        boolean excludeUtf16Be = wideResult.invalidUtf16Be || !utf16Plausible;
+        boolean excludeUtf16Le = wideResult.invalidUtf16Le || !utf16Plausible;
+
         List<EncodingResult> results = runModel(probe, excludeUtf8,
-                wideResult.invalidUtf16Be, wideResult.invalidUtf16Le, topN);
+                excludeUtf16Be, excludeUtf16Le, topN);
 
         // If the model had no evidence (probe too short or all tokens filtered), fall back to
         // windows-1252 at very low confidence rather than returning empty and letting
diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java
index 101e00484c..beaffc7475 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java
@@ -255,6 +255,84 @@ public final class StructuralEncodingRules {
         return ebcdicSpace >= sample * 0.03 && ebcdicSpace > asciiSpace * 3;
     }
 
+    /**
+     * Minimum probe length before {@link #has2ByteColumnAsymmetry} produces
+     * meaningful diversity counts.  Short probes or probes with limited
+     * vocabulary may have too few distinct byte values per column to compare
+     * reliably; on anything below this threshold we fall back to the pre-gate
+     * behaviour (model + {@link WideUnicodeDetector} positive signal).  Set
+     * above the size of typical short probes (a few hundred bytes) so real
+     * CJK UTF-16 text has room to diversify its high-byte column.
+     */
+    public static final int MIN_COLUMN_ASYMMETRY_PROBE = 2048;
+
+    /**
+     * Returns {@code true} if the probe's byte distribution across stride-2
+     * columns is sufficiently asymmetric to be plausible UTF-16 of some script.
+     *
+     * <p>Every UTF-16 variant has one byte column concentrated in a
+     * script-specific Unicode block prefix while the other column is diverse:
+     * UTF-16 Latin pairs to {@code (ascii, 0x00)} so one column is {@code 0x00}
+     * (1 value) vs ASCII range (~70 values); UTF-16 Cyrillic / Greek / Arabic /
+     * Hebrew pair to a single high-byte block prefix ({@code 0x04}, {@code 0x03},
+     * {@code 0x06}, {@code 0x05}); UTF-16 CJK Unified uses {@code 0x4E}-{@code 0x9F}
+     * (~80 distinct high bytes) against ~256 low bytes; Hangul uses
+     * {@code 0xAC}-{@code 0xD7} (~44 high bytes).</p>
+     *
+     * <p>Non-UTF-16 text — including scattered-null binaries and mixed-content
+     * files — has roughly balanced column diversity (both columns saturate
+     * near 256 distinct byte values on long probes).</p>
+     *
+     * <p>This is a <em>negative</em> gate: when it returns {@code false}, the
+     * probe cannot be any UTF-16 variant, and UTF-16 labels should be masked
+     * from model output even when the stride-2 bigram features score them
+     * highly (e.g. a Greek plaintext file with 0.36% scattered nulls being
+     * mis-scored as UTF-16-LE).</p>
+     *
+     * <p>A diversity ratio of 3× (more diverse column has at least 3× as many
+     * distinct values as the more concentrated column) admits all UTF-16
+     * variants including CJK (ratio ~3.2) while rejecting scattered-null
+     * false positives (ratio ~1:1).</p>
+     *
+     * <p>For probes shorter than {@link #MIN_COLUMN_ASYMMETRY_PROBE}, this
+     * method returns {@code true} conservatively — column counts from short
+     * samples are not statistically meaningful, so the caller should rely on
+     * {@link WideUnicodeDetector} positive signal and downstream CharSoup
+     * arbitration rather than masking.</p>
+     *
+     * @param bytes the probe to analyse
+     * @return {@code true} if the probe has UTF-16-compatible column asymmetry
+     *         (or is too short to judge); {@code false} if column diversity is
+     *         too balanced to be any UTF-16 variant
+     */
+    public static boolean has2ByteColumnAsymmetry(byte[] bytes) {
+        if (bytes == null || bytes.length < MIN_COLUMN_ASYMMETRY_PROBE) {
+            return true;
+        }
+        int sample = Math.min(bytes.length, 4096);
+        boolean[] evenSeen = new boolean[256];
+        boolean[] oddSeen = new boolean[256];
+        int evenDistinct = 0;
+        int oddDistinct = 0;
+        for (int i = 0; i < sample; i++) {
+            int v = bytes[i] & 0xFF;
+            if ((i & 1) == 0) {
+                if (!evenSeen[v]) {
+                    evenSeen[v] = true;
+                    evenDistinct++;
+                }
+            } else {
+                if (!oddSeen[v]) {
+                    oddSeen[v] = true;
+                    oddDistinct++;
+                }
+            }
+        }
+        int min = Math.min(evenDistinct, oddDistinct);
+        int max = Math.max(evenDistinct, oddDistinct);
+        return max >= min * 3;
+    }
+
     public static boolean checkIbm424(byte[] bytes, int offset, int length) {
         if (length < 8) {
             return false;

(tika) 05/06: gate UTF-16 model output on 2-byte column-diversity asymmetry

Reply via email to