This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit d28c9b1f7107192beb3f6ec2fcf0f5bbf5a03502 Author: tallison <[email protected]> AuthorDate: Sun Apr 12 16:36:48 2026 -0400 gate UTF-16 model output on 2-byte column-diversity asymmetry --- .../ml/chardetect/MojibusterEncodingDetector.java | 13 +++- .../ml/chardetect/StructuralEncodingRules.java | 78 ++++++++++++++++++++++ 2 files changed, 90 insertions(+), 1 deletion(-) diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java index a0c56393c8..3a253f42d3 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java @@ -430,8 +430,19 @@ public class MojibusterEncodingDetector implements EncodingDetector { boolean excludeUtf8 = gates && StructuralEncodingRules.checkUtf8(probe) == StructuralEncodingRules.Utf8Result.NOT_UTF8; + // UTF-16 structural gate: stride-2 bigram features can misfire on + // non-UTF-16 probes with scattered nulls (e.g. Greek plaintext with + // 0.3% nulls scoring as UTF-16-LE). Real UTF-16 of any script has a + // concentrated byte column paired with a diverse one; scattered nulls + // produce ~balanced column diversity. Mask UTF-16 labels when the + // column-asymmetry test fails. + boolean utf16Plausible = !gates + || StructuralEncodingRules.has2ByteColumnAsymmetry(probe); + boolean excludeUtf16Be = wideResult.invalidUtf16Be || !utf16Plausible; + boolean excludeUtf16Le = wideResult.invalidUtf16Le || !utf16Plausible; + List<EncodingResult> results = runModel(probe, excludeUtf8, - wideResult.invalidUtf16Be, wideResult.invalidUtf16Le, topN); + excludeUtf16Be, excludeUtf16Le, topN); // If the model had no evidence (probe too short or all tokens filtered), fall back to // windows-1252 at very low confidence rather than returning empty and letting diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java index 101e00484c..beaffc7475 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java @@ -255,6 +255,84 @@ public final class StructuralEncodingRules { return ebcdicSpace >= sample * 0.03 && ebcdicSpace > asciiSpace * 3; } + /** + * Minimum probe length before {@link #has2ByteColumnAsymmetry} produces + * meaningful diversity counts. Short probes or probes with limited + * vocabulary may have too few distinct byte values per column to compare + * reliably; on anything below this threshold we fall back to the pre-gate + * behaviour (model + {@link WideUnicodeDetector} positive signal). Set + * above the size of typical short probes (a few hundred bytes) so real + * CJK UTF-16 text has room to diversify its high-byte column. + */ + public static final int MIN_COLUMN_ASYMMETRY_PROBE = 2048; + + /** + * Returns {@code true} if the probe's byte distribution across stride-2 + * columns is sufficiently asymmetric to be plausible UTF-16 of some script. + * + * <p>Every UTF-16 variant has one byte column concentrated in a + * script-specific Unicode block prefix while the other column is diverse: + * UTF-16 Latin pairs to {@code (ascii, 0x00)} so one column is {@code 0x00} + * (1 value) vs ASCII range (~70 values); UTF-16 Cyrillic / Greek / Arabic / + * Hebrew pair to a single high-byte block prefix ({@code 0x04}, {@code 0x03}, + * {@code 0x06}, {@code 0x05}); UTF-16 CJK Unified uses {@code 0x4E}-{@code 0x9F} + * (~80 distinct high bytes) against ~256 low bytes; Hangul uses + * {@code 0xAC}-{@code 0xD7} (~44 high bytes).</p> + * + * <p>Non-UTF-16 text — including scattered-null binaries and mixed-content + * files — has roughly balanced column diversity (both columns saturate + * near 256 distinct byte values on long probes).</p> + * + * <p>This is a <em>negative</em> gate: when it returns {@code false}, the + * probe cannot be any UTF-16 variant, and UTF-16 labels should be masked + * from model output even when the stride-2 bigram features score them + * highly (e.g. a Greek plaintext file with 0.36% scattered nulls being + * mis-scored as UTF-16-LE).</p> + * + * <p>A diversity ratio of 3× (more diverse column has at least 3× as many + * distinct values as the more concentrated column) admits all UTF-16 + * variants including CJK (ratio ~3.2) while rejecting scattered-null + * false positives (ratio ~1:1).</p> + * + * <p>For probes shorter than {@link #MIN_COLUMN_ASYMMETRY_PROBE}, this + * method returns {@code true} conservatively — column counts from short + * samples are not statistically meaningful, so the caller should rely on + * {@link WideUnicodeDetector} positive signal and downstream CharSoup + * arbitration rather than masking.</p> + * + * @param bytes the probe to analyse + * @return {@code true} if the probe has UTF-16-compatible column asymmetry + * (or is too short to judge); {@code false} if column diversity is + * too balanced to be any UTF-16 variant + */ + public static boolean has2ByteColumnAsymmetry(byte[] bytes) { + if (bytes == null || bytes.length < MIN_COLUMN_ASYMMETRY_PROBE) { + return true; + } + int sample = Math.min(bytes.length, 4096); + boolean[] evenSeen = new boolean[256]; + boolean[] oddSeen = new boolean[256]; + int evenDistinct = 0; + int oddDistinct = 0; + for (int i = 0; i < sample; i++) { + int v = bytes[i] & 0xFF; + if ((i & 1) == 0) { + if (!evenSeen[v]) { + evenSeen[v] = true; + evenDistinct++; + } + } else { + if (!oddSeen[v]) { + oddSeen[v] = true; + oddDistinct++; + } + } + } + int min = Math.min(evenDistinct, oddDistinct); + int max = Math.max(evenDistinct, oddDistinct); + return max >= min * 3; + } + public static boolean checkIbm424(byte[] bytes, int offset, int length) { if (length < 8) { return false;
