(tika) 03/03: split strides into separate model space

tallison Mon, 13 Apr 2026 10:40:24 -0700

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch charset-detection-improvements
in repository https://gitbox.apache.org/repos/asf/tika.git


commit 72a8b10884870353b65534f870d7232b48bdeddc
Author: tallison <[email protected]>
AuthorDate: Mon Apr 13 13:39:29 2026 -0400

    split strides into separate model space
---
 .../ml/chardetect/MojibusterEncodingDetector.java  |  59 ++++++++--
 .../ml/chardetect/StructuralEncodingRules.java     | 131 +++++++++++++++++++++
 .../chardetect/SparseLatinVcardRegressionTest.java | 116 ++++++++++++++++++
 3 files changed, 299 insertions(+), 7 deletions(-)

diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
index 69cbdc9163..e14d4c6b84 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
@@ -117,10 +117,6 @@ public class MojibusterEncodingDetector implements EncodingDetector {
          * the first disagreeing high byte.  Zero cost for probes whose top
          * candidate isn't Latin-family (CJK, UTF-*, EBCDIC, Cyrillic,
          * Arabic, Greek, Hebrew).
-         *
-         * <p>Narrow by design — see {@code charset-detection.md} for the
-         * full options discussion (generalized candidate expansion and
-         * per-family canonicals were considered and rejected for now).</p>
          */
         LATIN_FALLBACK_WIN1252
     }
@@ -163,8 +159,12 @@ public class MojibusterEncodingDetector implements EncodingDetector {
         LABEL_TO_JAVA_NAME = Collections.unmodifiableMap(m);
     }
 
-    /** Default number of bytes read from the stream for detection. */
-    public static final int MAX_PROBE_BYTES = 4096;
+    /**
+     * Default number of bytes read from the stream for detection.
+     * Set generously so HTML/XML probes reach body text past
+     * ASCII-heavy head / script sections.
+     */
+    public static final int MAX_PROBE_BYTES = 32768;
 
     /**
      * JSON-deserializable configuration for {@link MojibusterEncodingDetector}.
@@ -420,8 +420,19 @@ public class MojibusterEncodingDetector implements EncodingDetector {
         boolean excludeUtf8 = gates
                 && StructuralEncodingRules.checkUtf8(probe) == StructuralEncodingRules.Utf8Result.NOT_UTF8;
 
+        // UTF-16 structural gate: stride-2 bigram features can misfire on
+        // non-UTF-16 probes with scattered nulls (e.g. Greek plaintext with
+        // 0.3% nulls scoring as UTF-16-LE). Real UTF-16 of any script has a
+        // concentrated byte column paired with a diverse one; scattered nulls
+        // produce ~balanced column diversity. Mask UTF-16 labels when the
+        // column-asymmetry test fails.
+        boolean utf16Plausible = !gates
+                || StructuralEncodingRules.has2ByteColumnAsymmetry(probe);
+        boolean excludeUtf16Be = wideResult.invalidUtf16Be || !utf16Plausible;
+        boolean excludeUtf16Le = wideResult.invalidUtf16Le || !utf16Plausible;
+
         List<EncodingResult> results = runModel(probe, excludeUtf8,
-                wideResult.invalidUtf16Be, wideResult.invalidUtf16Le, topN);
+                excludeUtf16Be, excludeUtf16Le, topN);
 
         // If the model had no evidence (probe too short or all tokens filtered), fall back to
         // windows-1252 at very low confidence rather than returning empty and letting
@@ -438,6 +449,17 @@ public class MojibusterEncodingDetector implements EncodingDetector {
         int[] features = extractor.extract(probe);
         float[] logits = model.predictLogits(features);
 
+        // EBCDIC gate: if the probe lacks the EBCDIC word-separator pattern
+        // (0x40 dominant over 0x20), it cannot be any EBCDIC variant. The
+        // statistical model can produce very large logits for EBCDIC labels
+        // on predominantly-ASCII probes whose n-grams happen to align with
+        // training features (observed with 99%-ASCII vCards mis-scored at
+        // IBM424 logit 55 vs windows-1252 logit 26). Mask those labels out
+        // before ranking so downstream arbitration sees only plausible
+        // candidates.
+        boolean excludeEbcdic = enabledRules.contains(Rule.STRUCTURAL_GATES)
+                && !StructuralEncodingRules.isEbcdicLikely(probe);
+
         for (int i = 0; i < logits.length; i++) {
             String lbl = model.getLabel(i);
             if (excludeUtf8 && "UTF-8".equalsIgnoreCase(lbl)) {
@@ -449,6 +471,9 @@ public class MojibusterEncodingDetector implements EncodingDetector {
             if (excludeUtf16Le && lbl.equalsIgnoreCase("UTF-16-LE")) {
                 logits[i] = Float.NEGATIVE_INFINITY;
             }
+            if (excludeEbcdic && isEbcdicLabel(lbl)) {
+                logits[i] = Float.NEGATIVE_INFINITY;
+            }
         }
 
         List<EncodingResult> results = selectByLogitGap(model, logits, topN);
@@ -766,6 +791,26 @@ public class MojibusterEncodingDetector implements EncodingDetector {
         }
     }
 
+    /**
+     * Exact, case-sensitive test for the EBCDIC model labels gated by {@link
+     * StructuralEncodingRules#isEbcdicLikely(byte[])}; {@code null} is not EBCDIC.
+     *
+     * <p>Note: {@code IBM850}, {@code IBM852}, {@code IBM855}, {@code IBM866},
+     * and {@code IBM437} are DOS/OEM code pages, <em>not</em> EBCDIC — they
+     * use {@code 0x20} for space like ASCII and are therefore not gated.</p>
+     */
+    private static boolean isEbcdicLabel(String label) {
+        if (label == null) {
+            return false;
+        }
+        return label.equals("IBM420-ltr") || label.equals("IBM420-rtl")
+                || label.equals("IBM420")
+                || label.equals("IBM424-ltr") || label.equals("IBM424-rtl")
+                || label.equals("IBM424")
+                || label.equals("IBM500")
+                || label.equals("IBM1047");
+    }
+
     private static byte[] readProbe(TikaInputStream is, int maxBytes) throws IOException {
         is.mark(maxBytes);
         try {
diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java
index a7114527b0..beaffc7475 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java
@@ -202,6 +202,137 @@ public final class StructuralEncodingRules {
         return checkIbm424(bytes, 0, bytes.length);
     }
 
+    /**
+     * Returns {@code true} if the probe is plausibly EBCDIC based on the
+     * word-separator distribution.  In every EBCDIC variant (IBM420, IBM424,
+     * IBM500, IBM1047) the space character is {@code 0x40}, not {@code 0x20};
+     * a stretch of EBCDIC text therefore has {@code 0x40} as its single most
+     * common byte, at roughly 10–20% of the sample.  Conversely, any ASCII
+     * or ISO-8859-X / windows-12XX / DOS / Mac / CJK text uses {@code 0x20}
+     * (or {@code 0x09 / 0x0A}) as its whitespace and has {@code 0x40} only
+     * as the rare {@code @} character (typically less than 0.1% of bytes).
+     *
+     * <p>This is a <em>negative</em> gate: when it returns {@code false}, the
+     * probe cannot be any EBCDIC variant, and downstream scoring should
+     * exclude EBCDIC labels from consideration even if the statistical model
+     * ranks them highly.</p>
+     *
+     * <p>Threshold rationale: we require both (a) {@code 0x40} at least 3%
+     * of the sample and (b) {@code 0x40} at least 3&times; more frequent
+     * than {@code 0x20}.  Gate (b) alone is not sufficient because sparse
+     * binary content can have neither byte; gate (a) alone is not sufficient
+     * because some text formats (CSV with {@code @}-separated fields,
+     * e-mail address lists) can exceed 3% {@code 0x40} while clearly being
+     * ASCII-spaced.  Both gates together match real EBCDIC text reliably
+     * across IBM420/424/500/1047 variants.</p>
+     *
+     * @param bytes the probe to analyse
+     * @return {@code true} if the probe's whitespace distribution is
+     *         consistent with EBCDIC; {@code false} if it is clearly ASCII-spaced
+     */
+    public static boolean isEbcdicLikely(byte[] bytes) {
+        if (bytes == null || bytes.length < 8) {
+            return false;
+        }
+        int sample = Math.min(bytes.length, 4096);
+        int ebcdicSpace = 0;
+        int asciiSpace = 0;
+        int prev = -1;
+        for (int i = 0; i < sample; i++) {
+            int b = bytes[i] & 0xFF;
+            if (b == 0x40) {
+                // Guard against Shift_JIS trail bytes that happen to equal 0x40.
+                boolean isShiftJisTrail = (prev >= 0x81 && prev <= 0x9F)
+                        || (prev >= 0xE0 && prev <= 0xFC);
+                if (!isShiftJisTrail) {
+                    ebcdicSpace++;
+                }
+            } else if (b == 0x20) {
+                asciiSpace++;
+            }
+            prev = b;
+        }
+        return ebcdicSpace >= sample * 0.03 && ebcdicSpace > asciiSpace * 3;
+    }
+
+    /**
+     * Minimum probe length before {@link #has2ByteColumnAsymmetry} produces
+     * meaningful diversity counts.  Short probes or probes with limited
+     * vocabulary may have too few distinct byte values per column to compare
+     * reliably; on anything below this threshold we fall back to the pre-gate
+     * behaviour (model + {@link WideUnicodeDetector} positive signal).  Set
+     * above the size of typical short probes (a few hundred bytes) so real
+     * CJK UTF-16 text has room to diversify its high-byte column.
+     */
+    public static final int MIN_COLUMN_ASYMMETRY_PROBE = 2048;
+
+    /**
+     * Returns {@code true} if the probe's byte distribution across stride-2
+     * columns is sufficiently asymmetric to be plausible UTF-16 of some script.
+     *
+     * <p>Every UTF-16 variant has one byte column concentrated in a
+     * script-specific Unicode block prefix while the other column is diverse:
+     * UTF-16 Latin pairs to {@code (ascii, 0x00)} so one column is {@code 0x00}
+     * (1 value) vs ASCII range (~70 values); UTF-16 Cyrillic / Greek / Arabic /
+     * Hebrew pair to a single high-byte block prefix ({@code 0x04}, {@code 0x03},
+     * {@code 0x06}, {@code 0x05}); UTF-16 CJK Unified uses {@code 0x4E}-{@code 0x9F}
+     * (~80 distinct high bytes) against ~256 low bytes; Hangul uses
+     * {@code 0xAC}-{@code 0xD7} (~44 high bytes).</p>
+     *
+     * <p>Non-UTF-16 text — including scattered-null binaries and mixed-content
+     * files — has roughly balanced column diversity (both columns saturate
+     * near 256 distinct byte values on long probes).</p>
+     *
+     * <p>This is a <em>negative</em> gate: when it returns {@code false}, the
+     * probe cannot be any UTF-16 variant, and UTF-16 labels should be masked
+     * from model output even when the stride-2 bigram features score them
+     * highly (e.g. a Greek plaintext file with 0.36% scattered nulls being
+     * mis-scored as UTF-16-LE).</p>
+     *
+     * <p>A diversity ratio of 3× (more diverse column has at least 3× as many
+     * distinct values as the more concentrated column) admits all UTF-16
+     * variants including CJK (ratio ~3.2) while rejecting scattered-null
+     * false positives (ratio ~1:1).</p>
+     *
+     * <p>For probes shorter than {@link #MIN_COLUMN_ASYMMETRY_PROBE}, this
+     * method returns {@code true} conservatively — column counts from short
+     * samples are not statistically meaningful, so the caller should rely on
+     * {@link WideUnicodeDetector} positive signal and downstream CharSoup
+     * arbitration rather than masking.</p>
+     *
+     * @param bytes the probe to analyse
+     * @return {@code true} if the probe has UTF-16-compatible column asymmetry
+     *         (or is too short to judge); {@code false} if column diversity is
+     *         too balanced to be any UTF-16 variant
+     */
+    public static boolean has2ByteColumnAsymmetry(byte[] bytes) {
+        if (bytes == null || bytes.length < MIN_COLUMN_ASYMMETRY_PROBE) {
+            return true;
+        }
+        int sample = Math.min(bytes.length, 4096);
+        boolean[] evenSeen = new boolean[256];
+        boolean[] oddSeen = new boolean[256];
+        int evenDistinct = 0;
+        int oddDistinct = 0;
+        for (int i = 0; i < sample; i++) {
+            int v = bytes[i] & 0xFF;
+            if ((i & 1) == 0) {
+                if (!evenSeen[v]) {
+                    evenSeen[v] = true;
+                    evenDistinct++;
+                }
+            } else {
+                if (!oddSeen[v]) {
+                    oddSeen[v] = true;
+                    oddDistinct++;
+                }
+            }
+        }
+        int min = Math.min(evenDistinct, oddDistinct);
+        int max = Math.max(evenDistinct, oddDistinct);
+        return max >= min * 3;
+    }
+
     public static boolean checkIbm424(byte[] bytes, int offset, int length) {
         if (length < 8) {
             return false;
diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/SparseLatinVcardRegressionTest.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/SparseLatinVcardRegressionTest.java
new file mode 100644
index 0000000000..2d84959ca3
--- /dev/null
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/SparseLatinVcardRegressionTest.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.chardetect;
+
+import static org.junit.jupiter.api.Assertions.assertEquals;
+import static org.junit.jupiter.api.Assertions.assertFalse;
+
+import java.nio.charset.StandardCharsets;
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+import org.apache.tika.detect.EncodingResult;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.ParseContext;
+
+/**
+ * Regression test for the sparse-Latin vCard / config-file detection
+ * class.
+ *
+ * <p>Before the {@link StructuralEncodingRules#isEbcdicLikely(byte[])}
+ * gate and the {@link MojibusterEncodingDetector.Rule#LATIN_FALLBACK_WIN1252}
+ * post-rule, a predominantly-ASCII probe with a small number of
+ * Latin-supplement high bytes (e.g. a vCard containing a German
+ * business name) detected as {@code IBM424} (Hebrew EBCDIC) at 0.99
+ * confidence — producing complete mojibake with dice=0 vs the 3.x
+ * baseline.</p>
+ *
+ * <p>After the fixes, the same probe detects as {@code windows-1252},
+ * preserving content fidelity.</p>
+ */
+public class SparseLatinVcardRegressionTest {
+
+    /**
+     * End-to-end regression assertion: the synthetic sparse-Latin vCard
+     * must detect as {@code windows-1252}, not {@code IBM424} or a
+     * byte-equivalent {@code windows-1257 / windows-1254 / x-MacRoman}
+     * sibling.
+     */
+    @Test
+    public void sparseLatinVcardDetectsAsWindows1252() throws Exception {
+        byte[] probe = buildSparseLatinVcard();
+
+        MojibusterEncodingDetector detector = new MojibusterEncodingDetector();
+        try (TikaInputStream tis = TikaInputStream.get(probe)) {
+            List<EncodingResult> results = detector.detect(
+                    tis, new Metadata(), new ParseContext());
+            assertFalse(results.isEmpty(),
+                    "Detector must return at least one candidate");
+            assertEquals("windows-1252", results.get(0).getCharset().name(),
+                    "Sparse-Latin vCard must detect as windows-1252, not "
+                            + "IBM424 / windows-1257 / windows-1254 / x-MacRoman");
+        }
+    }
+
+    /**
+     * Synthetic vCard-shaped probe that reproduces the regression class.
+     *
+     * <p>Preserved byte statistics from the original failing file:
+     * <ul>
+     *   <li>Length in the 400-600 byte range (long-probe path).</li>
+     *   <li>Exactly 3 non-ASCII bytes, all {@code 0xE4} — 'ä' under
+     *       ISO-8859-1 / windows-1252 / windows-1257. The extreme-sparse
+     *       regime where the flat statistical model was overconfidently
+     *       wrong.</li>
+     *   <li>Zero C1 bytes ({@code 0x80–0x9F}) so ISO→Windows upgrade
+     *       does not fire.</li>
+     *   <li>LF line endings only (no CRLF) so CRLF→Windows upgrade
+     *       does not fire.</li>
+     *   <li>Zero {@code 0x40} bytes so the EBCDIC gate cleanly returns
+     *       {@code false}.</li>
+     * </ul>
+     *
+     * <p>Content is a fictitious German bakery at a fictitious address.
+     * No real business or person is represented.</p>
+     */
+    private static byte[] buildSparseLatinVcard() {
+        String vcard =
+                  "BEGIN:VCARD\n"
+                + "\t\t\t\t\tVERSION:3.0\n"
+                + "\t\t\t\t\tN:Example B\u00E4ckerei GmbH\n"
+                + "\t\t\t\t\tFN:Example B\u00E4ckerei GmbH\n"
+                + "\t\t\t\t\tORG:Example B\u00E4ckerei GmbH;\n"
+                + "\t\t\t\t\tPHOTO;VALUE=URL;TYPE=jpg:"
+                        + "https://example.com/images/logo.jpg\n"
+                + "\t\t\t\t\titem1.EMAIL;TYPE=PREF,INTERNET:\n"
+                + "\t\t\t\t\titem1.X-ABLabel:email\n"
+                + "\t\t\t\t\tTEL;TYPE=WORK,VOICE:\n"
+                + "\t\t\t\t\tTEL;TYPE=WORK,FAX:\n"
+                + "\t\t\t\t\titem2.ADR;TYPE=WORK:"
+                        + ";;Teststr. 1;Musterstadt;;12345;Germany;\n"
+                + "\t\t\t\t\titem2.X-ABADR:de\n"
+                + "\t\t\t\t\tLABEL;TYPE=WORK:Teststr. 1 Musterstadt, 12345\n"
+                + "\t\t\t\t\tURL;TYPE=PREF:\n"
+                + "\t\t\t\t\tREV:2026-04-12 12:00:00\n"
+                + "\t\t\t\t\tNOTE:Synthetic test fixture for charset "
+                        + "detector regression coverage\n"
+                + "\t\t\t\t\tEND:VCARD\n";
+        return vcard.getBytes(StandardCharsets.ISO_8859_1);
+    }
+}

(tika) 03/03: split strides into separate model space

Reply via email to