(tika) 01/06: ebcdic gate

tallison Mon, 13 Apr 2026 10:28:25 -0700

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


commit 155a62071491763c6a07d0d36efdfcd37ce28f2f
Author: tallison <[email protected]>
AuthorDate: Sun Apr 12 08:00:22 2026 -0400

    ebcdic gate
---
 .../ml/chardetect/MojibusterEncodingDetector.java  | 34 ++++++++++++++
 .../ml/chardetect/StructuralEncodingRules.java     | 53 ++++++++++++++++++++++
 2 files changed, 87 insertions(+)

diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
index 6f6590e38d..b232f884f0 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java
@@ -422,6 +422,17 @@ public class MojibusterEncodingDetector implements EncodingDetector {
         int[] features = extractor.extract(probe);
         float[] logits = model.predictLogits(features);
 
+        // EBCDIC gate: if the probe lacks the EBCDIC word-separator pattern
+        // (0x40 dominant over 0x20), it cannot be any EBCDIC variant. The
+        // statistical model can produce very large logits for EBCDIC labels
+        // on predominantly-ASCII probes whose n-grams happen to align with
+        // training features (observed with 99%-ASCII vCards mis-scored at
+        // IBM424 logit 55 vs windows-1252 logit 26). Mask those labels out
+        // before ranking so downstream arbitration sees only plausible
+        // candidates.
+        boolean excludeEbcdic = enabledRules.contains(Rule.STRUCTURAL_GATES)
+                && !StructuralEncodingRules.isEbcdicLikely(probe);
+
         for (int i = 0; i < logits.length; i++) {
             String lbl = model.getLabel(i);
             if (excludeUtf8 && "UTF-8".equalsIgnoreCase(lbl)) {
@@ -433,6 +444,9 @@ public class MojibusterEncodingDetector implements EncodingDetector {
             if (excludeUtf16Le && lbl.equalsIgnoreCase("UTF-16-LE")) {
                 logits[i] = Float.NEGATIVE_INFINITY;
             }
+            if (excludeEbcdic && isEbcdicLabel(lbl)) {
+                logits[i] = Float.NEGATIVE_INFINITY;
+            }
         }
 
         List<EncodingResult> results = selectByLogitGap(model, logits, topN);
@@ -706,6 +720,26 @@ public class MojibusterEncodingDetector implements EncodingDetector {
         }
     }
 
+    /**
+     * Reports whether a model label names a true EBCDIC variant, i.e. one
+     * that must be gated by
+     * {@link StructuralEncodingRules#isEbcdicLikely(byte[])}.
+     *
+     * <p>Note: {@code IBM850}, {@code IBM852}, {@code IBM855}, {@code IBM866},
+     * and {@code IBM437} are DOS/OEM code pages, <em>not</em> EBCDIC — they
+     * use {@code 0x20} for space like ASCII and are therefore not gated.</p>
+     *
+     * <p>Labels are compared case-insensitively, in line with the
+     * {@code UTF-8} and {@code UTF-16-LE} label checks performed where the
+     * logits are masked.</p>
+     *
+     * @param label the model label to test; may be {@code null}
+     * @return {@code true} for the IBM420, IBM424, IBM500 and IBM1047
+     *         labels (including the {@code -ltr} / {@code -rtl} forms);
+     *         {@code false} for {@code null} and every other label
+     */
+    private static boolean isEbcdicLabel(String label) {
+        if (label == null) {
+            return false;
+        }
+        return label.equalsIgnoreCase("IBM420-ltr")
+                || label.equalsIgnoreCase("IBM420-rtl")
+                || label.equalsIgnoreCase("IBM420")
+                || label.equalsIgnoreCase("IBM424-ltr")
+                || label.equalsIgnoreCase("IBM424-rtl")
+                || label.equalsIgnoreCase("IBM424")
+                || label.equalsIgnoreCase("IBM500")
+                || label.equalsIgnoreCase("IBM1047");
+    }
+
     private static byte[] readProbe(TikaInputStream is, int maxBytes) throws 
IOException {
         is.mark(maxBytes);
         try {
diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java
index a7114527b0..101e00484c 100644
--- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java
+++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java
@@ -202,6 +202,59 @@ public final class StructuralEncodingRules {
         return checkIbm424(bytes, 0, bytes.length);
     }
 
+    /**
+     * Returns {@code true} if the probe is plausibly EBCDIC based on the
+     * word-separator distribution.  In every EBCDIC variant (IBM420, IBM424,
+     * IBM500, IBM1047) the space character is {@code 0x40}, not {@code 0x20};
+     * a stretch of EBCDIC text therefore has {@code 0x40} as its single most
+     * common byte, at roughly 10–20% of the sample.  Conversely, any ASCII
+     * or ISO-8859-X / windows-12XX / DOS / Mac / CJK text uses {@code 0x20}
+     * (or {@code 0x09 / 0x0A}) as its whitespace and has {@code 0x40} only
+     * as the rare {@code @} character (typically less than 0.1% of bytes).
+     *
+     * <p>This is a <em>negative</em> gate: when it returns {@code false}, the
+     * probe cannot be any EBCDIC variant, and downstream scoring should
+     * exclude EBCDIC labels from consideration even if the statistical model
+     * ranks them highly.</p>
+     *
+     * <p>Threshold rationale: we require both (a) {@code 0x40} at least 3%
+     * of the sample and (b) {@code 0x40} at least 3&times; more frequent
+     * than {@code 0x20}.  Gate (b) alone is not sufficient because sparse
+     * binary content can have neither byte; gate (a) alone is not sufficient
+     * because some text formats (CSV with {@code @}-separated fields,
+     * e-mail address lists) can exceed 3% {@code 0x40} while clearly being
+     * ASCII-spaced.</p>
+     *
+     * <p>Shift_JIS handling: a {@code 0x40} that directly follows a byte in
+     * the Shift_JIS lead ranges ({@code 0x81-0x9F}, {@code 0xE0-0xFC}) may
+     * be a trail byte rather than a space, so the thresholds are first
+     * applied with those occurrences left out.  On code pages 500 and 1047
+     * the same ranges hold the letters {@code a-r} and {@code S-Z} and the
+     * digits, so genuine EBCDIC text whose words end in those characters
+     * would be under-counted.  Because a wrong {@code false} here rules out
+     * the correct answer, the thresholds are applied a second time to the
+     * raw {@code 0x40} count whenever {@code 0x40} is strictly the most
+     * frequent byte of the sample.  NOTE(review): the assumption that
+     * Shift_JIS text rarely has {@code 0x40} as its strict mode (each trail
+     * byte needs a lead byte, and lead bytes cluster on few values) should
+     * be confirmed against the evaluation corpus.</p>
+     *
+     * <p>Only the first 4096 bytes are examined; probes shorter than 8
+     * bytes are never considered EBCDIC.</p>
+     *
+     * @param bytes the probe to analyse; may be {@code null}
+     * @return {@code true} if the probe's whitespace distribution is
+     *         consistent with EBCDIC; {@code false} if it is not, or if
+     *         the probe is {@code null} or shorter than 8 bytes
+     */
+    public static boolean isEbcdicLikely(byte[] bytes) {
+        if (bytes == null || bytes.length < 8) {
+            return false;
+        }
+        int sample = Math.min(bytes.length, 4096);
+        int[] histogram = new int[256];
+        int afterShiftJisLead = 0;
+        int prev = -1;
+        for (int i = 0; i < sample; i++) {
+            int b = bytes[i] & 0xFF;
+            histogram[b]++;
+            if (b == 0x40) {
+                // A 0x40 right after a Shift_JIS lead byte may be a trail
+                // byte, not a space; track these separately.
+                boolean prevIsShiftJisLead = (prev >= 0x81 && prev <= 0x9F)
+                        || (prev >= 0xE0 && prev <= 0xFC);
+                if (prevIsShiftJisLead) {
+                    afterShiftJisLead++;
+                }
+            }
+            prev = b;
+        }
+        int rawEbcdicSpace = histogram[0x40];
+        int asciiSpace = histogram[0x20];
+
+        // Strict pass: ignore every 0x40 that could be a Shift_JIS trail.
+        int guardedEbcdicSpace = rawEbcdicSpace - afterShiftJisLead;
+        if (guardedEbcdicSpace >= sample * 0.03
+                && guardedEbcdicSpace > asciiSpace * 3) {
+            return true;
+        }
+
+        // Lenient pass: the guard also hides spaces that follow EBCDIC
+        // letters a-r, S-Z and digits.  Accept the raw count, but only
+        // when 0x40 is strictly the most frequent byte of the sample.
+        boolean rawPasses = rawEbcdicSpace >= sample * 0.03
+                && rawEbcdicSpace > asciiSpace * 3;
+        if (!rawPasses) {
+            return false;
+        }
+        for (int value = 0; value < histogram.length; value++) {
+            if (value != 0x40 && histogram[value] >= rawEbcdicSpace) {
+                return false;
+            }
+        }
+        return true;
+    }
+
     public static boolean checkIbm424(byte[] bytes, int offset, int length) {
         if (length < 8) {
             return false;

(tika) 01/06: ebcdic gate

Reply via email to