This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit 155a62071491763c6a07d0d36efdfcd37ce28f2f Author: tallison <[email protected]> AuthorDate: Sun Apr 12 08:00:22 2026 -0400 ebcdic gate --- .../ml/chardetect/MojibusterEncodingDetector.java | 34 ++++++++++++++ .../ml/chardetect/StructuralEncodingRules.java | 53 ++++++++++++++++++++++ 2 files changed, 87 insertions(+) diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java index 6f6590e38d..b232f884f0 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java @@ -422,6 +422,17 @@ public class MojibusterEncodingDetector implements EncodingDetector { int[] features = extractor.extract(probe); float[] logits = model.predictLogits(features); + // EBCDIC gate: if the probe lacks the EBCDIC word-separator pattern + // (0x40 dominant over 0x20), it cannot be any EBCDIC variant. The + // statistical model can produce very large logits for EBCDIC labels + // on predominantly-ASCII probes whose n-grams happen to align with + // training features (observed with 99%-ASCII vCards mis-scored at + // IBM424 logit 55 vs windows-1252 logit 26). Mask those labels out + // before ranking so downstream arbitration sees only plausible + // candidates. 
+ boolean excludeEbcdic = enabledRules.contains(Rule.STRUCTURAL_GATES) + && !StructuralEncodingRules.isEbcdicLikely(probe); + for (int i = 0; i < logits.length; i++) { String lbl = model.getLabel(i); if (excludeUtf8 && "UTF-8".equalsIgnoreCase(lbl)) { @@ -433,6 +444,9 @@ public class MojibusterEncodingDetector implements EncodingDetector { if (excludeUtf16Le && lbl.equalsIgnoreCase("UTF-16-LE")) { logits[i] = Float.NEGATIVE_INFINITY; } + if (excludeEbcdic && isEbcdicLabel(lbl)) { + logits[i] = Float.NEGATIVE_INFINITY; + } } List<EncodingResult> results = selectByLogitGap(model, logits, topN); @@ -706,6 +720,26 @@ public class MojibusterEncodingDetector implements EncodingDetector { } } + /** + * True EBCDIC variants that must be gated by {@link + * StructuralEncodingRules#isEbcdicLikely(byte[])}. + * + * <p>Note: {@code IBM850}, {@code IBM852}, {@code IBM855}, {@code IBM866}, + * and {@code IBM437} are DOS/OEM code pages, <em>not</em> EBCDIC — they + * use {@code 0x20} for space like ASCII and are therefore not gated.</p> + */ + private static boolean isEbcdicLabel(String label) { + if (label == null) { + return false; + } + return label.equals("IBM420-ltr") || label.equals("IBM420-rtl") + || label.equals("IBM420") + || label.equals("IBM424-ltr") || label.equals("IBM424-rtl") + || label.equals("IBM424") + || label.equals("IBM500") + || label.equals("IBM1047"); + } + private static byte[] readProbe(TikaInputStream is, int maxBytes) throws IOException { is.mark(maxBytes); try { diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java index a7114527b0..101e00484c 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java +++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java @@ -202,6 +202,59 @@ public final class StructuralEncodingRules { return checkIbm424(bytes, 0, bytes.length); } + /** + * Returns {@code true} if the probe is plausibly EBCDIC based on the + * word-separator distribution. In every EBCDIC variant (IBM420, IBM424, + * IBM500, IBM1047) the space character is {@code 0x40}, not {@code 0x20}; + * a stretch of EBCDIC text therefore has {@code 0x40} as its single most + * common byte, at roughly 10–20% of the sample. Conversely, any ASCII + * or ISO-8859-X / windows-12XX / DOS / Mac / CJK text uses {@code 0x20} + * (or {@code 0x09 / 0x0A}) as its whitespace and has {@code 0x40} only + * as the rare {@code @} character (typically less than 0.1% of bytes). + * + * <p>This is a <em>negative</em> gate: when it returns {@code false}, the + * probe cannot be any EBCDIC variant, and downstream scoring should + * exclude EBCDIC labels from consideration even if the statistical model + * ranks them highly.</p> + * + * <p>Threshold rationale: we require both (a) {@code 0x40} at least 3% + * of the sample and (b) {@code 0x40} more than 3× as frequent + * as {@code 0x20}. Gate (b) alone is not sufficient because sparse + * binary content with no {@code 0x20} passes it on a single stray {@code 0x40}; gate (a) alone is not sufficient + * because some text formats (CSV with {@code @}-separated fields, + * e-mail address lists) can exceed 3% {@code 0x40} while clearly being + * ASCII-spaced. 
Both gates together match real EBCDIC text reliably + * across IBM420/424/500/1047 variants.</p> + * + * @param bytes the probe to analyse + * @return {@code true} if the probe's whitespace distribution is + * consistent with EBCDIC; {@code false} if it is clearly ASCII-spaced + */ + public static boolean isEbcdicLikely(byte[] bytes) { + if (bytes == null || bytes.length < 8) { + return false; + } + int sample = Math.min(bytes.length, 4096); + int ebcdicSpace = 0; + int asciiSpace = 0; + int prev = -1; + for (int i = 0; i < sample; i++) { + int b = bytes[i] & 0xFF; + if (b == 0x40) { + // Guard against Shift_JIS trail bytes that happen to equal 0x40. + boolean isShiftJisTrail = (prev >= 0x81 && prev <= 0x9F) + || (prev >= 0xE0 && prev <= 0xFC); + if (!isShiftJisTrail) { + ebcdicSpace++; + } + } else if (b == 0x20) { + asciiSpace++; + } + prev = b; + } + return ebcdicSpace >= sample * 0.03 && ebcdicSpace > asciiSpace * 3; + } + public static boolean checkIbm424(byte[] bytes, int offset, int length) { if (length < 8) { return false;
