This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch charset-detection-improvements in repository https://gitbox.apache.org/repos/asf/tika.git
commit 72a8b10884870353b65534f870d7232b48bdeddc Author: tallison <[email protected]> AuthorDate: Mon Apr 13 13:39:29 2026 -0400 split strides into separate model space --- .../ml/chardetect/MojibusterEncodingDetector.java | 59 ++++++++-- .../ml/chardetect/StructuralEncodingRules.java | 131 +++++++++++++++++++++ .../chardetect/SparseLatinVcardRegressionTest.java | 116 ++++++++++++++++++ 3 files changed, 299 insertions(+), 7 deletions(-) diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java index 69cbdc9163..e14d4c6b84 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java @@ -117,10 +117,6 @@ public class MojibusterEncodingDetector implements EncodingDetector { * the first disagreeing high byte. Zero cost for probes whose top * candidate isn't Latin-family (CJK, UTF-*, EBCDIC, Cyrillic, * Arabic, Greek, Hebrew). - * - * <p>Narrow by design — see {@code charset-detection.md} for the - * full options discussion (generalized candidate expansion and - * per-family canonicals were considered and rejected for now).</p> */ LATIN_FALLBACK_WIN1252 } @@ -163,8 +159,12 @@ public class MojibusterEncodingDetector implements EncodingDetector { LABEL_TO_JAVA_NAME = Collections.unmodifiableMap(m); } - /** Default number of bytes read from the stream for detection. */ - public static final int MAX_PROBE_BYTES = 4096; + /** + * Default number of bytes read from the stream for detection. + * Set generously so HTML/XML probes reach body text past + * ASCII-heavy head / script sections. 
+ */ + public static final int MAX_PROBE_BYTES = 32768; /** * JSON-deserializable configuration for {@link MojibusterEncodingDetector}. @@ -420,8 +420,19 @@ public class MojibusterEncodingDetector implements EncodingDetector { boolean excludeUtf8 = gates && StructuralEncodingRules.checkUtf8(probe) == StructuralEncodingRules.Utf8Result.NOT_UTF8; + // UTF-16 structural gate: stride-2 bigram features can misfire on + // non-UTF-16 probes with scattered nulls (e.g. Greek plaintext with + // 0.3% nulls scoring as UTF-16-LE). Real UTF-16 of any script has a + // concentrated byte column paired with a diverse one; scattered nulls + // produce ~balanced column diversity. Mask UTF-16 labels when the + // column-asymmetry test fails. + boolean utf16Plausible = !gates + || StructuralEncodingRules.has2ByteColumnAsymmetry(probe); + boolean excludeUtf16Be = wideResult.invalidUtf16Be || !utf16Plausible; + boolean excludeUtf16Le = wideResult.invalidUtf16Le || !utf16Plausible; + List<EncodingResult> results = runModel(probe, excludeUtf8, - wideResult.invalidUtf16Be, wideResult.invalidUtf16Le, topN); + excludeUtf16Be, excludeUtf16Le, topN); // If the model had no evidence (probe too short or all tokens filtered), fall back to // windows-1252 at very low confidence rather than returning empty and letting @@ -438,6 +449,17 @@ public class MojibusterEncodingDetector implements EncodingDetector { int[] features = extractor.extract(probe); float[] logits = model.predictLogits(features); + // EBCDIC gate: if the probe lacks the EBCDIC word-separator pattern + // (0x40 dominant over 0x20), it cannot be any EBCDIC variant. The + // statistical model can produce very large logits for EBCDIC labels + // on predominantly-ASCII probes whose n-grams happen to align with + // training features (observed with 99%-ASCII vCards mis-scored at + // IBM424 logit 55 vs windows-1252 logit 26). Mask those labels out + // before ranking so downstream arbitration sees only plausible + // candidates. 
+ boolean excludeEbcdic = enabledRules.contains(Rule.STRUCTURAL_GATES) + && !StructuralEncodingRules.isEbcdicLikely(probe); + for (int i = 0; i < logits.length; i++) { String lbl = model.getLabel(i); if (excludeUtf8 && "UTF-8".equalsIgnoreCase(lbl)) { @@ -449,6 +471,9 @@ public class MojibusterEncodingDetector implements EncodingDetector { if (excludeUtf16Le && lbl.equalsIgnoreCase("UTF-16-LE")) { logits[i] = Float.NEGATIVE_INFINITY; } + if (excludeEbcdic && isEbcdicLabel(lbl)) { + logits[i] = Float.NEGATIVE_INFINITY; + } } List<EncodingResult> results = selectByLogitGap(model, logits, topN); @@ -766,6 +791,26 @@ public class MojibusterEncodingDetector implements EncodingDetector { } } + /** + * True EBCDIC variants that must be gated by {@link + * StructuralEncodingRules#isEbcdicLikely(byte[])}. + * + * <p>Note: {@code IBM850}, {@code IBM852}, {@code IBM855}, {@code IBM866}, + * and {@code IBM437} are DOS/OEM code pages, <em>not</em> EBCDIC — they + * use {@code 0x20} for space like ASCII and are therefore not gated.</p> + */ + private static boolean isEbcdicLabel(String label) { + if (label == null) { + return false; + } + return label.equals("IBM420-ltr") || label.equals("IBM420-rtl") + || label.equals("IBM420") + || label.equals("IBM424-ltr") || label.equals("IBM424-rtl") + || label.equals("IBM424") + || label.equals("IBM500") + || label.equals("IBM1047"); + } + private static byte[] readProbe(TikaInputStream is, int maxBytes) throws IOException { is.mark(maxBytes); try { diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java index a7114527b0..beaffc7475 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java +++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java @@ -202,6 +202,137 @@ public final class StructuralEncodingRules { return checkIbm424(bytes, 0, bytes.length); } + /** + * Returns {@code true} if the probe is plausibly EBCDIC based on the + * word-separator distribution. In every EBCDIC variant (IBM420, IBM424, + * IBM500, IBM1047) the space character is {@code 0x40}, not {@code 0x20}; + * a stretch of EBCDIC text therefore has {@code 0x40} as its single most + * common byte, at roughly 10–20% of the sample. Conversely, any ASCII + * or ISO-8859-X / windows-12XX / DOS / Mac / CJK text uses {@code 0x20} + * (or {@code 0x09 / 0x0A}) as its whitespace and has {@code 0x40} only + * as the rare {@code @} character (typically less than 0.1% of bytes). + * + * <p>This is a <em>negative</em> gate: when it returns {@code false}, the + * probe cannot be any EBCDIC variant, and downstream scoring should + * exclude EBCDIC labels from consideration even if the statistical model + * ranks them highly.</p> + * + * <p>Threshold rationale: we require both (a) {@code 0x40} at least 3% + * of the sample and (b) {@code 0x40} more than 3× as frequent + * as {@code 0x20}. Gate (b) alone is not sufficient because sparse + * binary content can hold a few stray {@code 0x40} bytes and no {@code 0x20}; gate (a) alone is not sufficient + * because some text formats (CSV with {@code @}-separated fields, + * e-mail address lists) can exceed 3% {@code 0x40} while clearly being + * ASCII-spaced.
Both gates together match real EBCDIC text reliably + * across IBM420/424/500/1047 variants.</p> + * + * @param bytes the probe to analyse + * @return {@code true} if the probe's whitespace distribution is + * consistent with EBCDIC; {@code false} if it is clearly ASCII-spaced + */ + public static boolean isEbcdicLikely(byte[] bytes) { + if (bytes == null || bytes.length < 8) { + return false; + } + int sample = Math.min(bytes.length, 4096); + int ebcdicSpace = 0; + int asciiSpace = 0; + int prev = -1; + for (int i = 0; i < sample; i++) { + int b = bytes[i] & 0xFF; + if (b == 0x40) { + // Guard against Shift_JIS trail bytes that happen to equal 0x40. + boolean isShiftJisTrail = (prev >= 0x81 && prev <= 0x9F) + || (prev >= 0xE0 && prev <= 0xFC); + if (!isShiftJisTrail) { + ebcdicSpace++; + } + } else if (b == 0x20) { + asciiSpace++; + } + prev = b; + } + return ebcdicSpace >= sample * 0.03 && ebcdicSpace > asciiSpace * 3; + } + + /** + * Minimum probe length before {@link #has2ByteColumnAsymmetry} produces + * meaningful diversity counts. Short probes or probes with limited + * vocabulary may have too few distinct byte values per column to compare + * reliably; on anything below this threshold we fall back to the pre-gate + * behaviour (model + {@link WideUnicodeDetector} positive signal). Set + * above the size of typical short probes (a few hundred bytes) so real + * CJK UTF-16 text has room to diversify its high-byte column. + */ + public static final int MIN_COLUMN_ASYMMETRY_PROBE = 2048; + + /** + * Returns {@code true} if the probe's byte distribution across stride-2 + * columns is sufficiently asymmetric to be plausible UTF-16 of some script. 
+ * + * <p>Every UTF-16 variant has one byte column concentrated in a + * script-specific Unicode block prefix while the other column is diverse: + * UTF-16 Latin pairs to {@code (ascii, 0x00)} so one column is {@code 0x00} + * (1 value) vs ASCII range (~70 values); UTF-16 Cyrillic / Greek / Arabic / + * Hebrew pair to a single high-byte block prefix ({@code 0x04}, {@code 0x03}, + * {@code 0x06}, {@code 0x05}); UTF-16 CJK Unified uses {@code 0x4E}-{@code 0x9F} + * (~80 distinct high bytes) against ~256 low bytes; Hangul uses + * {@code 0xAC}-{@code 0xD7} (~44 high bytes).</p> + * + * <p>Non-UTF-16 text — including scattered-null binaries and mixed-content + * files — has roughly balanced column diversity (both columns saturate + * near 256 distinct byte values on long probes).</p> + * + * <p>This is a <em>negative</em> gate: when it returns {@code false}, the + * probe cannot be any UTF-16 variant, and UTF-16 labels should be masked + * from model output even when the stride-2 bigram features score them + * highly (e.g. 
a Greek plaintext file with 0.36% scattered nulls being + * mis-scored as UTF-16-LE).</p> + * + * <p>A diversity ratio of 3× (more diverse column has at least 3× as many + * distinct values as the more concentrated column) admits all UTF-16 + * variants including CJK (ratio ~3.2) while rejecting scattered-null + * false positives (ratio ~1:1).</p> + * + * <p>For probes shorter than {@link #MIN_COLUMN_ASYMMETRY_PROBE}, this + * method returns {@code true} conservatively — column counts from short + * samples are not statistically meaningful, so the caller should rely on + * {@link WideUnicodeDetector} positive signal and downstream CharSoup + * arbitration rather than masking.</p> + * + * @param bytes the probe to analyse + * @return {@code true} if the probe has UTF-16-compatible column asymmetry + * (or is too short to judge); {@code false} if column diversity is + * too balanced to be any UTF-16 variant + */ + public static boolean has2ByteColumnAsymmetry(byte[] bytes) { + if (bytes == null || bytes.length < MIN_COLUMN_ASYMMETRY_PROBE) { + return true; + } + int sample = Math.min(bytes.length, 4096); + boolean[] evenSeen = new boolean[256]; + boolean[] oddSeen = new boolean[256]; + int evenDistinct = 0; + int oddDistinct = 0; + for (int i = 0; i < sample; i++) { + int v = bytes[i] & 0xFF; + if ((i & 1) == 0) { + if (!evenSeen[v]) { + evenSeen[v] = true; + evenDistinct++; + } + } else { + if (!oddSeen[v]) { + oddSeen[v] = true; + oddDistinct++; + } + } + } + int min = Math.min(evenDistinct, oddDistinct); + int max = Math.max(evenDistinct, oddDistinct); + return max >= min * 3; + } + public static boolean checkIbm424(byte[] bytes, int offset, int length) { if (length < 8) { return false; diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/SparseLatinVcardRegressionTest.java 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/SparseLatinVcardRegressionTest.java new file mode 100644 index 0000000000..2d84959ca3 --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/SparseLatinVcardRegressionTest.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.chardetect; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; + +import java.nio.charset.StandardCharsets; +import java.util.List; + +import org.junit.jupiter.api.Test; + +import org.apache.tika.detect.EncodingResult; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; + +/** + * Regression test for the sparse-Latin vCard / config-file detection + * class. + * + * <p>Before the {@link StructuralEncodingRules#isEbcdicLikely(byte[])} + * gate and the {@link MojibusterEncodingDetector.Rule#LATIN_FALLBACK_WIN1252} + * post-rule, a predominantly-ASCII probe with a small number of + * Latin-supplement high bytes (e.g. 
a vCard containing a German + * business name) detected as {@code IBM424} (Hebrew EBCDIC) at 0.99 + * confidence — producing complete mojibake with dice=0 vs the 3.x + * baseline.</p> + * + * <p>After the fixes, the same probe detects as {@code windows-1252}, + * preserving content fidelity.</p> + */ +public class SparseLatinVcardRegressionTest { + + /** + * End-to-end regression assertion: the synthetic sparse-Latin vCard + * must detect as {@code windows-1252}, not {@code IBM424} or a + * byte-equivalent {@code windows-1257 / windows-1254 / x-MacRoman} + * sibling. + */ + @Test + public void sparseLatinVcardDetectsAsWindows1252() throws Exception { + byte[] probe = buildSparseLatinVcard(); + + MojibusterEncodingDetector detector = new MojibusterEncodingDetector(); + try (TikaInputStream tis = TikaInputStream.get(probe)) { + List<EncodingResult> results = detector.detect( + tis, new Metadata(), new ParseContext()); + assertFalse(results.isEmpty(), + "Detector must return at least one candidate"); + assertEquals("windows-1252", results.get(0).getCharset().name(), + "Sparse-Latin vCard must detect as windows-1252, not " + + "IBM424 / windows-1257 / windows-1254 / x-MacRoman"); + } + } + + /** + * Synthetic vCard-shaped probe that reproduces the regression class. + * + * <p>Preserved byte statistics from the original failing file: + * <ul> + * <li>Length in the 400-600 byte range (long-probe path).</li> + * <li>Exactly 3 non-ASCII bytes, all {@code 0xE4} — 'ä' under + * ISO-8859-1 / windows-1252 / windows-1257. The extreme-sparse + * regime where the flat statistical model was overconfidently + * wrong.</li> + * <li>Zero C1 bytes ({@code 0x80–0x9F}) so ISO→Windows upgrade + * does not fire.</li> + * <li>LF line endings only (no CRLF) so CRLF→Windows upgrade + * does not fire.</li> + * <li>Zero {@code 0x40} bytes so the EBCDIC gate cleanly returns + * {@code false}.</li> + * </ul> + * + * <p>Content is a fictitious German bakery at a fictitious address. 
+ * No real business or person is represented.</p> + */ + private static byte[] buildSparseLatinVcard() { + String vcard = + "BEGIN:VCARD\n" + + "\t\t\t\t\tVERSION:3.0\n" + + "\t\t\t\t\tN:Example B\u00E4ckerei GmbH\n" + + "\t\t\t\t\tFN:Example B\u00E4ckerei GmbH\n" + + "\t\t\t\t\tORG:Example B\u00E4ckerei GmbH;\n" + + "\t\t\t\t\tPHOTO;VALUE=URL;TYPE=jpg:" + + "https://example.com/images/logo.jpg\n" + + "\t\t\t\t\titem1.EMAIL;TYPE=PREF,INTERNET:\n" + + "\t\t\t\t\titem1.X-ABLabel:email\n" + + "\t\t\t\t\tTEL;TYPE=WORK,VOICE:\n" + + "\t\t\t\t\tTEL;TYPE=WORK,FAX:\n" + + "\t\t\t\t\titem2.ADR;TYPE=WORK:" + + ";;Teststr. 1;Musterstadt;;12345;Germany;\n" + + "\t\t\t\t\titem2.X-ABADR:de\n" + + "\t\t\t\t\tLABEL;TYPE=WORK:Teststr. 1 Musterstadt, 12345\n" + + "\t\t\t\t\tURL;TYPE=PREF:\n" + + "\t\t\t\t\tREV:2026-04-12 12:00:00\n" + + "\t\t\t\t\tNOTE:Synthetic test fixture for charset " + + "detector regression coverage\n" + + "\t\t\t\t\tEND:VCARD\n"; + return vcard.getBytes(StandardCharsets.ISO_8859_1); + } +}
