This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/tika.git
commit 41784517b5fb987332206a7257a5a01fd23a0334 Author: tallison <[email protected]> AuthorDate: Sun Apr 12 15:51:20 2026 -0400 sparse-Latin vCard IBM424 false positive test --- .../chardetect/SparseLatinVcardRegressionTest.java | 116 +++++++++++++++++++++ 1 file changed, 116 insertions(+) diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/SparseLatinVcardRegressionTest.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/SparseLatinVcardRegressionTest.java new file mode 100644 index 0000000000..16b49346fc --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/SparseLatinVcardRegressionTest.java @@ -0,0 +1,116 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.ml.chardetect; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; + +import java.nio.charset.StandardCharsets; +import java.util.List; + +import org.junit.jupiter.api.Test; + +import org.apache.tika.detect.EncodingResult; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; + +/** + * Regression test for the sparse-Latin vCard / config-file detection + * class. + * + * <p>Before the {@link StructuralEncodingRules#isEbcdicLikely(byte[])} + * gate and the {@link MojibusterEncodingDetector.Rule#LOSSLESS_WIN1252_CANONICALISATION} + * post-rule, a predominantly-ASCII probe with a small number of + * Latin-supplement high bytes (e.g. a vCard containing a German + * business name) detected as {@code IBM424} (Hebrew EBCDIC) at 0.99 + * confidence — producing complete mojibake with dice=0 vs the 3.x + * baseline.</p> + * + * <p>After the fixes, the same probe detects as {@code windows-1252}, + * preserving content fidelity.</p> + */ +public class SparseLatinVcardRegressionTest { + + /** + * End-to-end regression assertion: the synthetic sparse-Latin vCard + * must detect as {@code windows-1252}, not {@code IBM424} or a + * byte-equivalent {@code windows-1257 / windows-1254 / x-MacRoman} + * sibling. + * + * <p>The probe bytes are wrapped in a {@code TikaInputStream} and passed + * to a freshly constructed detector together with an empty + * {@code Metadata} and {@code ParseContext}. The returned list must be + * non-empty, and only its first entry is inspected.</p> + * + * @throws Exception any exception raised inside the try-with-resources + * block propagates to the test runner and fails the test 
+ */ + @Test + public void sparseLatinVcardDetectsAsWindows1252() throws Exception { + byte[] probe = buildSparseLatinVcard(); + + MojibusterEncodingDetector detector = new MojibusterEncodingDetector(); + try (TikaInputStream tis = TikaInputStream.get(probe)) { + List<EncodingResult> results = detector.detect( + tis, new Metadata(), new ParseContext()); + assertFalse(results.isEmpty(), + "Detector must return at least one candidate"); + assertEquals("windows-1252", results.get(0).getCharset().name(), + "Sparse-Latin vCard must detect as windows-1252, not " + + "IBM424 / windows-1257 / windows-1254 / x-MacRoman"); + } + } + + /** + * Synthetic vCard-shaped probe that reproduces the regression class. + * + * <p>Preserved byte statistics from the original failing file: + * <ul> + * <li>Length in the 400-600 byte range (long-probe path).</li> + * <li>Exactly 3 non-ASCII bytes, all {@code 0xE4} — 'ä' under + * ISO-8859-1 / windows-1252 / windows-1257. The extreme-sparse + * regime where the flat statistical model was overconfidently + * wrong.</li> + * <li>Zero C1 bytes ({@code 0x80–0x9F}) so ISO→Windows upgrade + * does not fire.</li> + * <li>LF line endings only (no CRLF) so CRLF→Windows upgrade + * does not fire.</li> + * <li>Zero {@code 0x40} bytes so the EBCDIC gate cleanly returns + * {@code false}.</li> + * </ul> + * + * <p>Content is a fictitious German bakery at a fictitious address. 
+ * No real business or person is represented.</p> + * + * @return the vCard text encoded with ISO-8859-1, so each 'ä' in the + * literal becomes the single byte {@code 0xE4} and every other + * character is emitted as its ASCII byte + */ + private static byte[] buildSparseLatinVcard() { + String vcard = + "BEGIN:VCARD\n" + + "\t\t\t\t\tVERSION:3.0\n" + + "\t\t\t\t\tN:Example B\u00E4ckerei GmbH\n" + + "\t\t\t\t\tFN:Example B\u00E4ckerei GmbH\n" + + "\t\t\t\t\tORG:Example B\u00E4ckerei GmbH;\n" + + "\t\t\t\t\tPHOTO;VALUE=URL;TYPE=jpg:" + + "https://example.com/images/logo.jpg\n" + + "\t\t\t\t\titem1.EMAIL;TYPE=PREF,INTERNET:\n" + + "\t\t\t\t\titem1.X-ABLabel:email\n" + + "\t\t\t\t\tTEL;TYPE=WORK,VOICE:\n" + + "\t\t\t\t\tTEL;TYPE=WORK,FAX:\n" + + "\t\t\t\t\titem2.ADR;TYPE=WORK:" + + ";;Teststr. 1;Musterstadt;;12345;Germany;\n" + + "\t\t\t\t\titem2.X-ABADR:de\n" + + "\t\t\t\t\tLABEL;TYPE=WORK:Teststr. 1 Musterstadt, 12345\n" + + "\t\t\t\t\tURL;TYPE=PREF:\n" + + "\t\t\t\t\tREV:2026-04-12 12:00:00\n" + + "\t\t\t\t\tNOTE:Synthetic test fixture for charset " + + "detector regression coverage\n" + + "\t\t\t\t\tEND:VCARD\n"; + return vcard.getBytes(StandardCharsets.ISO_8859_1); + } +}
