This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch chardet-work in repository https://gitbox.apache.org/repos/asf/tika.git
commit 534c72f36938c5be548c0b7386011ac1c58dc786 Author: tballison <[email protected]> AuthorDate: Thu Mar 5 18:12:35 2026 -0500 chardet - --- .../java/org/apache/tika/detect/BOMDetector.java | 18 +- .../tika/detect/CompositeEncodingDetector.java | 3 +- .../org/apache/tika/detect/EncodingDetector.java | 11 +- .../tika/detect/EncodingDetectorContext.java | 7 + .../org/apache/tika/detect/EncodingResult.java | 92 +++++++-- .../tika/detect/MetadataCharsetDetector.java | 132 ++++++++++++ .../tika/detect/OverrideEncodingDetector.java | 3 +- .../org.apache.tika.detect.EncodingDetector | 10 + .../charsoup/CharSoupEncodingDetector.java | 66 +++--- .../charsoup/CharSoupEncodingDetectorTest.java | 8 +- .../tika/parser/html/HtmlEncodingDetector.java | 3 +- .../StandardHtmlEncodingDetector.java | 105 ++++++---- .../tika-encoding-detector-icu4j/pom.xml | 12 ++ .../tika/parser/txt/CharsetDetectorTest.java | 1 + .../configs/tika-config-ignore-charset.json | 13 ++ .../resources/test-documents/multi-language.txt | 58 ++++++ .../src/test/resources/test-documents/resume.html | 99 +++++++++ .../resources/test-documents/testIgnoreCharset.txt | 4 + .../resources/test-documents/testTXT_win-1252.txt | 1 + .../test-documents/test_ignore_IBM420.html | Bin 0 -> 1869 bytes .../ml/chardetect/MojibusterEncodingDetector.java | 223 ++++++++++++++++----- .../ml/chardetect/StructuralEncodingRules.java | 29 +++ .../tika/ml/chardetect/chardetect-ebcdic.bin | Bin 5232 -> 7312 bytes .../org/apache/tika/ml/chardetect/chardetect.bin | Bin 508522 -> 410106 bytes .../tika/ml/chardetect/EbcdicRoutingTest.java | 132 ++++++++++++ .../ml/chardetect/ZipFilenameDetectionTest.java | 142 ++++++------- .../tika-encoding-detector-universal/pom.xml | 6 + .../tika/parser/txt/UniversalEncodingDetector.java | 2 +- .../charsoup/CharSoupLanguageDetector.java | 83 +++++--- .../chardetect/ByteNgramFeatureExtractorTest.java | 32 +-- .../apache/tika/parser/html/HtmlParserTest.java | 16 +- 
.../microsoft/POIContainerExtractionTest.java | 3 +- .../java/org/apache/tika/parser/pkg/ZipParser.java | 1 + .../tika-parser-text-module/pom.xml | 14 +- .../tika/parser/csv/TextAndCSVParserTest.java | 13 +- .../org/apache/tika/parser/txt/TXTParserTest.java | 49 +++-- .../tika/config/TikaEncodingDetectorTest.java | 61 ++++-- .../apache/tika/parser/AutoDetectParserTest.java | 4 +- .../tika/parser/microsoft/rtf/RTFParserTest.java | 4 +- .../org/apache/tika/parser/pdf/PDFParserTest.java | 2 +- .../apache/tika/parser/pkg/PackageParserTest.java | 2 - .../TIKA-2485-encoding-detector-mark-limits.json | 9 +- 42 files changed, 1122 insertions(+), 351 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/detect/BOMDetector.java b/tika-core/src/main/java/org/apache/tika/detect/BOMDetector.java index 322e307187..21e9ca08a7 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/BOMDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/BOMDetector.java @@ -32,16 +32,18 @@ import org.apache.tika.parser.ParseContext; /** * Encoding detector that identifies the character set from a byte-order mark - * (BOM) at the start of the stream. Returns a single result with confidence - * {@link EncodingResult#CONFIDENCE_DEFINITIVE} when a BOM is found. + * (BOM) at the start of the stream. Returns a single {@link EncodingResult.ResultType#DECLARATIVE} + * result when a BOM is found — a BOM is an explicit in-band declaration of encoding + * and takes priority over all statistical or structural inference. * - * <p>Not SPI-loaded by default — add explicitly to your encoding-detector - * chain when needed. UTF-16/32 content without a BOM is detected by - * {@code MojibusterEncodingDetector} via stride-2 byte n-gram features.</p> + * <p>SPI-loaded first in the default encoding-detector chain so that BOM evidence + * reaches {@code CharSoupEncodingDetector} before any statistical detector runs. 
+ * {@code MojibusterEncodingDetector} strips the BOM from its own probe independently + * to ensure consistent model inference (BOMs are excluded from training data).</p> * * @since Apache Tika 0.x (moved to org.apache.tika.detect in 4.0) */ -@TikaComponent(spi = false) +@TikaComponent public class BOMDetector implements EncodingDetector { private static final ByteOrderMark[] BOMS = @@ -88,8 +90,8 @@ public class BOMDetector implements EncodingDetector { for (int i = 0; i < BOMS.length; i++) { ByteOrderMark bom = BOMS[i]; if (startsWith(bom, bytes) && CHARSETS[i] != null) { - return List.of(new EncodingResult(CHARSETS[i], - EncodingResult.CONFIDENCE_DEFINITIVE)); + return List.of(new EncodingResult(CHARSETS[i], 1.0f, + CHARSETS[i].name(), EncodingResult.ResultType.DECLARATIVE)); } } return Collections.emptyList(); diff --git a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java index 10285ba44a..fc8b0ab038 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java @@ -194,7 +194,8 @@ public class CompositeEncodingDetector implements EncodingDetector, Serializable sb.append(", "); } sb.append(r.getDetectorName()).append("->").append(r.getCharset().name()); - if (r.getConfidence() < EncodingResult.CONFIDENCE_DEFINITIVE) { + sb.append("[").append(r.getResultType()).append("]"); + if (r.getResultType() == EncodingResult.ResultType.STATISTICAL) { sb.append(String.format(java.util.Locale.ROOT, "(%.2f)", r.getConfidence())); } } diff --git a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java index ff02a0bcb5..7522003c76 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java +++ 
b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java @@ -31,11 +31,12 @@ import org.apache.tika.parser.ParseContext; * metadata or the first few bytes of the document stream. * * <p>Detectors return a ranked list of {@link EncodingResult}s in descending - * confidence order. An empty list means no opinion. A single result with - * confidence {@link EncodingResult#CONFIDENCE_DEFINITIVE} (1.0) indicates a - * structural detection that requires no further arbitration. Multiple results - * or lower confidence values invite arbitration by a - * {@link MetaEncodingDetector}.</p> + * confidence order. An empty list means no opinion. Results carry a + * {@link EncodingResult.ResultType} indicating the nature of the evidence: + * {@code DECLARATIVE} (BOM, HTML meta charset), {@code STRUCTURAL} (byte-grammar + * proof), or {@code STATISTICAL} (probabilistic model). A + * {@link MetaEncodingDetector} uses these types to arbitrate when detectors + * disagree.</p> * * @since Apache Tika 0.4 */ diff --git a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java index 81d870e599..6957601e2c 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java +++ b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java @@ -144,6 +144,13 @@ public class EncodingDetectorContext { return encodingResults.get(0).getConfidence(); } + /** + * The {@link EncodingResult.ResultType} of the top-ranked result from this detector. 
+ */ + public EncodingResult.ResultType getResultType() { + return encodingResults.get(0).getResultType(); + } + public String getDetectorName() { return detectorName; } diff --git a/tika-core/src/main/java/org/apache/tika/detect/EncodingResult.java b/tika-core/src/main/java/org/apache/tika/detect/EncodingResult.java index 135e81240c..55724aefc7 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/EncodingResult.java +++ b/tika-core/src/main/java/org/apache/tika/detect/EncodingResult.java @@ -19,20 +19,50 @@ package org.apache.tika.detect; import java.nio.charset.Charset; /** - * A charset detection result pairing a {@link Charset} with a confidence score. + * A charset detection result pairing a {@link Charset} with a confidence score + * and a {@link ResultType} indicating the nature of the evidence. * - * <p>Confidence is in the range {@code [0.0, 1.0]}. A score of {@code 1.0} - * indicates a definitive structural detection (e.g. UTF-16/32 from null-byte - * patterns, or a declared {@code charset} attribute in an HTML meta tag) that - * requires no further arbitration. Lower scores reflect statistical estimates - * where arbitration by a {@link MetaEncodingDetector} may improve accuracy.</p> + * <h3>Result types</h3> + * <ul> + * <li>{@link ResultType#DECLARATIVE} — the document explicitly stated its + * encoding (BOM, HTML {@code <meta charset>}). These are authoritative + * claims about author intent and get preference over inferred results + * <em>when consistent with the actual bytes</em>.</li> + * <li>{@link ResultType#STRUCTURAL} — byte-grammar proof (ISO-2022 escape + * sequences, UTF-8 multibyte validation). The encoding is proven by the + * byte structure itself, independent of any declaration.</li> + * <li>{@link ResultType#STATISTICAL} — probabilistic inference from a + * statistical model. 
The {@code confidence} float is meaningful here + * for ranking among candidates; for DECLARATIVE and STRUCTURAL results + * it is conventionally {@code 1.0} but carries no additional information.</li> + * </ul> * * @since Apache Tika 4.0 */ public class EncodingResult { - /** Confidence value indicating a definitive, structural detection. */ - public static final float CONFIDENCE_DEFINITIVE = 1.0f; + /** + * The nature of the evidence that produced this result. + */ + public enum ResultType { + /** + * The document explicitly declared its encoding (BOM, HTML meta charset). + * Authoritative about author intent; preferred over inferred results when + * consistent with the actual bytes. + */ + DECLARATIVE, + /** + * The encoding is proven by byte-grammar structure (ISO-2022 escape + * sequences, UTF-8 multibyte validation). Not a guess — the byte + * patterns are only valid in this encoding. + */ + STRUCTURAL, + /** + * Probabilistic inference from a statistical model. The confidence + * float is meaningful for ranking among candidates. + */ + STATISTICAL + } private final Charset charset; private final float confidence; @@ -47,28 +77,53 @@ public class EncodingResult { * prediction without going through {@code Charset.name()}. */ private final String label; + private final ResultType resultType; /** + * Constructs a STATISTICAL result. Existing detectors that do not yet + * classify their evidence type default to statistical (probabilistic) + * treatment, which is the safe, arbitratable assumption. + * * @param charset the detected charset; must not be {@code null} * @param confidence detection confidence in {@code [0.0, 1.0]} */ public EncodingResult(Charset charset, float confidence) { - this(charset, confidence, charset.name()); + this(charset, confidence, charset.name(), ResultType.STATISTICAL); } /** + * Constructs a STATISTICAL result with a detector-specific label. 
+ * * @param charset the detected charset; must not be {@code null} * @param confidence detection confidence in {@code [0.0, 1.0]} * @param label the detector's original label (e.g. {@code "IBM420-ltr"}); * if {@code null}, defaults to {@code charset.name()} */ public EncodingResult(Charset charset, float confidence, String label) { + this(charset, confidence, label, ResultType.STATISTICAL); + } + + /** + * Constructs a result with an explicit {@link ResultType}. + * + * @param charset the detected charset; must not be {@code null} + * @param confidence detection confidence in {@code [0.0, 1.0]} + * @param label the detector's original label; if {@code null}, + * defaults to {@code charset.name()} + * @param resultType the nature of the evidence; must not be {@code null} + */ + public EncodingResult(Charset charset, float confidence, String label, + ResultType resultType) { if (charset == null) { throw new IllegalArgumentException("charset must not be null"); } + if (resultType == null) { + throw new IllegalArgumentException("resultType must not be null"); + } this.charset = charset; this.confidence = Math.max(0f, Math.min(1f, confidence)); this.label = (label != null) ? label : charset.name(); + this.resultType = resultType; } public Charset getCharset() { @@ -76,13 +131,25 @@ public class EncodingResult { } /** - * Detection confidence in {@code [0.0, 1.0]}. - * {@code 1.0} means definitive; lower values invite arbitration. + * Detection confidence in {@code [0.0, 1.0]}. Meaningful for ranking + * among {@link ResultType#STATISTICAL} candidates. For + * {@link ResultType#DECLARATIVE} and {@link ResultType#STRUCTURAL} results + * the value is conventionally {@code 1.0} but carries no additional + * information beyond the type itself. */ public float getConfidence() { return confidence; } + /** + * The nature of the evidence that produced this result. 
+ * + * @see ResultType + */ + public ResultType getResultType() { + return resultType; + } + /** * The detector's original label for this result. Usually identical to * {@link #getCharset()}{@code .name()}, but preserved when the detector @@ -97,6 +164,7 @@ public class EncodingResult { public String toString() { String cs = charset.name(); String lbl = label.equals(cs) ? cs : label + "(" + cs + ")"; - return lbl + "@" + String.format(java.util.Locale.ROOT, "%.2f", confidence); + return lbl + "@" + String.format(java.util.Locale.ROOT, "%.2f", confidence) + + "[" + resultType + "]"; } } diff --git a/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java b/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java new file mode 100644 index 0000000000..385f3edfbe --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.detect; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.Collections; +import java.util.List; + +import org.apache.tika.config.TikaComponent; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; + +/** + * Encoding detector that extracts a declared charset from Tika metadata without + * reading any bytes from the stream. Returns a single + * {@link EncodingResult.ResultType#DECLARATIVE} result when a charset is found. + * + * <p>Two metadata keys are consulted in order: + * <ol> + * <li>{@link Metadata#CONTENT_TYPE} — the {@code charset} parameter of the + * HTTP/MIME Content-Type header (e.g. {@code text/html; charset=UTF-8}).</li> + * <li>{@link Metadata#CONTENT_ENCODING} — a bare charset label set by parsers + * such as {@code RFC822Parser}, which splits Content-Type into a bare + * media-type key and a separate charset key.</li> + * </ol> + * + * <p>This detector is SPI-loaded in {@code tika-core} and therefore always present + * in the default encoding-detector chain. 
Its DECLARATIVE result is visible to + * {@code CharSoupEncodingDetector}, which can weigh it against structural or + * statistical evidence from other detectors.</p> + * + * @since Apache Tika 4.0 + */ +@TikaComponent(name = "metadata-charset-detector") +public class MetadataCharsetDetector implements EncodingDetector { + + @Override + public List<EncodingResult> detect(TikaInputStream tis, Metadata metadata, + ParseContext context) throws IOException { + Charset cs = charsetFromContentType(metadata); + if (cs == null) { + cs = charsetFromContentEncoding(metadata); + } + if (cs == null) { + return Collections.emptyList(); + } + return List.of(new EncodingResult(cs, 1.0f, cs.name(), + EncodingResult.ResultType.DECLARATIVE)); + } + + /** + * Returns the charset named in the {@code charset} parameter of the + * {@link Metadata#CONTENT_TYPE} value, or {@code null} if absent or unparseable. + */ + public static Charset charsetFromContentType(Metadata metadata) { + String contentType = metadata.get(Metadata.CONTENT_TYPE); + if (contentType == null) { + return null; + } + MediaType mediaType = MediaType.parse(contentType); + if (mediaType == null) { + return null; + } + String label = mediaType.getParameters().get("charset"); + return parseCharset(label); + } + + /** + * Returns the charset named in {@link Metadata#CONTENT_ENCODING}, or + * {@code null} if absent or unparseable. This key is used by + * {@code RFC822Parser} to expose the charset declared in MIME body-part + * headers when the bare media type is stored separately in + * {@link Metadata#CONTENT_TYPE}. 
+ */ + public static Charset charsetFromContentEncoding(Metadata metadata) { + return parseCharset(metadata.get(Metadata.CONTENT_ENCODING)); + } + + private static Charset parseCharset(String label) { + if (label == null || label.isBlank()) { + return null; + } + Charset cs; + try { + cs = Charset.forName(label.trim()); + } catch (IllegalArgumentException e) { + return null; + } + return normalizeWhatwg(cs); + } + + /** + * Applies the critical WHATWG encoding-label normalizations that are universally + * applicable regardless of content type. The WHATWG encoding spec + * (https://encoding.spec.whatwg.org/) maps {@code ISO-8859-1}, {@code US-ASCII}, + * and their aliases to {@code windows-1252} because real-world content labeled + * with these names is almost always actually windows-1252. + */ + private static Charset normalizeWhatwg(Charset cs) { + if (cs == null) { + return null; + } + String name = cs.name(); + if (StandardCharsets.ISO_8859_1.name().equals(name) + || StandardCharsets.US_ASCII.name().equals(name)) { + try { + return Charset.forName("windows-1252"); + } catch (IllegalArgumentException e) { + return cs; + } + } + return cs; + } +} diff --git a/tika-core/src/main/java/org/apache/tika/detect/OverrideEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/OverrideEncodingDetector.java index 3c3ddfa627..e300adccac 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/OverrideEncodingDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/OverrideEncodingDetector.java @@ -85,7 +85,8 @@ public class OverrideEncodingDetector implements EncodingDetector { @Override public List<EncodingResult> detect(TikaInputStream tis, Metadata metadata, ParseContext parseContext) throws IOException { - return List.of(new EncodingResult(charset, EncodingResult.CONFIDENCE_DEFINITIVE)); + return List.of(new EncodingResult(charset, 1.0f, charset.name(), + EncodingResult.ResultType.DECLARATIVE)); } public Charset getCharset() { diff --git 
a/tika-core/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector b/tika-core/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector index 4a812de77e..1c321a2921 100644 --- a/tika-core/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector +++ b/tika-core/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector @@ -13,3 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Both detectors are in the org.apache.tika.detect.* namespace and sort before +# all other detectors, ensuring DECLARATIVE evidence from in-band signals (BOM) +# and out-of-band declarations (HTTP/MIME headers) reaches CharSoupEncodingDetector +# before any statistical detector runs. +# +# Within the namespace, class-name order is: BOMDetector < MetadataCharsetDetector +# so BOM evidence (highest confidence) is recorded first. +org.apache.tika.detect.BOMDetector +org.apache.tika.detect.MetadataCharsetDetector + diff --git a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java index 3a60e551ba..35ead7bb8a 100644 --- a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java @@ -71,8 +71,8 @@ public class CharSoupEncodingDetector implements MetaEncodingDetector { * Symmetric confusable peer groups: within each group, encoding variants * (e.g. ISO-8859-6 vs windows-1256) produce different decoded text for the * same byte sequence (unlike ISO-8859-1 vs windows-1252 which are functional - * supersets). 
When the language-quality winner and a CONFIDENCE_DEFINITIVE - * declaration are in the same peer group, the language model cannot reliably + * supersets). When the language-quality winner and a DECLARATIVE result + * are in the same peer group, the language model cannot reliably * distinguish them — it merely reflects which variant happens to produce * Arabic (or Cyrillic, …) n-grams its training data favoured. * In that case we prefer the explicit declaration. @@ -158,25 +158,31 @@ public class CharSoupEncodingDetector implements MetaEncodingDetector { CharSoupLanguageDetector langDetector = new CharSoupLanguageDetector(); Charset bestCharset = langDetector.compareLanguageSignal(candidates); if (bestCharset == null) { + // Language signal inconclusive. When a DECLARATIVE result (HTML meta charset, + // BOM, HTTP Content-Type) exists and decodes the bytes at least as cleanly as + // the statistical fallback, trust the declaration. This covers: + // • Pure-ASCII probe (both decodings identical) — prefer the declared charset. + // • Probe with high bytes that are valid in BOTH charsets (e.g. Cyrillic in a + // page that starts with ASCII JavaScript) — the bytes look "clean" in both + // windows-1252 (decoded as Latin Extended) and windows-1251 (decoded as + // Cyrillic), so junkRatio cannot distinguish them; trust the declaration. Charset fallback = firstResult.getCharset(); String fallbackDecoded = candidates.get(fallback); float fallbackJunk = fallbackDecoded != null ? CharSoupLanguageDetector.junkRatio(fallbackDecoded) : 1f; - // If the fallback charset produces garbled output (replacement chars) but - // a definitive declaration decodes the bytes cleanly, the probe was likely - // too short or ASCII-only. Trust the explicit declaration in that case. 
Charset cleanerDeclared = null; - if (fallbackJunk > 0f) { - for (EncodingDetectorContext.Result r : context.getResults()) { - if (r.getConfidence() >= EncodingResult.CONFIDENCE_DEFINITIVE) { - String declaredDecoded = candidates.get(r.getCharset()); - float declaredJunk = declaredDecoded != null - ? CharSoupLanguageDetector.junkRatio(declaredDecoded) : 1f; - if (declaredJunk < fallbackJunk / 2) { - cleanerDeclared = r.getCharset(); - break; - } + for (EncodingDetectorContext.Result r : context.getResults()) { + if (r.getResultType() == EncodingResult.ResultType.DECLARATIVE) { + String declaredDecoded = candidates.get(r.getCharset()); + float declaredJunk = declaredDecoded != null + ? CharSoupLanguageDetector.junkRatio(declaredDecoded) : 1f; + // Trust the declaration when it decodes at least as cleanly as + // the statistical fallback (≤ junk). A declaration that produces + // MORE junk than the fallback is likely wrong (e.g. a lying BOM). + if (declaredJunk <= fallbackJunk) { + cleanerDeclared = r.getCharset(); + break; } } } @@ -188,22 +194,22 @@ public class CharSoupEncodingDetector implements MetaEncodingDetector { bestCharset = fallback; } - // If a structurally-declared charset (CONFIDENCE_DEFINITIVE, e.g. HTML meta tag) - // decodes the bytes to the same string as the language-quality winner, prefer - // the declaration. This validates the HTML header against the actual bytes: - // if they are functionally equivalent, trust the author's stated encoding. - // If they produce different text (a real conflict), the bytes win. + // If a DECLARATIVE result (e.g. HTML meta charset) decodes the bytes to the same + // string as the language-quality winner, prefer the declaration. This validates the + // declared encoding against the actual bytes: if they are functionally equivalent, + // trust the author's stated encoding. If they produce different text (a real conflict + // — e.g. 
a lying BOM or a wrong meta tag), the bytes win and the language scorer's + // choice stands. // - // Additionally, when the winner and the declared charset are in the same - // confusable peer group (e.g. ISO-8859-6 vs windows-1256) and the declared - // charset decodes cleanly (low junk ratio), the language model cannot - // reliably distinguish them — they both produce valid same-script text. - // In that case, prefer the explicit declaration over the model's guess. + // Additionally, when the winner and a DECLARATIVE charset are in the same confusable + // peer group (e.g. ISO-8859-6 vs windows-1256) and the declared charset decodes + // cleanly (low junk ratio), the language model cannot reliably distinguish them — + // they both produce valid same-script text. Prefer the explicit declaration. String winnerDecoded = candidates.get(bestCharset); float winnerJunk = winnerDecoded != null ? CharSoupLanguageDetector.junkRatio(winnerDecoded) : 1f; if (winnerDecoded != null) { for (EncodingDetectorContext.Result r : context.getResults()) { - if (r.getConfidence() >= EncodingResult.CONFIDENCE_DEFINITIVE + if (r.getResultType() == EncodingResult.ResultType.DECLARATIVE && !r.getCharset().equals(bestCharset)) { Charset declared = r.getCharset(); String declaredDecoded = candidates.get(declared); @@ -214,11 +220,9 @@ public class CharSoupEncodingDetector implements MetaEncodingDetector { context.setArbitrationInfo("scored-prefer-declared"); return declared; } - // When the winner and the declared charset are in the same confusable - // peer group (e.g. ISO-8859-6 vs windows-1256), and the declared - // charset decodes at least as cleanly as the winner (not junkier), - // prefer the explicit declaration — the language model cannot reliably - // distinguish same-script encoding variants. + // Same-script peer group: language model cannot distinguish variants + // (e.g. ISO-8859-6 vs windows-1256 both produce valid Arabic text). 
+ // Prefer the declaration when it decodes at least as cleanly as the winner. float declaredJunk = CharSoupLanguageDetector.junkRatio(declaredDecoded); if (arePeers(bestCharset, declared) && declaredJunk <= winnerJunk) { context.setArbitrationInfo("scored-prefer-declared-peer"); diff --git a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java index 832e195d07..f4f24307cf 100644 --- a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java +++ b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java @@ -191,10 +191,14 @@ public class CharSoupEncodingDetectorTest { assertEquals(0.25f, CharSoupLanguageDetector.junkRatio("abc\u0080"), 0.001f); - // Mixed: \r\n are control chars too - assertEquals(2f / 13f, + // \r and \n are ordinary whitespace — not junk + assertEquals(0f, CharSoupLanguageDetector.junkRatio("hello world\r\n"), 0.001f); + // Non-whitespace C1 control char mixed with ordinary whitespace + assertEquals(1f / 14f, + CharSoupLanguageDetector.junkRatio("hello world\r\n\u0080"), 0.001f); + // Empty/null assertEquals(0f, CharSoupLanguageDetector.junkRatio(""), 0.001f); assertEquals(0f, CharSoupLanguageDetector.junkRatio(null), 0.001f); diff --git a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java index 9487cad414..c2fe6dac76 100644 --- a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java +++ 
b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java @@ -171,7 +171,8 @@ public class HtmlEncodingDetector implements EncodingDetector { if (charset == null) { return Collections.emptyList(); } - return List.of(new EncodingResult(charset, EncodingResult.CONFIDENCE_DEFINITIVE)); + return List.of(new EncodingResult(charset, 1.0f, charset.name(), + EncodingResult.ResultType.DECLARATIVE)); } //returns null if no charset was found diff --git a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java index 9c7ff115eb..8c7ca1536f 100644 --- a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java @@ -16,8 +16,6 @@ */ package org.apache.tika.parser.html.charsetdetector; -import static org.apache.tika.parser.html.charsetdetector.CharsetAliases.getCharsetByLabel; - import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; @@ -29,54 +27,53 @@ import org.apache.commons.io.input.BoundedInputStream; import org.apache.tika.config.TikaComponent; import org.apache.tika.detect.EncodingDetector; import org.apache.tika.detect.EncodingResult; +import org.apache.tika.detect.MetadataCharsetDetector; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; /** - * An encoding detector that tries to respect the spirit of the HTML spec - * part 12.2.3 "The input byte stream", or at least the part that is compatible with 
- * the implementation of tika. - * <p> - * https://html.spec.whatwg.org/multipage/parsing.html#the-input-byte-stream - * <p> - * If a resource was fetched over HTTP, then HTTP headers should be added to tika metadata - * when using {@link #detect}, especially {@link Metadata#CONTENT_TYPE}, as it may contain - * charset information. - * <p> - * This encoding detector may return null if no encoding is detected. - * It is meant to be used inside a {@link org.apache.tika.detect.CompositeEncodingDetector}. - * For instance: - * <pre> {@code - * EncodingDetector detector = new CompositeEncodingDetector( - * Arrays.asList( - * new StandardHtmlEncodingDetector(), - * new Icu4jEncodingDetector())); - * }</pre> - * <p> + * An encoding detector that respects the HTML5 encoding-sniff algorithm + * (https://html.spec.whatwg.org/multipage/parsing.html#the-input-byte-stream): + * BOM → HTTP Content-Type header → {@code <meta charset>} / {@code <meta http-equiv>} tag. + * + * <p>When used standalone (outside a {@link org.apache.tika.detect.CompositeEncodingDetector} + * chain) this detector handles the full spec algorithm including BOM detection. + * + * <p>When used inside the default Tika chain (with {@code BOMDetector} and + * {@code MetadataCharsetDetector} already present), set {@code skipBOM=true} so that + * this detector focuses exclusively on the HTML {@code <meta>} scan. That lets + * {@code CharSoupEncodingDetector} arbitrate between a BOM declaration and a + * contradicting {@code <meta>} declaration instead of silently suppressing one. + * + * <p>HTTP/MIME Content-Type and Content-Encoding metadata are always read here for + * standalone compatibility; in the chain they will already have been returned by + * {@code MetadataCharsetDetector} and {@code CharSoup} will handle the duplication + * gracefully (identical DECLARATIVE results agree, so no harm done). 
*/ @TikaComponent(name = "standard-html-encoding-detector") public final class StandardHtmlEncodingDetector implements EncodingDetector { - private static final int META_TAG_BUFFER_SIZE = 8192; + /** + * Default number of bytes to scan for a {@code <meta charset>} declaration. + * 65536 is large enough to cover typical {@code <script>} or {@code <style>} + * blocks in the {@code <head>} without significant overhead (encoding detection + * already buffers the stream). Users who need to handle even deeper declarations + * can raise this via {@link #setMarkLimit(int)}. + */ + private static final int META_TAG_BUFFER_SIZE = 65536; private int markLimit = META_TAG_BUFFER_SIZE; /** - * Extracts a charset from a Content-Type HTTP header. + * When {@code true}, the BOM check is skipped and the detector goes directly to + * the Content-Type header and {@code <meta>} scan. Use this when + * {@code BOMDetector} is already present in the chain so that + * {@code CharSoupEncodingDetector} can arbitrate between a BOM declaration and a + * contradicting {@code <meta charset>} rather than having the BOM silently win. * - * @param metadata parser metadata - * @return a charset if there is one specified, or null + * <p>Default: {@code false} (HTML5 spec-compliant standalone behaviour).</p> */ - private static Charset charsetFromContentType(Metadata metadata) { - String contentType = metadata.get(Metadata.CONTENT_TYPE); - MediaType mediatype = MediaType.parse(contentType); - if (mediatype == null) { - return null; - } - String charsetLabel = mediatype.getParameters().get("charset"); - return getCharsetByLabel(charsetLabel); - } + private boolean skipBOM = false; @Override public List<EncodingResult> detect(TikaInputStream tis, Metadata metadata, @@ -87,10 +84,19 @@ public final class StandardHtmlEncodingDetector implements EncodingDetector { .setInputStream(tis).setMaxCount(limit).get(); PreScanner preScanner = new PreScanner(limitedStream); - // Priority: 1. BOM 2. 
Content-Type HTTP header 3. HTML <meta> tag - Charset detectedCharset = preScanner.detectBOM(); + Charset detectedCharset = null; + + if (!skipBOM) { + // HTML5 spec: BOM overrides everything. When used standalone this + // detector is responsible for BOM detection; when used in the chain with + // BOMDetector, setting skipBOM=true lets CharSoup arbitrate. + detectedCharset = preScanner.detectBOM(); + } + if (detectedCharset == null) { + detectedCharset = MetadataCharsetDetector.charsetFromContentType(metadata); + } if (detectedCharset == null) { - detectedCharset = charsetFromContentType(metadata); + detectedCharset = MetadataCharsetDetector.charsetFromContentEncoding(metadata); } if (detectedCharset == null) { detectedCharset = preScanner.scan(); @@ -100,7 +106,8 @@ public final class StandardHtmlEncodingDetector implements EncodingDetector { if (detectedCharset == null) { return Collections.emptyList(); } - return List.of(new EncodingResult(detectedCharset, EncodingResult.CONFIDENCE_DEFINITIVE)); + return List.of(new EncodingResult(detectedCharset, 1.0f, + detectedCharset.name(), EncodingResult.ResultType.DECLARATIVE)); } public int getMarkLimit() { @@ -108,10 +115,24 @@ public final class StandardHtmlEncodingDetector implements EncodingDetector { } /** - * How far into the stream to read for charset detection. - * Default is 8192. + * How far into the stream to scan for a {@code <meta charset>} declaration. + * Default is {@value #META_TAG_BUFFER_SIZE} bytes. */ public void setMarkLimit(int markLimit) { this.markLimit = markLimit; } + + public boolean isSkipBOM() { + return skipBOM; + } + + /** + * When {@code true}, skip the BOM check and rely on {@code BOMDetector} in the + * chain. This allows {@code CharSoupEncodingDetector} to arbitrate between a + * BOM and a contradicting {@code <meta charset>} declaration. + * Default is {@code false}. 
+ */ + public void setSkipBOM(boolean skipBOM) { + this.skipBOM = skipBOM; + } } diff --git a/tika-encoding-detectors/tika-encoding-detector-icu4j/pom.xml b/tika-encoding-detectors/tika-encoding-detector-icu4j/pom.xml index c96b3a7ff0..985202bf36 100644 --- a/tika-encoding-detectors/tika-encoding-detector-icu4j/pom.xml +++ b/tika-encoding-detectors/tika-encoding-detector-icu4j/pom.xml @@ -41,6 +41,12 @@ <artifactId>tika-core</artifactId> <version>${revision}</version> </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-annotation-processor</artifactId> + <version>${revision}</version> + <scope>provided</scope> + </dependency> <dependency> <groupId>org.apache.tika</groupId> <artifactId>tika-core</artifactId> @@ -48,6 +54,12 @@ <type>test-jar</type> <scope>test</scope> </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-encoding-detector-universal</artifactId> + <version>${revision}</version> + <scope>test</scope> + </dependency> <dependency> <groupId>org.apache.tika</groupId> <artifactId>tika-serialization</artifactId> diff --git a/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java index 3cf4b435c7..04666b1726 100644 --- a/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java +++ b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java @@ -129,6 +129,7 @@ public class CharsetDetectorTest extends TikaTest { assertEquals("UTF-8", detector.detect().getName()); } + @org.junit.jupiter.api.Disabled("Integration test requiring TXT parser — run via tika-parser-text-module") @Test public void testIgnoreCharset() throws Exception { //TIKA-3516, TIKA-3525, TIKA-1236 diff --git 
a/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/configs/tika-config-ignore-charset.json b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/configs/tika-config-ignore-charset.json new file mode 100644 index 0000000000..82442e13a2 --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/configs/tika-config-ignore-charset.json @@ -0,0 +1,13 @@ +{ + "parsers": [ + "default-parser" + ], + "encoding-detectors": [ + { + "icu4j-encoding-detector": { + "ignoreCharsets": ["IBM420", "IBM424"] + } + }, + "universal-encoding-detector" + ] +} diff --git a/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/multi-language.txt b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/multi-language.txt new file mode 100644 index 0000000000..ab78d1c2f6 --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/multi-language.txt @@ -0,0 +1,58 @@ +English: ABC Category + +en ABC adhoc internship +en ABC agent uno well connected +en ABC agent uno good connections +en ABC really need uno agent +en ABC special retainer +en ABC let's get this through +en ABC let's push this through +en ABC Token of gratitude +en ABC do not expense +en ABC don't expense +en ABC want to keep you happy +en ABC win win payment +en ABC win win hire +en ABC donation uno business + +Portuguese: ABC Category + +Port ABC pagamento de sucesso +Port ABC pagamento de sucesso +Port ABC presentinho uno agradecimento +Port ABC taxa especial +Port ABC taxa especial +Port ABC realmente preciso uno agente +Port ABC vamos empurrar isso +Port ABC vamos empurrar isso +Port ABC Vamos acabar com isso +Port ABC te deixar feliz +Port ABC te deixar feliz +Port ABC nao contabilize +Port ABC n�o contabilize +Port ABC nao contabilize +Port ABC n�o contabilize +Port ABC doa��o uno neg�cios +Port ABC doacao uno negocios +Port ABC agente 
uno bem conectado +Port ABC agente uno bons contatos +Port ABC estagio adhoc +Port ABC contratacao de sucesso� +Port ABC contrata��o de sucesso� + +Spanish: ABC Category + +espa�ol ABC adhoc pr�cticas +espa�ol ABC intermediario uno bien conectado +espa�ol ABC agente uno buenas conecciones +espa�ol ABC realmente necesito uno un agente +espa�ol ABC anticipo especial +espa�ol ABC hay que pasar +espa�ol ABC hay que forzar +espa�ol ABC muestra de gratitud +espa�ol ABC no registre gasto +espa�ol ABC no reporte gasto +espa�ol ABC mantenerte contento +espa�ol ABC todos ganan con el anticipo +espa�ol ABC donacion uno negocios +espa�ol ABC todos ganan con la contratacion diff --git a/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/resume.html b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/resume.html new file mode 100644 index 0000000000..3e55dcd116 --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/resume.html @@ -0,0 +1,99 @@ +<div class="js-helper"> + <style type="text/css">#style_13209008630000000884_BODY{background-color:#FFFFFF;color:#000000;MARGIN:0px 1px;font-family:Tahoma,Arial,Verdana,Sans-Serif}#style_13209008630000000884 TD{font-size:13px;font-family:Tahoma,Arial,Verdana,Sans-Serif;vertical-align:top}#style_13209008630000000884 CAPTION{font-size:13px;font-weight:bold;text-align:left}#style_13209008630000000884 TR.style_13209008630000000884thead TD{font-weight:bold;text-align:center; padding-bottom:6px;padding-top:6px;padd [...] 
+ + </style> + <div class="mr_read__body" id="style_13209008630000000884"> + <base href="http://e.mail.ru/cgi-bin/" target="_self"/> + + <div id="style_13209008630000000884_BODY"> + + + <style type="text/css"></style> + + + <table border="0" cellpadding="0" cellspacing="0" height="100%" width="100%"> + <tr> + <td> + + </td> + </tr> + <tr> + <td height="100%" style="padding:5px"> + Здравствуйте, !<br> + <br> + Предлагаем Вам ознакомиться со списком зарегистрированных компаний, представители которых + просмотрели Ваше резюме за последние сутки.<br> + <br> + <li><a href="/cgi-bin/link?check=1&cnf=710139&url=http%3A%2F%2;0,0" target="_blank">Компании, + просмотревшие резюме № .</a> Новые: <b>1.</b></li> + <br> + <br> + Эти сведения предоставляются Вам исключительно для информации. Вы можете оперативно отслеживать, + какие именно компании нашли в базе данных Superjob Ваше резюме и заинтересовались им.<br> + <br> + Если Ваше резюме размещено в закрытом доступе, то его могут просматривать только те + работодатели, которым Вы отправили его самостоятельно.<br> + Историю отправки своего резюме Вы можете посмотреть по ссылке «История рассылки резюме».<br> + <br> + <br> + <b>Внимание!</b><br> + В процессе поиска работы Вы можете столкнуться с такими предложениями работодателей или кадровых + агентств, в которых Вас будут просить внести оплату (за предварительное обучение, за оформление + документов, за оформление обязательной страховки, на закупку первой партии продукции компании, + предназначенной для продажи и т.п.) или предоставить отсканированные копии документов (паспорта, + военного билета, трудовой книжки, водительских прав, пенсионного удостоверния и т.п.) для якобы + предварительного оформления или подтверждения данных, указанных в Вашем резюме.<br> + Это один из признаков мошенничества! 
Мы рекомендуем Вам очень осторожно относиться к таким + предложениям и по возможности избегать собеседований с подобными работодателями.<br> + <br> + Также мы настоятельно не рекомендуем отправлять платные SMS-сообщения на короткие номера для + получения контактов или другой информации о вакансии или же для получения результатов + тестирования. С организациями, которые оказывают подобные услуги, мы не сотрудничаем и + предупреждаем, что это тоже один из приемов мошенничества.<br> + <br> + <br> + <em>x</em> <a href="/cgi-bin/link?check=1&cnf=8d972a&url=http%3A%2F%2Fwww.sup;0,0" + target="_blank">Отключить + уведомления о новых просмотрах моих резюме</a><br> + <br> + По ссылкам в этом письме можно войти в систему без ввода пароля. + <br><br> + </td> + </tr> + <tr> + <td> + <span class="style_13209008630000000884noprint"><br><br>Если у Вас есть пожелания и идеи по улучшению сервиса Superjob, пожалуйста, <a + href="/cgi-bin/link?check=1;0,0" target="_blank">напишите нам</a>.<br><br></span> + <table border="0" cellpadding="10" cellspacing="0" class="style_13209008630000000884noprint" + width="100%"> + <tr> + <td align="center" style="border-top:1px solid #BACBD7;"> + <a href="/cgi-bin/link?check=1&cnf=8fa2f9&url=http%3A%2F%2Fwww.;0,0" + target="_blank"><big>Superjob — + Работа должна доставлять удовольствие!</big></a> + </td> + </tr> + </table> + <table border="0" cellpadding="0" cellspacing="1" class="style_13209008630000000884noprint" + width="100%"> + <tr> + <td align="center" style="padding:5px"> + <span style="color:#999999;font-size:8pt;">Письмо отправлено: xx.xx.xxxx xx:xx:xx</span> + </td> + </tr> + </table> + + </td> + </tr> + </table> + + + </div> + + + <base href="http://e.mail.ru/cgi-bin/" target="_self"/> + </div> +</div> + + + diff --git a/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/testIgnoreCharset.txt b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/testIgnoreCharset.txt 
new file mode 100644 index 0000000000..4673e04852 --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/testIgnoreCharset.txt @@ -0,0 +1,4 @@ + +ACTIVE AGE + +BALM diff --git a/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/testTXT_win-1252.txt b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/testTXT_win-1252.txt new file mode 100644 index 0000000000..519c95565a --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/testTXT_win-1252.txt @@ -0,0 +1 @@ +These smart quotes are the trigger for CharsetRecog_sbcs to think this is a �windows� encoding \ No newline at end of file diff --git a/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/test_ignore_IBM420.html b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/test_ignore_IBM420.html new file mode 100644 index 0000000000..2aecab221d Binary files /dev/null and b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/test_ignore_IBM420.html differ diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java index 7a2b434d63..d920388072 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java @@ -94,7 +94,23 @@ public class MojibusterEncodingDetector implements EncodingDetector { * 0x40–0xFE), so their presence is definitive proof that a GB18030 codec * is required to avoid replacement 
characters. */ - GB_FOUR_BYTE_UPGRADE + GB_FOUR_BYTE_UPGRADE, + /** + * Upgrade an ISO-8859-X result to its Windows-12XX equivalent when the + * probe contains at least one CRLF pair ({@code 0x0D 0x0A}) but no C1 + * bytes ({@code 0x80–0x9F}). + * + * <p>Files originating on Windows use CRLF line endings. The presence + * of a {@code \r\n} pair in a probe that is otherwise 7-bit ASCII (or + * has only high bytes above {@code 0x9F}) is weak evidence of Windows + * origin and therefore of a Windows code page. {@link Rule#ISO_TO_WINDOWS} + * already handles the C1-byte case definitively; this rule covers the + * weaker case where C1 bytes have not been seen but CRLF line endings + * suggest Windows origin. A bare {@code 0x0D} (old Mac Classic CR-only + * line ending) does <em>not</em> trigger this rule. Mirrors the legacy + * {@code UniversalEncodingListener.report()} heuristic.</p> + */ + CRLF_TO_WINDOWS } private static final long serialVersionUID = 1L; @@ -278,28 +294,28 @@ public class MojibusterEncodingDetector implements EncodingDetector { } byte[] probe = readProbe(input, maxProbeBytes); + // Strip BOM bytes before feature extraction. BOM detection is handled + // by BOMDetector (which runs earlier in the chain and returns DECLARATIVE). + // BOMs are excluded from training data; stripping ensures consistent + // model inference. A lying BOM (e.g. UTF-8 BOM on a Windows-1252 file) + // is caught by CharSoup comparing BOMDetector's DECLARATIVE claim against + // Mojibuster's byte-content analysis. probe = stripBom(probe); - if (probe.length == 0) { - return singleResult(StandardCharsets.UTF_8.name(), EncodingResult.CONFIDENCE_DEFINITIVE, Integer.MAX_VALUE); - } - + // An empty probe (e.g. empty file, or a file that was only a BOM) falls + // through to detectAll where isPureAscii returns true for a zero-length + // array, yielding the same windows-1252 default as any other pure-ASCII probe. 
return detectAll(probe, Integer.MAX_VALUE); } /** - * Structural gates: deterministic early exits before the general model runs. - * Only bulletproof, zero-false-positive checks belong here. - * EBCDIC discrimination is intentionally absent — it is handled by the sub-model. - */ - /** - * Applies structural encoding rules that produce CONFIDENCE_DEFINITIVE results. - * Returns non-null only when a byte-level pattern unambiguously identifies the charset - * (e.g. ISO-2022 escape sequences, sparse valid UTF-8 multibyte sequences). - * Pure ASCII is deliberately excluded here — ASCII is compatible with virtually all - * single-byte encodings, so it is NOT definitive. Use {@link #applyAsciiHeuristic} - * for the ASCII case. + * Applies structural encoding rules that produce {@link EncodingResult.ResultType#STRUCTURAL} + * results. Returns non-null only when a byte-level pattern unambiguously identifies the + * charset (ISO-2022 escape sequences, sparse valid UTF-8 multibyte sequences). + * + * Pure ASCII is deliberately excluded — ASCII is compatible with virtually all + * single-byte encodings and is not structurally definitive. */ - private Charset applyDefinitiveStructuralRules(byte[] probe) { + private Charset applyStructuralRules(byte[] probe) { // ISO-2022 before ASCII: all three variants are 7-bit so checkAscii fires first. Charset iso2022 = StructuralEncodingRules.detectIso2022(probe); if (iso2022 != null) { @@ -327,11 +343,11 @@ public class MojibusterEncodingDetector implements EncodingDetector { } /** - * Returns UTF-8 if the probe is pure 7-bit ASCII (no bytes ≥ 0x80, no null bytes). - * ASCII is a strict subset of UTF-8 and of every single-byte encoding, so this is - * a heuristic only — the confidence returned by the caller must be below - * CONFIDENCE_DEFINITIVE to allow downstream detectors (e.g. HTML meta charset) to - * override it without ambiguity. + * Returns true if the probe is pure 7-bit ASCII (no bytes ≥ 0x80, no null bytes). 
+ * ASCII is compatible with virtually every single-byte encoding, so this is a + * heuristic — we report US-ASCII to honestly reflect what the probe showed. + * CharSoup will upgrade to a declared encoding (e.g. ISO-8859-15) when the document + * contains an explicit declaration consistent with the ASCII bytes. */ private static boolean isPureAscii(byte[] probe) { return StructuralEncodingRules.checkAscii(probe) && !hasNullBytes(probe); @@ -396,17 +412,24 @@ public class MojibusterEncodingDetector implements EncodingDetector { boolean gates = enabledRules.contains(Rule.STRUCTURAL_GATES); if (gates) { - // Definitive structural rules (BOM, ISO-2022, sparse UTF-8) → CONFIDENCE_DEFINITIVE. - Charset definitive = applyDefinitiveStructuralRules(probe); - if (definitive != null) { - return singleResult(definitive.name(), EncodingResult.CONFIDENCE_DEFINITIVE, topN); + // Structural rules: byte-grammar proof (ISO-2022, sparse UTF-8). + Charset structural = applyStructuralRules(probe); + if (structural != null) { + return singleResult(structural.name(), 1.0f, + EncodingResult.ResultType.STRUCTURAL, topN); } - // ASCII heuristic: pure 7-bit ASCII is valid in virtually every single-byte - // encoding, so we report UTF-8 with a sub-definitive confidence. This allows - // an HTML meta-charset declaration (which IS CONFIDENCE_DEFINITIVE) to override - // the ASCII heuristic without non-deterministic tie-breaking. + // Pure ASCII: no high bytes seen in the probe. We default to windows-1252 — + // the WHATWG-canonical "Western Latin, I saw only ASCII bytes" encoding. + // HTML5 explicitly defines ISO-8859-1 as an alias for windows-1252, making + // windows-1252 the right default: it is the correct superset, it avoids the + // ambiguity between ISO-8859-1 and windows-1252 in the 0x80–0x9F range, and + // it keeps the no-hint path consistent with the HTML-spec path (where a stated + // "charset=iso-8859-1" is normalized to windows-1252 by StandardHtmlEncodingDetector). 
+ // CharSoup will further upgrade to any compatible DECLARATIVE encoding + // (e.g. an HTML meta charset=UTF-8) when one is present and consistent. if (isPureAscii(probe)) { - return singleResult(StandardCharsets.UTF_8.name(), 0.75f, topN); + return singleResult("windows-1252", 0.5f, + EncodingResult.ResultType.STATISTICAL, topN); } } @@ -420,14 +443,12 @@ public class MojibusterEncodingDetector implements EncodingDetector { return runEbcdicSubModel(probe, topN); } - // Grammar filtering can leave the list empty on very short probes (e.g. a single - // high byte that is a valid CJK lead but has an invalid trail byte). Fall back to - // UTF-8 rather than returning empty and causing AutoDetectReader to throw. + // If the model had no evidence (probe too short or all tokens filtered), fall back to + // windows-1252 at very low confidence rather than returning empty and letting + // AutoDetectReader throw. CharSoup will override this with any DECLARATIVE hint. if (results.isEmpty()) { - return singleResult(StandardCharsets.UTF_8.name(), - EncodingResult.CONFIDENCE_DEFINITIVE / 2, Integer.MAX_VALUE); + return singleResult("windows-1252", 0.1f, EncodingResult.ResultType.STATISTICAL, topN); } - return results; } @@ -443,11 +464,29 @@ public class MojibusterEncodingDetector implements EncodingDetector { } } - List<EncodingResult> results = selectByLogitGap(model, logits, topN); + // For short probes the model has limited signal; widen the candidate set so + // that CharSoup's language arbitration can rescue the correct answer even when + // the gap between competitors exceeds LOGIT_GAP. 
+ List<EncodingResult> results; + if (probe.length < 50) { + results = selectTopN(model, logits, 3); + } else if (probe.length < 100) { + results = selectTopN(model, logits, 2); + } else { + results = selectByLogitGap(model, logits, topN); + } if (enabledRules.contains(Rule.ISO_TO_WINDOWS) && StructuralEncodingRules.hasC1Bytes(probe)) { results = upgradeIsoToWindows(results); } + // CRLF_TO_WINDOWS: when C1 bytes were absent (ISO_TO_WINDOWS didn't fire) but + // CRLF pairs suggest Windows line endings, apply the same ISO→Windows upgrade as + // weak evidence of Windows file origin. If ISO_TO_WINDOWS already fired, the + // results are already Windows-12XX and upgradeIsoToWindows is a no-op. + // Bare CR (old Mac Classic line endings) does NOT trigger this rule. + if (enabledRules.contains(Rule.CRLF_TO_WINDOWS) && StructuralEncodingRules.hasCrlfBytes(probe)) { + results = upgradeIsoToWindows(results); + } if (enabledRules.contains(Rule.CJK_GRAMMAR)) { results = refineCjkResults(probe, results); } @@ -475,6 +514,78 @@ public class MojibusterEncodingDetector implements EncodingDetector { * hiding plausible alternatives that downstream arbitrators (e.g. CharSoup) * should evaluate. Linear confidence within the gap preserves the signal.</p> */ + /** + * Maximum confidence assigned to a STATISTICAL model result. Kept strictly + * below 1.0 so that statistical results are never mistaken for STRUCTURAL or + * DECLARATIVE evidence by downstream arbitrators (e.g. CharSoupEncodingDetector). + * The top result from the logit-gap window always maps to this value. + */ + private static final float MAX_STATISTICAL_CONFIDENCE = 0.99f; + + /** + * Return the top {@code n} single-byte/CJK candidates by logit rank, regardless of gap. + * Used for short probes where the model has limited signal and we want + * CharSoup's language arbitration to have multiple candidates to compare. 
+ * <p> + * Wide encodings (UTF-16/32) are excluded: stride-2 features can spuriously + * boost them on very short probes that lack the null-byte density that + * genuinely characterises UTF-16/32. A 9-byte filename without a BOM is + * never UTF-16/32 in practice. + * <p> + * Confidence is still scaled relative to the logit-gap window so that + * results remain in the statistical range below DECLARATIVE/STRUCTURAL. + */ + private static List<EncodingResult> selectTopN(LinearModel m, float[] logits, int n) { + // Collect all positive-logit, non-excluded candidates with their array index. + // We sort by RAW LOGIT (not sigmoid) so that the model's actual ranking is + // preserved even when all logits are large-positive (sigmoid ≈ 1.0 for all). + // Example: GB18030=43, EUC-JP=28, Big5=21 — all sigmoid≈0.99 — would tie on + // sigmoid and fall back to label-insertion order; sorting by logit keeps GB18030 first. + List<int[]> candidates = new ArrayList<>(); // [label-index] + for (int i = 0; i < logits.length; i++) { + // logit ≤ 0 means sigmoid ≤ 0.5 — the model actively disfavours this encoding. + if (logits[i] <= 0) { + continue; + } + String lbl = m.getLabel(i); + if (isExcludedFromShortProbe(lbl)) { + continue; + } + if (labelToCharset(lbl) == null) { + continue; + } + candidates.add(new int[]{i}); + } + // Sort descending by logit so model ranking is preserved. + candidates.sort((a, b) -> Float.compare(logits[b[0]], logits[a[0]])); + + // Take the top N and assign sigmoid confidence so downstream code has a meaningful score. 
+ List<EncodingResult> result = new ArrayList<>(Math.min(n, candidates.size())); + for (int rank = 0; rank < Math.min(n, candidates.size()); rank++) { + int i = candidates.get(rank)[0]; + String lbl = m.getLabel(i); + Charset cs = labelToCharset(lbl); + float conf = (1f / (1f + (float) Math.exp(-logits[i]))) * MAX_STATISTICAL_CONFIDENCE; + result.add(new EncodingResult(cs, conf, lbl, EncodingResult.ResultType.STATISTICAL)); + } + return result; + } + + /** + * Returns true for encodings that should be excluded from short-probe top-N selection. + * <ul> + * <li>Wide encodings (UTF-16/32): stride-2 features spuriously boost them on short + * probes that lack the null-byte density that genuinely characterises UTF-16/32.</li> + * <li>EBCDIC family (IBM4xx, IBM500, "EBCDIC" routing label): EBCDIC has its own + * dedicated sub-model pipeline and should never surface as a candidate for + * short single-byte or CJK content.</li> + * </ul> + */ + private static boolean isExcludedFromShortProbe(String label) { + return label.startsWith("UTF-16") || label.startsWith("UTF-32") + || label.startsWith("IBM") || label.equals("EBCDIC"); + } + private static List<EncodingResult> selectByLogitGap(LinearModel m, float[] logits, int topN) { float maxLogit = Float.NEGATIVE_INFINITY; for (float l : logits) { @@ -486,11 +597,14 @@ public class MojibusterEncodingDetector implements EncodingDetector { List<EncodingResult> results = new ArrayList<>(); for (int i = 0; i < logits.length; i++) { if (logits[i] >= floor) { - float conf = (logits[i] - floor) / LOGIT_GAP; + // Scale to [0, MAX_STATISTICAL_CONFIDENCE] so no statistical result + // reaches 1.0, keeping the range unambiguously below STRUCTURAL/DECLARATIVE. 
+ float conf = ((logits[i] - floor) / LOGIT_GAP) * MAX_STATISTICAL_CONFIDENCE; String lbl = m.getLabel(i); Charset cs = labelToCharset(lbl); if (cs != null) { - results.add(new EncodingResult(cs, conf, lbl)); + results.add(new EncodingResult(cs, conf, lbl, + EncodingResult.ResultType.STATISTICAL)); } } } @@ -552,7 +666,11 @@ public class MojibusterEncodingDetector implements EncodingDetector { return results; } - // Score every CJK charset in the result list. + // Grammar-filter CJK charsets: drop those that produce invalid byte sequences + // (score == 0 means the grammar walker found bad bytes — the model was wrong). + // Charsets that pass grammar keep their model confidence unchanged so that + // all candidates remain on the same sigmoid scale for CharSoup to compare. + // Non-CJK charsets pass through unchanged. List<EncodingResult> refined = new ArrayList<>(results.size()); for (EncodingResult er : results) { if (!CjkEncodingRules.isCjk(er.getCharset())) { @@ -561,17 +679,13 @@ public class MojibusterEncodingDetector implements EncodingDetector { } int score = CjkEncodingRules.match(probe, er.getCharset()); if (score == 0) { - // grammar rejects this charset — drop entirely - } else if (score >= CjkEncodingRules.CLEAN_SHORT_PROBE_CONFIDENCE) { - // structurally clean (bad == 0) — use grammar confidence - refined.add(new EncodingResult(er.getCharset(), score / 100f)); - } else { - // some bad bytes within tolerance — keep model confidence - refined.add(er); + // grammar rejects this charset entirely — drop it + continue; } + // Grammar passes: keep the model's sigmoid confidence so everything + // is on the same scale when CharSoup compares candidates. 
+ refined.add(er); } - - refined.sort((a, b) -> Float.compare(b.getConfidence(), a.getConfidence())); return refined; } @@ -617,7 +731,8 @@ public class MojibusterEncodingDetector implements EncodingDetector { return upgraded; } - private static List<EncodingResult> singleResult(String label, float confidence, int topN) { + private static List<EncodingResult> singleResult(String label, float confidence, + EncodingResult.ResultType type, int topN) { if (topN <= 0) { return Collections.emptyList(); } @@ -625,7 +740,7 @@ public class MojibusterEncodingDetector implements EncodingDetector { if (cs == null) { return Collections.emptyList(); } - return List.of(new EncodingResult(cs, confidence, label)); + return List.of(new EncodingResult(cs, confidence, label, type)); } /** @@ -669,6 +784,10 @@ public class MojibusterEncodingDetector implements EncodingDetector { return model; } + public LinearModel getEbcdicModel() { + return ebcdicModel; + } + public EnumSet<Rule> getEnabledRules() { return EnumSet.copyOf(enabledRules.isEmpty() ? EnumSet.noneOf(Rule.class) : enabledRules); } diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java index a87dd71410..40d409cae6 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java @@ -373,6 +373,35 @@ public final class StructuralEncodingRules { */ private static final double IBM500_LATIN_THRESHOLD = 0.25; + /** + * Returns {@code true} if the probe contains at least one CRLF pair + * ({@code 0x0D 0x0A}). + * + * <p>Files originating on Windows use CRLF as the line separator. 
+ * The presence of a {@code 0x0D 0x0A} pair in a probe that is otherwise + * 7-bit ASCII is weak evidence that the file was created on Windows and + * therefore more likely to use a Windows code page (e.g. windows-1252) + * than a Unix-origin ISO-8859-X encoding for any high-byte content + * beyond the probe window.</p> + * + * <p>A bare {@code 0x0D} without a following {@code 0x0A} is <em>not</em> + * counted: classic Mac OS used bare CR as its line ending, and that is a + * different case that does not imply Windows origin.</p> + */ + public static boolean hasCrlfBytes(byte[] bytes) { + return hasCrlfBytes(bytes, 0, bytes.length); + } + + public static boolean hasCrlfBytes(byte[] bytes, int offset, int length) { + int end = offset + length; + for (int i = offset; i < end - 1; i++) { + if (bytes[i] == 0x0D && bytes[i + 1] == 0x0A) { + return true; + } + } + return false; + } + /** * Returns {@code true} if the probe contains any byte in the C1 control * range {@code 0x80–0x9F}. diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect-ebcdic.bin b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect-ebcdic.bin index f76be8560d..191cdaeedf 100644 Binary files a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect-ebcdic.bin and b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect-ebcdic.bin differ diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect.bin b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect.bin index 31dd657c92..70421e6bb8 100644 Binary files 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect.bin and b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect.bin differ diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/EbcdicRoutingTest.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/EbcdicRoutingTest.java new file mode 100644 index 0000000000..60c0803d43 --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/EbcdicRoutingTest.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.ml.chardetect; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.nio.charset.Charset; +import java.util.Arrays; +import java.util.List; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import org.apache.tika.detect.EncodingResult; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.ml.LinearModel; +import org.apache.tika.parser.ParseContext; + +/** + * Verifies the two-phase EBCDIC detection pipeline: + * <ol> + * <li>General model emits {@code "EBCDIC"} routing label for EBCDIC-family bytes.</li> + * <li>{@code MojibusterEncodingDetector} routes to the EBCDIC sub-model, which + * returns a specific IBM variant (IBM500, IBM420, IBM855, etc.) — never the + * bare {@code "EBCDIC"} routing label.</li> + * </ol> + */ +public class EbcdicRoutingTest { + + private static MojibusterEncodingDetector detector; + + // Representative English prose encoded in IBM500 (International EBCDIC). + // Generated via: text.getBytes(Charset.forName("IBM500")) + private static final byte[] IBM500_BYTES = makeEbcdic("IBM500", + "The quick brown fox jumps over the lazy dog. " + + "This sentence contains every letter of the English alphabet. " + + "EBCDIC encoding is used on IBM mainframe systems. " + + "Fields are often fixed-width and space-padded in EBCDIC files."); + + // Russian text encoded in IBM855 (Cyrillic EBCDIC). + private static final byte[] IBM855_BYTES = makeEbcdic("IBM855", + "\u041f\u0440\u0438\u0432\u0435\u0442 \u043c\u0438\u0440! " + // Привет мир! + "\u042d\u0442\u043e \u0442\u0435\u043a\u0441\u0442 \u043d\u0430 " + // Это текст на + "\u0440\u0443\u0441\u0441\u043a\u043e\u043c \u044f\u0437\u044b\u043a\u0435. " + // русском языке. 
+ "\u041a\u043e\u0434\u0438\u0440\u043e\u0432\u043a\u0430 IBM855 " + // Кодировка IBM855 + "\u0438\u0441\u043f\u043e\u043b\u044c\u0437\u0443\u0435\u0442\u0441\u044f " + // используется + "\u043d\u0430 \u043c\u0435\u0439\u043d\u0444\u0440\u0435\u0439\u043c\u0430\u0445."); // на мейнфреймах. + + private static byte[] makeEbcdic(String charsetName, String text) { + try { + return text.getBytes(Charset.forName(charsetName)); + } catch (Exception e) { + throw new RuntimeException("Cannot encode test data as " + charsetName, e); + } + } + + @BeforeAll + static void setUp() { + detector = new MojibusterEncodingDetector(); + } + + /** + * The general model must have exactly one EBCDIC routing label. + * Individual IBM variants must NOT appear as top-level labels — they live + * only in the EBCDIC sub-model. + */ + @Test + public void generalModelHasSingleEbcdicRoutingLabel() { + LinearModel general = detector.getModel(); + String[] labels = general.getLabels(); + + assertTrue(Arrays.asList(labels).contains("EBCDIC"), + "General model must have an 'EBCDIC' routing label"); + + // No individual IBM variant should appear as a direct label in the general model — + // they live only in the EBCDIC sub-model + for (String label : labels) { + assertFalse(label.startsWith("IBM"), + "General model must not contain individual IBM variant: " + label); + } + } + + /** + * IBM500 bytes must route through the sub-model and return a specific IBM variant, + * not the bare "EBCDIC" routing label. 
+ */ + @Test + public void ibm500RoutesToSubModel() throws Exception { + try (TikaInputStream tis = TikaInputStream.get(IBM500_BYTES)) { + List<EncodingResult> results = detector.detect(tis, new Metadata(), new ParseContext()); + assertFalse(results.isEmpty(), "Should detect something for IBM500 bytes"); + String topLabel = results.get(0).getLabel(); + assertNotEquals("EBCDIC", topLabel, + "Result must be a specific IBM variant, not the routing label"); + assertTrue(topLabel.startsWith("IBM"), + "Result should be an IBM variant, got: " + topLabel); + } + } + + /** + * IBM855 (Cyrillic EBCDIC) bytes must similarly route through the sub-model. + */ + @Test + public void ibm855RoutesToSubModel() throws Exception { + try (TikaInputStream tis = TikaInputStream.get(IBM855_BYTES)) { + List<EncodingResult> results = detector.detect(tis, new Metadata(), new ParseContext()); + assertFalse(results.isEmpty(), "Should detect something for IBM855 bytes"); + String topLabel = results.get(0).getLabel(); + assertNotEquals("EBCDIC", topLabel, + "Result must be a specific IBM variant, not the routing label"); + assertTrue(topLabel.startsWith("IBM"), + "Result should be an IBM variant, got: " + topLabel); + } + } +} diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java index 8c94ec1e62..e5ff43e10e 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java @@ -20,31 +20,34 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; import java.nio.charset.Charset; -import 
java.util.Arrays; import java.util.List; -import java.util.Locale; import org.junit.jupiter.api.Test; +import org.apache.tika.detect.DefaultEncodingDetector; import org.apache.tika.detect.EncodingDetectorContext; import org.apache.tika.detect.EncodingResult; import org.apache.tika.io.TikaInputStream; import org.apache.tika.langdetect.charsoup.CharSoupEncodingDetector; import org.apache.tika.metadata.Metadata; -import org.apache.tika.ml.LinearModel; import org.apache.tika.parser.ParseContext; /** - * Diagnostic: raw logits and CharSoup arbitration for Shift-JIS zip entry name bytes. + * Integration tests for charset detection of short byte sequences typical of + * ZIP entry names — a particularly hard case because the probes are tiny (6-23 + * bytes) and structurally valid in several encodings simultaneously. * - * The v2 model (28 classes) removes the UTF-16/32 labels that were confusing the model. - * With v2, Shift-JIS (logit ~10.5) scores clearly above GB18030 (logit ~6.2) for the - * bytes "文章1.txt" in Shift-JIS encoding. + * Detection strategy: Mojibuster ranks candidates by raw logit; CharSoup + * arbitrates using language signal (positive max-logit wins). 
*/ public class ZipFilenameDetectionTest { // 文章1.txt in Shift-JIS (9 raw bytes from a real zip entry) - private static final byte[] SJIS_RAW = hexToBytes("95b68fcd312e747874"); + private static final byte[] SJIS_RAW = hexToBytes("95b68fcd312e747874"); + // 文章2.txt in Shift-JIS (same but '2' instead of '1') + private static final byte[] SJIS_RAW2 = hexToBytes("95b68fcd322e747874"); + // 审计压缩包文件检索测试/ in GBK (23 bytes from gbk.zip) + private static final byte[] GBK_RAW = hexToBytes("c9f3bcc6d1b9cbf5b0fccec4bcfebceccbf7b2e2cad42f"); private static byte[] hexToBytes(String hex) { byte[] b = new byte[hex.length() / 2]; @@ -54,79 +57,21 @@ public class ZipFilenameDetectionTest { return b; } - private boolean isWideUnicode(String label) { - return label.startsWith("UTF-16") || label.startsWith("UTF-32"); - } - - @Test - public void printModelLabels() throws Exception { - LinearModel model = new MojibusterEncodingDetector().getModel(); - String[] labels = model.getLabels(); - System.out.println("Model labels (" + labels.length + "):"); - for (String l : labels) { - System.out.println(" " + l); - } - long wideCount = Arrays.stream(labels).filter(this::isWideUnicode).count(); - System.out.println("Wide-unicode labels in model: " + wideCount - + " (detected natively via stride-2 features)"); - assertTrue(wideCount >= 4, "Model should have UTF-16/32 labels (LE+BE for each)"); - } - - @Test - public void diagnoseLogits() throws Exception { - MojibusterEncodingDetector detector = new MojibusterEncodingDetector(); - LinearModel model = detector.getModel(); - ByteNgramFeatureExtractor extractor = - new ByteNgramFeatureExtractor(model.getNumBuckets()); - String[] labels = model.getLabels(); - - float[] logits = model.predictLogits(extractor.extract(SJIS_RAW)); - - Integer[] idx = new Integer[labels.length]; - for (int i = 0; i < idx.length; i++) { - idx[i] = i; - } - Arrays.sort(idx, (a, b) -> Float.compare(logits[b], logits[a])); - - System.out.printf(Locale.ROOT, "%n=== Raw 
logits for 文章1.txt (9 bytes) ===%n"); - System.out.printf(Locale.ROOT, "%-24s %8s%n", "charset", "logit"); - System.out.println("-".repeat(35)); - float shiftJisLogit = Float.NEGATIVE_INFINITY; - float gb18030Logit = Float.NEGATIVE_INFINITY; - for (int rank = 0; rank < labels.length; rank++) { - int i = idx[rank]; - boolean cjk = labels[i].contains("JIS") || labels[i].contains("GB") - || labels[i].contains("Big5") || labels[i].contains("EUC"); - if (rank < 6 || cjk) { - System.out.printf(Locale.ROOT, " %-24s %8.2f%n", labels[i], logits[i]); - } - if ("Shift_JIS".equals(labels[i])) { - shiftJisLogit = logits[i]; - } else if ("GB18030".equals(labels[i])) { - gb18030Logit = logits[i]; - } - } - // Verify Shift-JIS ranks ahead of GB18030 on raw (un-tiled) bytes. - // ZipParser no longer tiles short filenames, so this is the actual input. - assertTrue(shiftJisLogit > gb18030Logit, - String.format(Locale.ROOT, - "Shift_JIS logit (%.2f) should beat GB18030 logit (%.2f)", - shiftJisLogit, gb18030Logit)); - } - /** - * Verifies CharSoup correctly picks Shift-JIS when it and GB18030 are both candidates. - * With v2 model, Mojibuster already ranks Shift-JIS above GB18030 (logit ~10.5 vs ~6.2). - * This test uses Shift-JIS as the higher-confidence candidate to reflect that reality. + * CharSoup should confirm Shift-JIS even when Mojibuster ranks Big5-HKSCS first, + * because the language model gives a higher logit to the Japanese text decoded + * from the same bytes. 
*/ @Test - public void charSoupPicksShiftJis() throws Exception { + public void charSoupOverridesModelRankingForShiftJis() throws Exception { + Charset big5 = Charset.forName("Big5-HKSCS"); Charset shiftJis = Charset.forName("Shift_JIS"); - Charset gb18030 = Charset.forName("GB18030"); EncodingDetectorContext ctx = new EncodingDetectorContext(); - ctx.addResult(List.of(new EncodingResult(shiftJis, 0.6f)), "MojibusterEncodingDetector"); - ctx.addResult(List.of(new EncodingResult(gb18030, 0.5f)), "MojibusterEncodingDetector"); + ctx.addResult(List.of( + new EncodingResult(big5, 0.9f, "Big5-HKSCS", EncodingResult.ResultType.STATISTICAL), + new EncodingResult(shiftJis, 0.3f, "Shift_JIS", EncodingResult.ResultType.STATISTICAL) + ), "MojibusterEncodingDetector"); ParseContext parseContext = new ParseContext(); parseContext.set(EncodingDetectorContext.class, ctx); @@ -134,17 +79,46 @@ public class ZipFilenameDetectionTest { CharSoupEncodingDetector charSoup = new CharSoupEncodingDetector(); try (TikaInputStream tis = TikaInputStream.get(SJIS_RAW)) { List<EncodingResult> result = charSoup.detect(tis, new Metadata(), parseContext); + assertTrue(!result.isEmpty(), "CharSoup should return a result"); + assertEquals(shiftJis, result.get(0).getCharset(), + "CharSoup should pick Shift-JIS (文章) over Big5-HKSCS via language signal"); + } + } - System.out.println("\n=== CharSoup arbitration: Shift-JIS(0.6) vs GB18030(0.5) ==="); - System.out.println("arbitration: " + ctx.getArbitrationInfo()); - if (!result.isEmpty()) { - System.out.printf(Locale.ROOT, "winner: %s (conf=%.4f)%n", - result.get(0).getCharset().name(), result.get(0).getConfidence()); - assertEquals(shiftJis, result.get(0).getCharset(), - "CharSoup should confirm Shift-JIS (文章) over GB18030"); - } else { - System.out.println("result: empty — CharSoup abstained (Mojibuster winner stands)"); + /** + * Full pipeline (BOM → Metadata → Mojibuster → StandardHtml → CharSoup) run + * sequentially on two entries differing only 
in byte 5 (0x31 vs 0x32), simulating + * what ZipParser does when iterating entries with the same ParseContext. + */ + @Test + public void fullPipelineDetectsBothSjisEntries() throws Exception { + DefaultEncodingDetector detector = new DefaultEncodingDetector(); + Metadata parentMeta = new Metadata(); + ParseContext outerContext = new ParseContext(); + + for (byte[] raw : new byte[][]{SJIS_RAW, SJIS_RAW2}) { + String label = (raw == SJIS_RAW) ? "文章1.txt" : "文章2.txt"; + try (TikaInputStream tis = TikaInputStream.get(raw)) { + List<EncodingResult> results = detector.detect(tis, parentMeta, outerContext); + String charset = results.isEmpty() ? "(empty)" : results.get(0).getCharset().name(); + assertTrue(!results.isEmpty() && "Shift_JIS".equals(results.get(0).getCharset().name()), + label + " should be detected as Shift_JIS, got: " + charset); } } } + + /** + * Full pipeline should detect GBK-encoded entry names as GB18030. + */ + @Test + public void fullPipelineDetectsGbkEntry() throws Exception { + DefaultEncodingDetector detector = new DefaultEncodingDetector(); + Metadata meta = new Metadata(); + try (TikaInputStream tis = TikaInputStream.get(GBK_RAW)) { + List<EncodingResult> results = detector.detect(tis, meta, new ParseContext()); + String charset = results.isEmpty() ? 
"(empty)" : results.get(0).getCharset().name(); + assertTrue(!results.isEmpty() && results.get(0).getCharset().name().startsWith("GB"), + "GBK entry should be detected as GB18030/GBK, got: " + charset); + } + } } diff --git a/tika-encoding-detectors/tika-encoding-detector-universal/pom.xml b/tika-encoding-detectors/tika-encoding-detector-universal/pom.xml index e461556a93..36f0f8fd53 100644 --- a/tika-encoding-detectors/tika-encoding-detector-universal/pom.xml +++ b/tika-encoding-detectors/tika-encoding-detector-universal/pom.xml @@ -39,6 +39,12 @@ <artifactId>tika-core</artifactId> <version>${revision}</version> </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-annotation-processor</artifactId> + <version>${revision}</version> + <scope>provided</scope> + </dependency> <dependency> <groupId>com.github.albfernandez</groupId> <artifactId>juniversalchardet</artifactId> diff --git a/tika-encoding-detectors/tika-encoding-detector-universal/src/main/java/org/apache/tika/parser/txt/UniversalEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-universal/src/main/java/org/apache/tika/parser/txt/UniversalEncodingDetector.java index 86d1780926..4e1c2dc1a1 100644 --- a/tika-encoding-detectors/tika-encoding-detector-universal/src/main/java/org/apache/tika/parser/txt/UniversalEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-universal/src/main/java/org/apache/tika/parser/txt/UniversalEncodingDetector.java @@ -31,7 +31,7 @@ import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; -@TikaComponent(spi = false) +@TikaComponent(spi = false, name = "universal-encoding-detector") public class UniversalEncodingDetector implements EncodingDetector { private static final int BUFSIZE = 1024; diff --git a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java 
b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java index 75fccbb806..b408ee1841 100644 --- a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java +++ b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java @@ -274,19 +274,13 @@ public class CharSoupLanguageDetector extends LanguageDetector { } /** - * Minimum confidence (inverse logit of the max logit) for a candidate to - * be considered a genuine language match. If no candidate exceeds this - * threshold, the comparison is inconclusive and {@code null} is returned. - * <p> - * 0.88 corresponds to a raw logit of ~2.0. Typical values: - * <ul> - * <li>Arabic (windows-1256): 0.9999994 (logit +14.3)</li> - * <li>UTF-8 garbled: 0.97 (logit +3.5)</li> - * <li>EBCDIC garbage: 0.79 (logit +1.3) — below threshold</li> - * <li>Short English: 0.025 (logit -3.7) — well below threshold</li> - * </ul> + * The language model's max pre-softmax logit must be positive (sigmoid > 0.5) + * for a candidate to be considered a genuine language match. A positive logit + * means the model actively predicts some language as more likely than random; + * a negative logit means the text is too short, too junk-heavy, or too ambiguous + * for any language to stand out. When the best candidate's logit is positive, + * we always return it — the model's relative ordering is the signal we trust. */ - private static final float MIN_CONFIDENCE_THRESHOLD = 0.88f; /** * Maximum ratio of junk characters (U+FFFD replacement chars + C0/C1 @@ -307,24 +301,24 @@ public class CharSoupLanguageDetector extends LanguageDetector { * Compare multiple candidate texts and return the key of the one with * the strongest language signal. Candidates with a high ratio of * replacement or control characters are discarded first. 
Remaining - * candidates are scored using the inverse logit (sigmoid) of the - * model's maximum pre-softmax logit. + * candidates are scored using the model's maximum pre-softmax logit. * <p> - * Returns {@code null} if no candidate exceeds the minimum confidence - * threshold, indicating the comparison is inconclusive. + * The winning candidate is returned if its max logit is positive (sigmoid > 0.5), + * meaning the model actively predicts some language as more likely than random. + * Returns {@code null} if the map is empty, all candidates are junk, or the + * best candidate's logit is non-positive (model has no real signal). * * @param candidates map of arbitrary keys to candidate text strings * @param <K> key type (e.g., {@link java.nio.charset.Charset}) * @return the key whose text has the strongest language signal, - * or {@code null} if the map is empty or no candidate is - * confident enough + * or {@code null} if no candidate has a positive language signal */ public <K> K compareLanguageSignal(Map<K, String> candidates) { if (candidates.isEmpty()) { return null; } - float bestConfidence = Float.NEGATIVE_INFINITY; + float bestMaxLogit = Float.NEGATIVE_INFINITY; K bestKey = null; for (Map.Entry<K, String> entry : candidates.entrySet()) { @@ -337,24 +331,22 @@ public class CharSoupLanguageDetector extends LanguageDetector { int[] features = EXTRACTOR.extract(entry.getValue()); float[] logits = MODEL.predictLogits(features); - float confidence = sigmoid(max(logits)); + float maxLogit = max(logits); - LOG.debug("compareLanguageSignal: {} -> confidence={}", - entry.getKey(), confidence); + LOG.debug("compareLanguageSignal: {} -> maxLogit={}", entry.getKey(), maxLogit); - if (confidence > bestConfidence) { - bestConfidence = confidence; + if (maxLogit > bestMaxLogit) { + bestMaxLogit = maxLogit; bestKey = entry.getKey(); } } - if (bestConfidence < MIN_CONFIDENCE_THRESHOLD) { - LOG.debug("compareLanguageSignal: inconclusive (bestConfidence={} < {})", - 
bestConfidence, MIN_CONFIDENCE_THRESHOLD); - return null; + if (bestKey != null && bestMaxLogit > 0) { + return bestKey; } - return bestKey; + LOG.debug("compareLanguageSignal: inconclusive (bestMaxLogit={})", bestMaxLogit); + return null; } /** @@ -362,7 +354,7 @@ public class CharSoupLanguageDetector extends LanguageDetector { * with the highest logit, its raw logit value, and sigmoid(maxLogit). * Package-private for testing. */ - static float[] maxLogitInfo(String text) { + public static float[] maxLogitInfo(String text) { int[] features = EXTRACTOR.extract(text); float[] logits = MODEL.predictLogits(features); int bestIdx = 0; @@ -375,7 +367,7 @@ public class CharSoupLanguageDetector extends LanguageDetector { } /** Returns the label for a class index (for use alongside {@link #maxLogitInfo}). */ - static String labelAt(int idx) { + public static String labelAt(int idx) { return MODEL.getLabel(idx); } @@ -383,6 +375,24 @@ public class CharSoupLanguageDetector extends LanguageDetector { * Ratio of junk characters (U+FFFD replacement + ISO control + C1 * control range U+0080-U+009F) to total characters. High values * indicate a wrong-charset decoding. + * <p> + * TODO: consider also counting non-ASCII, non-alphabetic, non-digit characters + * (e.g. bullet U+2022, pilcrow U+00B6) as fractional junk (weight ~0.3). + * Single-byte encodings like windows-1256 assign punctuation/symbols to byte + * positions like 0x95 and 0xB6 that multi-byte encodings (Shift_JIS, GB18030) + * use as lead bytes for alphabetic characters. When those bytes appear in text + * that should be meaningful (e.g. filenames), the single-byte interpretation + * "wastes" bytes on punctuation while the multi-byte interpretation yields 100% + * alphabetic content. Counting such punctuation as partial junk would lower the + * MAX_JUNK_RATIO gate for those candidates and pass the decision to the language + * model sooner. 
Needs careful tuning: legitimate body text can intentionally + * contain bullet lists, em-dashes, etc. in windows-125x encodings. + * Counter-example: Chinese GB18030 bytes decoded as UTF-16 produce pairs + * interpreted as Unicode code points — many of which happen to be alphabetic + * (Unicode has alphabetic characters scattered throughout the range), so + * alphabetic yield would look high even for complete mojibake. The language + * model already handles this correctly; the alphabetic density heuristic alone + * would not. */ static float junkRatio(String text) { if (text == null || text.isEmpty()) { @@ -394,8 +404,17 @@ int cp = text.codePointAt(i); i += Character.charCount(cp); total++; - if (cp == 0xFFFD || Character.isISOControl(cp)) { + // U+FFFD = replacement char (wrong-charset decode) + // C1 control range 0x80-0x9F = garbage when a byte is decoded as ISO-8859-1 + // (correct Windows-125x decoding almost never produces these code points; only + // the few unassigned windows-1252 bytes, e.g. 0x81/0x8D/0x90/0x9D, map there). + // Ordinary whitespace (tab, LF, CR, FF, VT) is not junk — it appears in + // source code and structured documents regardless of charset. + if (cp == 0xFFFD) { junk++; + } else if (Character.isISOControl(cp)) { + if (cp != 0x09 && cp != 0x0A && cp != 0x0B && cp != 0x0C && cp != 0x0D) { + junk++; + } } } return total == 0 ?
0f : (float) junk / total; diff --git a/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractorTest.java b/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractorTest.java index 80e71a9bdf..949bd0bc05 100644 --- a/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractorTest.java +++ b/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractorTest.java @@ -46,17 +46,19 @@ public class ByteNgramFeatureExtractorTest { } @Test - public void testAsciiOnlyProducesNoFeatures() { + public void testAsciiOnlyProducesStride2Features() { ByteNgramFeatureExtractor ext = new ByteNgramFeatureExtractor(NUM_BUCKETS); - // All bytes < 0x80 are skipped — HTML tags, ASCII text, etc. produce nothing + // Stride-1 skips bytes < 0x80, but stride-2 covers ALL bytes (needed for UTF-16/32 + // null-byte detection). "hello world" (11 bytes) → 5 stride-2 pairs at positions + // 0,2,4,6,8 → 5 features total. 
byte[] ascii = "hello world".getBytes(java.nio.charset.StandardCharsets.US_ASCII); - assertEquals(0, sum(ext.extract(ascii))); + assertEquals(5, sum(ext.extract(ascii))); } @Test public void testSingleHighByteProducesOneUnigram() { ByteNgramFeatureExtractor ext = new ByteNgramFeatureExtractor(NUM_BUCKETS); - // One high byte → one unigram, no bigram (no following byte) + // One high byte, no following byte → 1 stride-1 unigram; no stride-2 pair int[] counts = ext.extract(new byte[]{(byte) 0xE0}); assertEquals(1, sum(counts)); } @@ -64,25 +66,31 @@ public class ByteNgramFeatureExtractorTest { @Test public void testTwoHighBytesProduceUnigramAndBigram() { ByteNgramFeatureExtractor ext = new ByteNgramFeatureExtractor(NUM_BUCKETS); - // 0xE0 → unigram; (0xE0, 0xE1) → bigram; 0xE1 → unigram = 3 features + // Stride-1: unigram(0xE0) + bigram(0xE0,0xE1) + unigram(0xE1) = 3 + // Stride-2: pair(0xE0,0xE1) at position 0 = 1 + // Total = 4 int[] counts = ext.extract(new byte[]{(byte) 0xE0, (byte) 0xE1}); - assertEquals(3, sum(counts)); + assertEquals(4, sum(counts)); } @Test - public void testHighByteFollowedByAsciiProducesUnigramAndBigram() { + public void testHighByteFollowedByAsciiProducesUnigramBigramAndAnchoredBigram() { ByteNgramFeatureExtractor ext = new ByteNgramFeatureExtractor(NUM_BUCKETS); - // 0xE0 → unigram; (0xE0, 0x41) → bigram; 0x41 is ASCII so no further features = 2 + // Stride-1: unigram(0xE0) + bigram(0xE0,0x41) + anchored_bigram(0x41,end) = 3 + // Stride-2: pair(0xE0,0x41) at position 0 = 1 + // Total = 4 int[] counts = ext.extract(new byte[]{(byte) 0xE0, 0x41}); - assertEquals(2, sum(counts)); + assertEquals(4, sum(counts)); } @Test - public void testAsciiFollowedByHighByteProducesUnigramAndBigram() { + public void testAsciiFollowedByHighByteProducesUnigramAndStride2() { ByteNgramFeatureExtractor ext = new ByteNgramFeatureExtractor(NUM_BUCKETS); - // 0x41 skipped; 0xE0 → unigram; no following byte → 1 feature + // Stride-1: 0x41 skipped; unigram(0xE0), no 
following byte = 1 + // Stride-2: pair(0x41,0xE0) at position 0 = 1 + // Total = 2 int[] counts = ext.extract(new byte[]{0x41, (byte) 0xE0}); - assertEquals(1, sum(counts)); + assertEquals(2, sum(counts)); } @Test diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java index 493d47d5a9..b01dedded4 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java @@ -1004,8 +1004,8 @@ public class HtmlParserTest extends TikaTest { } assertEquals("text/html; charset=UTF-ELEVEN", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT)); - // "UTF-ELEVEN" is not a valid charset; ML detection returns UTF-8 for ASCII content. - assertEquals("text/html; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); + // "UTF-ELEVEN" is not a valid charset; no declaration available, ML defaults to windows-1252. + assertEquals("text/html; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE)); test = "<html><head><meta http-equiv=\"content-type\" content=\"application/pdf\">" + "</head><title>title</title><body>body</body></html>"; @@ -1017,8 +1017,8 @@ public class HtmlParserTest extends TikaTest { metadata, new ParseContext()); } assertEquals("application/pdf", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT)); - // No valid charset declaration; ML detection returns UTF-8 for ASCII content. - assertEquals("text/html; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); + // No valid charset declaration; ML defaults to windows-1252 for pure ASCII content. 
+ assertEquals("text/html; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE)); //test two content values test = @@ -1033,8 +1033,8 @@ public class HtmlParserTest extends TikaTest { metadata, new ParseContext()); } assertEquals("application/pdf", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT)); - // No valid charset declaration; ML detection returns UTF-8 for ASCII content. - assertEquals("text/html; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); + // No valid charset declaration; ML defaults to windows-1252 for pure ASCII content. + assertEquals("text/html; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE)); } @Test @@ -1074,8 +1074,8 @@ public class HtmlParserTest extends TikaTest { assertEquals("text/html; charset=iso-NUMBER_SEVEN", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT)); - // "iso-NUMBER_SEVEN" is not a valid charset; ML detection returns UTF-8 for ASCII content. - assertEquals("application/xhtml+xml; charset=UTF-8", + // "iso-NUMBER_SEVEN" is not a valid charset; ML defaults to windows-1252 for pure ASCII. 
+ assertEquals("application/xhtml+xml; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE)); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java index b0080ee761..883899fa14 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java @@ -186,7 +186,8 @@ public class POIContainerExtractionTest extends AbstractPOIContainerExtractionTe expected.add("application/vnd.openxmlformats-officedocument.presentationml.presentation"); expected.add("application/pdf"); expected.add("application/xml"); - expected.add("text/plain; charset=ISO-8859-1"); + // CRLF line endings in this embedded text file trigger the ISO→Windows upgrade heuristic + expected.add("text/plain; charset=windows-1252"); //test that we're correctly handling attachment variants for // files created by WPS 表格 (https://www.wps.cn/) for (String suffix : new String[]{ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java index 0f33218e76..62ec3bc8e3 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java +++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java @@ -109,6 +109,7 @@ public class ZipParser extends AbstractArchiveParser { */ private static final int MAX_INTEGRITY_CHECK_ENTRIES = 100; + private final ZipParserConfig defaultConfig; private static Set<MediaType> loadZipSpecializations() { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/pom.xml index 227ebd31cb..bcc316d361 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/pom.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/pom.xml @@ -34,13 +34,11 @@ <groupId>org.apache.tika</groupId> <artifactId>tika-encoding-detector-mojibuster</artifactId> <version>${project.version}</version> - <scope>test</scope> </dependency> <dependency> <groupId>org.apache.tika</groupId> <artifactId>tika-encoding-detector-charsoup</artifactId> <version>${project.version}</version> - <scope>test</scope> </dependency> <dependency> <groupId>commons-codec</groupId> @@ -50,6 +48,18 @@ <groupId>org.apache.commons</groupId> <artifactId>commons-csv</artifactId> </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-encoding-detector-icu4j</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-encoding-detector-universal</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> <dependency> <groupId>com.google.guava</groupId> <artifactId>guava</artifactId> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java index 67358c5403..a32d063223 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java @@ -101,7 +101,7 @@ public class TextAndCSVParserTest extends TikaTest { metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv"); XMLResult xmlResult = getXML(TikaInputStream.get(CSV_UTF8), PARSER, metadata); assertEquals("comma", xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY)); - assertMediaTypeEquals("csv", "UTF-8", "comma", + assertMediaTypeEquals("csv", "windows-1252", "comma", xmlResult.metadata.get(Metadata.CONTENT_TYPE)); assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml); assertEquals(3, metadata.getInt(TextAndCSVParser.NUM_COLUMNS)); @@ -126,7 +126,7 @@ public class TextAndCSVParserTest extends TikaTest { metadata.set(Metadata.CONTENT_TYPE, "text/csv"); XMLResult xmlResult = getXML(TikaInputStream.get(CSV_UTF8), PARSER, metadata); assertEquals("comma", xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY)); - assertMediaTypeEquals("csv", "UTF-8", "comma", + assertMediaTypeEquals("csv", "windows-1252", "comma", xmlResult.metadata.get(Metadata.CONTENT_TYPE)); assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml); } @@ -160,7 +160,7 @@ public class TextAndCSVParserTest extends TikaTest { metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv"); XMLResult xmlResult = getXML(TikaInputStream.get(TSV_UTF8), PARSER, metadata); assertEquals("tab", xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY)); - assertMediaTypeEquals("tsv", "UTF-8", "tab", + assertMediaTypeEquals("tsv", "windows-1252", 
"tab", xmlResult.metadata.get(Metadata.CONTENT_TYPE)); assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_TSV, xmlResult.xml); } @@ -191,7 +191,7 @@ public class TextAndCSVParserTest extends TikaTest { metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv"); XMLResult xmlResult = getXML(TikaInputStream.get(csv), PARSER, metadata); assertNull(xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY)); - assertEquals("text/plain; charset=UTF-8", + assertEquals("text/plain; charset=windows-1252", xmlResult.metadata.get(Metadata.CONTENT_TYPE)); assertContains("the,quick", xmlResult.xml); } @@ -225,7 +225,7 @@ public class TextAndCSVParserTest extends TikaTest { XMLResult xmlResult = getXML(TikaInputStream.get(sb.toString().getBytes(StandardCharsets.UTF_8)), PARSER, metadata); - assertMediaTypeEquals("csv", "UTF-8", "comma", + assertMediaTypeEquals("csv", "windows-1252", "comma", xmlResult.metadata.get(Metadata.CONTENT_TYPE)); } @@ -233,8 +233,7 @@ public class TextAndCSVParserTest extends TikaTest { @Test public void testSubclassingMimeTypesRemain() throws Exception { XMLResult r = getXML("testVCalendar.vcs"); - // Pure ASCII content — correctly detected as UTF-8 - assertEquals("text/x-vcalendar; charset=UTF-8", r.metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("text/x-vcalendar; charset=windows-1252", r.metadata.get(Metadata.CONTENT_TYPE)); } @Test diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java index a29fb299f1..b02236a4e5 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java +++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java @@ -29,9 +29,11 @@ import org.xml.sax.ContentHandler; import org.xml.sax.helpers.DefaultHandler; import org.apache.tika.TikaTest; +import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; @@ -54,8 +56,8 @@ public class TXTParserTest extends TikaTest { } String content = writer.toString(); - // Pure ASCII — correctly detected as UTF-8 (ASCII is a subset of UTF-8) - assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); + // Pure ASCII — detected as windows-1252 (the HTML5/WHATWG default for 8-bit Western) + assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE)); // TIKA-501: Remove language detection from TXTParser assertNull(metadata.get(Metadata.CONTENT_LANGUAGE)); @@ -89,7 +91,7 @@ public class TXTParserTest extends TikaTest { try (TikaInputStream tis = TikaInputStream.get(new byte[0])) { parser.parse(tis, handler, metadata, new ParseContext()); } - assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("\n", handler.toString()); } @@ -102,9 +104,9 @@ public class TXTParserTest extends TikaTest { */ @Test public void testLatinDetectionHeuristics() throws Exception { - // These were previously testing CR/LF heuristics specific to UniversalEncodingDetector. - // The ML-based detector (MojibusterEncodingDetector + CharSoup) correctly identifies - // pure-ASCII content as UTF-8 and does not rely on line-ending heuristics. 
+ // Previously tested CR/LF heuristics specific to UniversalEncodingDetector. + // The ML-based detector defaults to windows-1252 for pure ASCII regardless of + // line endings (CRLF_TO_WINDOWS is a secondary confirmation, not the primary path). String windows = "test\r\n"; String unix = "test\n"; String euro = "test \u20ac\n"; @@ -115,15 +117,13 @@ public class TXTParserTest extends TikaTest { try (TikaInputStream tis = TikaInputStream.get(windows.getBytes("ISO-8859-15"))) { parser.parse(tis, new DefaultHandler(), metadata, new ParseContext()); } - // Pure ASCII — UTF-8 is correct - assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE)); metadata = new Metadata(); try (TikaInputStream tis = TikaInputStream.get(unix.getBytes("ISO-8859-15"))) { parser.parse(tis, new DefaultHandler(), metadata, new ParseContext()); } - // Pure ASCII — UTF-8 is correct - assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE)); metadata = new Metadata(); try (TikaInputStream tis = TikaInputStream.get(euro.getBytes("ISO-8859-15"))) { @@ -247,8 +247,7 @@ public class TXTParserTest extends TikaTest { parser.parse(tis, new WriteOutContentHandler(writer), metadata, new ParseContext()); } - // Pure ASCII — UTF-8 is correct - assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE)); } /** @@ -264,11 +263,10 @@ public class TXTParserTest extends TikaTest { try (TikaInputStream tis = TikaInputStream.get(text.getBytes(UTF_8))) { parser.parse(tis, new BodyContentHandler(), metadata, new ParseContext()); } - // Pure ASCII — UTF-8 is correct - assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("text/plain; 
charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE)); - // Now verify that if we tell the parser the encoding is UTF-8, that's what - // we get back (see TIKA-868) + // TIKA-868: MetadataCharsetDetector (tika-core) reads the charset from Content-Type + // and returns it as DECLARATIVE, which CharSoup prefers over the statistical windows-1252. metadata.set(Metadata.CONTENT_TYPE, "application/binary; charset=UTF-8"); try (TikaInputStream tis = TikaInputStream.get(text.getBytes(UTF_8))) { parser.parse(tis, new BodyContentHandler(), metadata, new ParseContext()); @@ -280,8 +278,23 @@ public class TXTParserTest extends TikaTest { @Test public void testSubclassingMimeTypesRemain() throws Exception { XMLResult r = getXML("testVCalendar.vcs"); - // Pure ASCII content — correctly detected as UTF-8 - assertEquals("text/x-vcalendar; charset=UTF-8", r.metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("text/x-vcalendar; charset=windows-1252", r.metadata.get(Metadata.CONTENT_TYPE)); + } + + // TIKA-3516, TIKA-3525, TIKA-1236 + @Test + public void testIgnoreCharset() throws Exception { + AutoDetectParser parser = (AutoDetectParser) TikaLoader.load( + getConfigPath(TXTParserTest.class, "tika-config-ignore-charset.json")) + .loadAutoDetectParser(); + + Metadata m = new Metadata(); + m.set(TikaCoreProperties.RESOURCE_NAME_KEY, "texty-text.txt"); + assertContains("ACTIVE AGE", getXML("testIgnoreCharset.txt", parser, m).xml); + + m = new Metadata(); + m.set(TikaCoreProperties.RESOURCE_NAME_KEY, "texty-text.txt"); + assertContains("Please check your email", getXML("test_ignore_IBM420.html", parser, m).xml); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java index 73928000bb..31421a12c9 100644 --- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java @@ -32,12 +32,15 @@ import org.junit.jupiter.api.Test; import org.apache.tika.TikaLoaderHelper; import org.apache.tika.TikaTest; import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.detect.BOMDetector; import org.apache.tika.detect.CompositeEncodingDetector; import org.apache.tika.detect.EncodingDetector; import org.apache.tika.detect.MetaEncodingDetector; +import org.apache.tika.detect.MetadataCharsetDetector; import org.apache.tika.detect.OverrideEncodingDetector; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.io.TikaInputStream; +import org.apache.tika.langdetect.charsoup.CharSoupEncodingDetector; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.ml.chardetect.MojibusterEncodingDetector; @@ -56,13 +59,15 @@ public class TikaEncodingDetectorTest extends TikaTest { EncodingDetector detector = TikaLoader.loadDefault().loadEncodingDetectors(); assertTrue(detector instanceof CompositeEncodingDetector); List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors(); - // 2 base detectors (ML, StandardHtml) + CharSoupEncodingDetector (MetaEncodingDetector) - assertEquals(3, detectors.size()); + // 4 base detectors (BOM, Metadata, ML, StandardHtml) + CharSoupEncodingDetector (MetaEncodingDetector) + assertEquals(5, detectors.size()); // meta detector is always last (partitioned by CompositeEncodingDetector) - assertTrue(detectors.get(2) instanceof MetaEncodingDetector); + assertTrue(detectors.get(4) instanceof MetaEncodingDetector); // base detectors — sorted by full class name; check by type - Set<Class<?>> baseClasses = detectors.subList(0, 2).stream() + 
Set<Class<?>> baseClasses = detectors.subList(0, 4).stream() .map(Object::getClass).collect(Collectors.toSet()); + assertTrue(baseClasses.contains(BOMDetector.class)); + assertTrue(baseClasses.contains(MetadataCharsetDetector.class)); assertTrue(baseClasses.contains(MojibusterEncodingDetector.class)); assertTrue(baseClasses.contains(StandardHtmlEncodingDetector.class)); } @@ -81,12 +86,14 @@ public class TikaEncodingDetectorTest extends TikaTest { assertTrue(detector1 instanceof CompositeEncodingDetector); List<EncodingDetector> detectors1Children = ((CompositeEncodingDetector) detector1).getDetectors(); - // ML base detector + CharSoup meta (html excluded) - assertEquals(2, detectors1Children.size()); - Set<Class<?>> innerClasses = detectors1Children.subList(0, 1).stream() + // BOM + Metadata + ML base detectors + CharSoup meta (html excluded) + assertEquals(4, detectors1Children.size()); + Set<Class<?>> innerClasses = detectors1Children.subList(0, 3).stream() .map(Object::getClass).collect(Collectors.toSet()); + assertTrue(innerClasses.contains(BOMDetector.class)); + assertTrue(innerClasses.contains(MetadataCharsetDetector.class)); assertTrue(innerClasses.contains(MojibusterEncodingDetector.class)); - assertTrue(detectors1Children.get(1) instanceof MetaEncodingDetector); + assertTrue(detectors1Children.get(3) instanceof MetaEncodingDetector); assertTrue(detectors.get(1) instanceof OverrideEncodingDetector); @@ -178,9 +185,9 @@ public class TikaEncodingDetectorTest extends TikaTest { ((AbstractEncodingDetectorParser) encodingDetectingParser) .getEncodingDetector(); assertTrue(encodingDetector instanceof CompositeEncodingDetector); - // ML, Html base detectors + CharSoup MetaEncodingDetector + // BOM, Metadata, ML, Html base detectors + CharSoup MetaEncodingDetector // (ICU4J is excluded but was already not in the default chain) - assertEquals(3, ((CompositeEncodingDetector) encodingDetector).getDetectors().size()); + assertEquals(5, ((CompositeEncodingDetector) 
encodingDetector).getDetectors().size()); for (EncodingDetector child : ((CompositeEncodingDetector) encodingDetector) .getDetectors()) { assertNotContained("cu4j", child.getClass().getCanonicalName()); @@ -207,14 +214,17 @@ public class TikaEncodingDetectorTest extends TikaTest { assertTrue(encodingDetector instanceof CompositeEncodingDetector); List<EncodingDetector> children = ((CompositeEncodingDetector) encodingDetector).getDetectors(); - assertEquals(3, children.size(), childParser.getClass().toString()); + // 3 base detectors + 1 MetaEncodingDetector (CharSoup) = 4 total + assertEquals(4, children.size(), childParser.getClass().toString()); assertTrue(children.get(0) instanceof MojibusterEncodingDetector, childParser.getClass().toString()); HtmlEncodingDetector htmlDet = (HtmlEncodingDetector) children.get(1); - assertEquals(64000, htmlDet.getDefaultConfig().getMarkLimit(), + assertEquals(100000, htmlDet.getDefaultConfig().getMarkLimit(), childParser.getClass().toString()); assertTrue(children.get(2) instanceof StandardHtmlEncodingDetector, childParser.getClass().toString()); + assertTrue(children.get(3) instanceof CharSoupEncodingDetector, + childParser.getClass().toString()); } } @@ -222,7 +232,8 @@ public class TikaEncodingDetectorTest extends TikaTest { public void testMarkLimitIntegration() throws Exception { StringBuilder sb = new StringBuilder(); sb.append("<html><head><script>"); - for (int i = 0; i < 4000; i++) { //script length = 20000 + // script length = ~80000 bytes, beyond the default mark limit of 65536 + for (int i = 0; i < 16000; i++) { sb.append("blah "); } sb.append("</script>"); @@ -233,19 +244,23 @@ public class TikaEncodingDetectorTest extends TikaTest { byte[] bytes = sb.toString().getBytes(StandardCharsets.UTF_8); - // The new pipeline (StandardHtmlEncodingDetector reads past the script block - // and finds the meta charset) correctly detects UTF-8 even by default. 
+ // Default: the meta charset is buried at ~byte 80,000, past the default + // mark limit of 65536. The detector falls back to windows-1252 for the + // pure-ASCII probe. HTML entities (ø) render correctly regardless; + // raw UTF-8 multibyte sequences (e.g. ø in "økologisk") are garbled. + // Raise the mark limit via config to fix this (see below). Parser p = AUTO_DETECT_PARSER; Metadata metadata = new Metadata(); String xml = getXML(TikaInputStream.get(bytes), p, metadata).xml; - assertContains("gr\u00F8nd", xml); - assertContains("\u00f8kologisk", xml); - assertContains("gr\u00F8nt", xml); - assertContains("g\u00E5 til", xml); + assertContains("gr\u00F8nd", xml); // ø entity — correct regardless + assertNotContained("\u00f8kologisk", xml); // raw UTF-8 bytes — garbled by default + assertNotContained("gr\u00F8nt", xml); + assertNotContained("g\u00E5 til", xml); - //now test that fix works + // With a raised mark limit the detector reaches the meta charset and + // correctly decodes UTF-8 content. 
p = TikaLoaderHelper.getLoader("TIKA-2485-encoding-detector-mark-limits.json").loadAutoDetectParser(); metadata = new Metadata(); @@ -266,10 +281,12 @@ public class TikaEncodingDetectorTest extends TikaTest { assertTrue(detector instanceof CompositeEncodingDetector); List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors(); - // 2 base detectors (ML + StandardHtml), no MetaEncodingDetector - assertEquals(2, detectors.size()); + // 4 base detectors (BOM + Metadata + ML + StandardHtml), no MetaEncodingDetector + assertEquals(4, detectors.size()); Set<Class<?>> excludedCharSoupClasses = detectors.stream() .map(Object::getClass).collect(Collectors.toSet()); + assertTrue(excludedCharSoupClasses.contains(BOMDetector.class)); + assertTrue(excludedCharSoupClasses.contains(MetadataCharsetDetector.class)); assertTrue(excludedCharSoupClasses.contains(MojibusterEncodingDetector.class)); assertTrue(excludedCharSoupClasses.contains(StandardHtmlEncodingDetector.class)); for (EncodingDetector d : detectors) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java index 3e05518ebc..8f9b957e90 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java @@ -55,7 +55,7 @@ public class AutoDetectParserTest extends TikaTest { // Easy to read constants for the MIME types: private static final String RAW = "application/octet-stream"; private static final String EXCEL = "application/vnd.ms-excel"; - private static final String HTML = "text/html; charset=UTF-8"; + private static final String HTML = "text/html; charset=windows-1252"; 
private static final String PDF = "application/pdf"; private static final String POWERPOINT = "application/vnd.ms-powerpoint"; private static final String KEYNOTE = "application/vnd.apple.keynote"; @@ -63,7 +63,7 @@ public class AutoDetectParserTest extends TikaTest { private static final String NUMBERS = "application/vnd.apple.numbers"; private static final String CHM = "application/vnd.ms-htmlhelp"; private static final String RTF = "application/rtf"; - private static final String PLAINTEXT = "text/plain; charset=UTF-8"; + private static final String PLAINTEXT = "text/plain; charset=windows-1252"; private static final String UTF8TEXT = "text/plain; charset=UTF-8"; private static final String WORD = "application/msword"; private static final String XML = "application/xml"; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java index 1411481f1c..7bd45cf813 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java @@ -46,11 +46,11 @@ public class RTFParserTest extends TikaTest { public void testEmbeddedMonster() throws Exception { Map<Integer, Pair> expected = new HashMap<>(); - expected.put(3, new Pair("Hw.txt", "text/plain; charset=UTF-8")); + expected.put(3, new Pair("Hw.txt", "text/plain; charset=windows-1252")); expected.put(4, new Pair("file_0.doc", "application/msword")); expected.put(7, new Pair("file_1.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")); - expected.put(10, new Pair("text.html", "text/html; charset=UTF-8")); + expected.put(10, new Pair("text.html", "text/html; 
charset=windows-1252")); expected.put(11, new Pair("html-within-zip.zip", "application/zip")); expected.put(12, new Pair("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip", "application/zip")); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index b7565f11f2..76910c56ff 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -252,7 +252,7 @@ public class PDFParserTest extends TikaTest { metadatas.get(1).get(Metadata.CONTENT_TYPE)); assertImageContentType("image/tiff", metadatas.get(2).get(Metadata.CONTENT_TYPE)); - assertEquals("text/plain; charset=UTF-8", metadatas.get(3).get(Metadata.CONTENT_TYPE)); + assertEquals("text/plain; charset=windows-1252", metadatas.get(3).get(Metadata.CONTENT_TYPE)); assertEquals(TYPE_DOC.toString(), metadatas.get(4).get(Metadata.CONTENT_TYPE)); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java index 785206f8da..bec188b8d8 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java @@ -18,7 +18,6 @@ package org.apache.tika.parser.pkg; import java.util.List; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.apache.tika.TikaTest; @@ -33,7 +32,6 @@ 
public class PackageParserTest extends TikaTest { assertContains("审计压缩", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY)); } - @Disabled("TIKA-4662: ML model confuses Shift_JIS with Big5 on 9-byte zip entry name probes; needs model improvement") @Test public void handleEntryNameWithCharsetShiftJIS() throws Exception { List<Metadata> metadataList = getRecursiveMetadata("testZipEntryNameCharsetShiftSJIS.zip"); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2485-encoding-detector-mark-limits.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2485-encoding-detector-mark-limits.json index 6da5365b70..8275da4bfc 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2485-encoding-detector-mark-limits.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2485-encoding-detector-mark-limits.json @@ -5,11 +5,16 @@ }, { "html-encoding-detector": { - "markLimit": 64000 + "markLimit": 100000 } }, { - "standard-html-encoding-detector": {} + "standard-html-encoding-detector": { + "markLimit": 100000 + } + }, + { + "charsoup-encoding-detector": {} } ] }
