This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch chardet-work in repository https://gitbox.apache.org/repos/asf/tika.git
commit 534c72f36938c5be548c0b7386011ac1c58dc786 Author: tballison <[email protected]> AuthorDate: Thu Mar 5 18:12:35 2026 -0500 chardet - --- .../java/org/apache/tika/detect/BOMDetector.java | 18 +- .../tika/detect/CompositeEncodingDetector.java | 3 +- .../org/apache/tika/detect/EncodingDetector.java | 11 +- .../tika/detect/EncodingDetectorContext.java | 7 + .../org/apache/tika/detect/EncodingResult.java | 92 +++++++-- .../tika/detect/MetadataCharsetDetector.java | 132 ++++++++++++ .../tika/detect/OverrideEncodingDetector.java | 3 +- .../org.apache.tika.detect.EncodingDetector | 10 + .../charsoup/CharSoupEncodingDetector.java | 66 +++--- .../charsoup/CharSoupEncodingDetectorTest.java | 8 +- .../tika/parser/html/HtmlEncodingDetector.java | 3 +- .../StandardHtmlEncodingDetector.java | 105 ++++++---- .../tika-encoding-detector-icu4j/pom.xml | 12 ++ .../tika/parser/txt/CharsetDetectorTest.java | 1 + .../configs/tika-config-ignore-charset.json | 13 ++ .../resources/test-documents/multi-language.txt | 58 ++++++ .../src/test/resources/test-documents/resume.html | 99 +++++++++ .../resources/test-documents/testIgnoreCharset.txt | 4 + .../resources/test-documents/testTXT_win-1252.txt | 1 + .../test-documents/test_ignore_IBM420.html | Bin 0 -> 1869 bytes .../ml/chardetect/MojibusterEncodingDetector.java | 223 ++++++++++++++++----- .../ml/chardetect/StructuralEncodingRules.java | 29 +++ .../tika/ml/chardetect/chardetect-ebcdic.bin | Bin 5232 -> 7312 bytes .../org/apache/tika/ml/chardetect/chardetect.bin | Bin 508522 -> 410106 bytes .../tika/ml/chardetect/EbcdicRoutingTest.java | 132 ++++++++++++ .../ml/chardetect/ZipFilenameDetectionTest.java | 142 ++++++------- .../tika-encoding-detector-universal/pom.xml | 6 + .../tika/parser/txt/UniversalEncodingDetector.java | 2 +- .../charsoup/CharSoupLanguageDetector.java | 83 +++++--- .../chardetect/ByteNgramFeatureExtractorTest.java | 32 +-- .../apache/tika/parser/html/HtmlParserTest.java | 16 +- 
.../microsoft/POIContainerExtractionTest.java | 3 +- .../java/org/apache/tika/parser/pkg/ZipParser.java | 1 + .../tika-parser-text-module/pom.xml | 14 +- .../tika/parser/csv/TextAndCSVParserTest.java | 13 +- .../org/apache/tika/parser/txt/TXTParserTest.java | 49 +++-- .../tika/config/TikaEncodingDetectorTest.java | 61 ++++-- .../apache/tika/parser/AutoDetectParserTest.java | 4 +- .../tika/parser/microsoft/rtf/RTFParserTest.java | 4 +- .../org/apache/tika/parser/pdf/PDFParserTest.java | 2 +- .../apache/tika/parser/pkg/PackageParserTest.java | 2 - .../TIKA-2485-encoding-detector-mark-limits.json | 9 +- 42 files changed, 1122 insertions(+), 351 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/detect/BOMDetector.java b/tika-core/src/main/java/org/apache/tika/detect/BOMDetector.java index 322e307187..21e9ca08a7 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/BOMDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/BOMDetector.java @@ -32,16 +32,18 @@ import org.apache.tika.parser.ParseContext; /** * Encoding detector that identifies the character set from a byte-order mark - * (BOM) at the start of the stream. Returns a single result with confidence - * {@link EncodingResult#CONFIDENCE_DEFINITIVE} when a BOM is found. + * (BOM) at the start of the stream. Returns a single {@link EncodingResult.ResultType#DECLARATIVE} + * result when a BOM is found — a BOM is an explicit in-band declaration of encoding + * and takes priority over all statistical or structural inference. * - * <p>Not SPI-loaded by default — add explicitly to your encoding-detector - * chain when needed. UTF-16/32 content without a BOM is detected by - * {@code MojibusterEncodingDetector} via stride-2 byte n-gram features.</p> + * <p>SPI-loaded first in the default encoding-detector chain so that BOM evidence + * reaches {@code CharSoupEncodingDetector} before any statistical detector runs. 
+ * {@code MojibusterEncodingDetector} strips the BOM from its own probe independently + * to ensure consistent model inference (BOMs are excluded from training data).</p> * * @since Apache Tika 0.x (moved to org.apache.tika.detect in 4.0) */ -@TikaComponent(spi = false) +@TikaComponent public class BOMDetector implements EncodingDetector { private static final ByteOrderMark[] BOMS = @@ -88,8 +90,8 @@ public class BOMDetector implements EncodingDetector { for (int i = 0; i < BOMS.length; i++) { ByteOrderMark bom = BOMS[i]; if (startsWith(bom, bytes) && CHARSETS[i] != null) { - return List.of(new EncodingResult(CHARSETS[i], - EncodingResult.CONFIDENCE_DEFINITIVE)); + return List.of(new EncodingResult(CHARSETS[i], 1.0f, + CHARSETS[i].name(), EncodingResult.ResultType.DECLARATIVE)); } } return Collections.emptyList(); diff --git a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java index 10285ba44a..fc8b0ab038 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/CompositeEncodingDetector.java @@ -194,7 +194,8 @@ public class CompositeEncodingDetector implements EncodingDetector, Serializable sb.append(", "); } sb.append(r.getDetectorName()).append("->").append(r.getCharset().name()); - if (r.getConfidence() < EncodingResult.CONFIDENCE_DEFINITIVE) { + sb.append("[").append(r.getResultType()).append("]"); + if (r.getResultType() == EncodingResult.ResultType.STATISTICAL) { sb.append(String.format(java.util.Locale.ROOT, "(%.2f)", r.getConfidence())); } } diff --git a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java index ff02a0bcb5..7522003c76 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java +++ 
b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetector.java @@ -31,11 +31,12 @@ import org.apache.tika.parser.ParseContext; * metadata or the first few bytes of the document stream. * * <p>Detectors return a ranked list of {@link EncodingResult}s in descending - * confidence order. An empty list means no opinion. A single result with - * confidence {@link EncodingResult#CONFIDENCE_DEFINITIVE} (1.0) indicates a - * structural detection that requires no further arbitration. Multiple results - * or lower confidence values invite arbitration by a - * {@link MetaEncodingDetector}.</p> + * confidence order. An empty list means no opinion. Results carry a + * {@link EncodingResult.ResultType} indicating the nature of the evidence: + * {@code DECLARATIVE} (BOM, HTML meta charset), {@code STRUCTURAL} (byte-grammar + * proof), or {@code STATISTICAL} (probabilistic model). A + * {@link MetaEncodingDetector} uses these types to arbitrate when detectors + * disagree.</p> * * @since Apache Tika 0.4 */ diff --git a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java index 81d870e599..6957601e2c 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java +++ b/tika-core/src/main/java/org/apache/tika/detect/EncodingDetectorContext.java @@ -144,6 +144,13 @@ public class EncodingDetectorContext { return encodingResults.get(0).getConfidence(); } + /** + * The {@link EncodingResult.ResultType} of the top-ranked result from this detector. 
+ */ + public EncodingResult.ResultType getResultType() { + return encodingResults.get(0).getResultType(); + } + public String getDetectorName() { return detectorName; } diff --git a/tika-core/src/main/java/org/apache/tika/detect/EncodingResult.java b/tika-core/src/main/java/org/apache/tika/detect/EncodingResult.java index 135e81240c..55724aefc7 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/EncodingResult.java +++ b/tika-core/src/main/java/org/apache/tika/detect/EncodingResult.java @@ -19,20 +19,50 @@ package org.apache.tika.detect; import java.nio.charset.Charset; /** - * A charset detection result pairing a {@link Charset} with a confidence score. + * A charset detection result pairing a {@link Charset} with a confidence score + * and a {@link ResultType} indicating the nature of the evidence. * - * <p>Confidence is in the range {@code [0.0, 1.0]}. A score of {@code 1.0} - * indicates a definitive structural detection (e.g. UTF-16/32 from null-byte - * patterns, or a declared {@code charset} attribute in an HTML meta tag) that - * requires no further arbitration. Lower scores reflect statistical estimates - * where arbitration by a {@link MetaEncodingDetector} may improve accuracy.</p> + * <h3>Result types</h3> + * <ul> + * <li>{@link ResultType#DECLARATIVE} — the document explicitly stated its + * encoding (BOM, HTML {@code <meta charset>}). These are authoritative + * claims about author intent and get preference over inferred results + * <em>when consistent with the actual bytes</em>.</li> + * <li>{@link ResultType#STRUCTURAL} — byte-grammar proof (ISO-2022 escape + * sequences, UTF-8 multibyte validation). The encoding is proven by the + * byte structure itself, independent of any declaration.</li> + * <li>{@link ResultType#STATISTICAL} — probabilistic inference from a + * statistical model. 
The {@code confidence} float is meaningful here + * for ranking among candidates; for DECLARATIVE and STRUCTURAL results + * it is conventionally {@code 1.0} but carries no additional information.</li> + * </ul> * * @since Apache Tika 4.0 */ public class EncodingResult { - /** Confidence value indicating a definitive, structural detection. */ - public static final float CONFIDENCE_DEFINITIVE = 1.0f; + /** + * The nature of the evidence that produced this result. + */ + public enum ResultType { + /** + * The document explicitly declared its encoding (BOM, HTML meta charset). + * Authoritative about author intent; preferred over inferred results when + * consistent with the actual bytes. + */ + DECLARATIVE, + /** + * The encoding is proven by byte-grammar structure (ISO-2022 escape + * sequences, UTF-8 multibyte validation). Not a guess — the byte + * patterns are only valid in this encoding. + */ + STRUCTURAL, + /** + * Probabilistic inference from a statistical model. The confidence + * float is meaningful for ranking among candidates. + */ + STATISTICAL + } private final Charset charset; private final float confidence; @@ -47,28 +77,53 @@ public class EncodingResult { * prediction without going through {@code Charset.name()}. */ private final String label; + private final ResultType resultType; /** + * Constructs a STATISTICAL result. Existing detectors that do not yet + * classify their evidence type default to statistical (probabilistic) + * treatment, which is the safe, arbitratable assumption. + * * @param charset the detected charset; must not be {@code null} * @param confidence detection confidence in {@code [0.0, 1.0]} */ public EncodingResult(Charset charset, float confidence) { - this(charset, confidence, charset.name()); + this(charset, confidence, charset.name(), ResultType.STATISTICAL); } /** + * Constructs a STATISTICAL result with a detector-specific label. 
+ * * @param charset the detected charset; must not be {@code null} * @param confidence detection confidence in {@code [0.0, 1.0]} * @param label the detector's original label (e.g. {@code "IBM420-ltr"}); * if {@code null}, defaults to {@code charset.name()} */ public EncodingResult(Charset charset, float confidence, String label) { + this(charset, confidence, label, ResultType.STATISTICAL); + } + + /** + * Constructs a result with an explicit {@link ResultType}. + * + * @param charset the detected charset; must not be {@code null} + * @param confidence detection confidence in {@code [0.0, 1.0]} + * @param label the detector's original label; if {@code null}, + * defaults to {@code charset.name()} + * @param resultType the nature of the evidence; must not be {@code null} + */ + public EncodingResult(Charset charset, float confidence, String label, + ResultType resultType) { if (charset == null) { throw new IllegalArgumentException("charset must not be null"); } + if (resultType == null) { + throw new IllegalArgumentException("resultType must not be null"); + } this.charset = charset; this.confidence = Math.max(0f, Math.min(1f, confidence)); this.label = (label != null) ? label : charset.name(); + this.resultType = resultType; } public Charset getCharset() { @@ -76,13 +131,25 @@ public class EncodingResult { } /** - * Detection confidence in {@code [0.0, 1.0]}. - * {@code 1.0} means definitive; lower values invite arbitration. + * Detection confidence in {@code [0.0, 1.0]}. Meaningful for ranking + * among {@link ResultType#STATISTICAL} candidates. For + * {@link ResultType#DECLARATIVE} and {@link ResultType#STRUCTURAL} results + * the value is conventionally {@code 1.0} but carries no additional + * information beyond the type itself. */ public float getConfidence() { return confidence; } + /** + * The nature of the evidence that produced this result. 
+ * + * @see ResultType + */ + public ResultType getResultType() { + return resultType; + } + /** * The detector's original label for this result. Usually identical to * {@link #getCharset()}{@code .name()}, but preserved when the detector @@ -97,6 +164,7 @@ public class EncodingResult { public String toString() { String cs = charset.name(); String lbl = label.equals(cs) ? cs : label + "(" + cs + ")"; - return lbl + "@" + String.format(java.util.Locale.ROOT, "%.2f", confidence); + return lbl + "@" + String.format(java.util.Locale.ROOT, "%.2f", confidence) + + "[" + resultType + "]"; } } diff --git a/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java b/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java new file mode 100644 index 0000000000..385f3edfbe --- /dev/null +++ b/tika-core/src/main/java/org/apache/tika/detect/MetadataCharsetDetector.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.detect; + +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; +import java.util.Collections; +import java.util.List; + +import org.apache.tika.config.TikaComponent; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.ParseContext; + +/** + * Encoding detector that extracts a declared charset from Tika metadata without + * reading any bytes from the stream. Returns a single + * {@link EncodingResult.ResultType#DECLARATIVE} result when a charset is found. + * + * <p>Two metadata keys are consulted in order: + * <ol> + * <li>{@link Metadata#CONTENT_TYPE} — the {@code charset} parameter of the + * HTTP/MIME Content-Type header (e.g. {@code text/html; charset=UTF-8}).</li> + * <li>{@link Metadata#CONTENT_ENCODING} — a bare charset label set by parsers + * such as {@code RFC822Parser}, which splits Content-Type into a bare + * media-type key and a separate charset key.</li> + * </ol> + * + * <p>This detector is SPI-loaded in {@code tika-core} and therefore always present + * in the default encoding-detector chain. 
Its DECLARATIVE result is visible to + * {@code CharSoupEncodingDetector}, which can weigh it against structural or + * statistical evidence from other detectors.</p> + * + * @since Apache Tika 4.0 + */ +@TikaComponent(name = "metadata-charset-detector") +public class MetadataCharsetDetector implements EncodingDetector { + + @Override + public List<EncodingResult> detect(TikaInputStream tis, Metadata metadata, + ParseContext context) throws IOException { + Charset cs = charsetFromContentType(metadata); + if (cs == null) { + cs = charsetFromContentEncoding(metadata); + } + if (cs == null) { + return Collections.emptyList(); + } + return List.of(new EncodingResult(cs, 1.0f, cs.name(), + EncodingResult.ResultType.DECLARATIVE)); + } + + /** + * Returns the charset named in the {@code charset} parameter of the + * {@link Metadata#CONTENT_TYPE} value, or {@code null} if absent or unparseable. + */ + public static Charset charsetFromContentType(Metadata metadata) { + String contentType = metadata.get(Metadata.CONTENT_TYPE); + if (contentType == null) { + return null; + } + MediaType mediaType = MediaType.parse(contentType); + if (mediaType == null) { + return null; + } + String label = mediaType.getParameters().get("charset"); + return parseCharset(label); + } + + /** + * Returns the charset named in {@link Metadata#CONTENT_ENCODING}, or + * {@code null} if absent or unparseable. This key is used by + * {@code RFC822Parser} to expose the charset declared in MIME body-part + * headers when the bare media type is stored separately in + * {@link Metadata#CONTENT_TYPE}. 
+ */ + public static Charset charsetFromContentEncoding(Metadata metadata) { + return parseCharset(metadata.get(Metadata.CONTENT_ENCODING)); + } + + private static Charset parseCharset(String label) { + if (label == null || label.isBlank()) { + return null; + } + Charset cs; + try { + cs = Charset.forName(label.trim()); + } catch (IllegalArgumentException e) { + return null; + } + return normalizeWhatwg(cs); + } + + /** + * Applies the critical WHATWG encoding-label normalizations that are universally + * applicable regardless of content type. The WHATWG encoding spec + * (https://encoding.spec.whatwg.org/) maps {@code ISO-8859-1}, {@code US-ASCII}, + * and their aliases to {@code windows-1252} because real-world content labeled + * with these names is almost always actually windows-1252. + */ + private static Charset normalizeWhatwg(Charset cs) { + if (cs == null) { + return null; + } + String name = cs.name(); + if (StandardCharsets.ISO_8859_1.name().equals(name) + || StandardCharsets.US_ASCII.name().equals(name)) { + try { + return Charset.forName("windows-1252"); + } catch (IllegalArgumentException e) { + return cs; + } + } + return cs; + } +} diff --git a/tika-core/src/main/java/org/apache/tika/detect/OverrideEncodingDetector.java b/tika-core/src/main/java/org/apache/tika/detect/OverrideEncodingDetector.java index 3c3ddfa627..e300adccac 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/OverrideEncodingDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/OverrideEncodingDetector.java @@ -85,7 +85,8 @@ public class OverrideEncodingDetector implements EncodingDetector { @Override public List<EncodingResult> detect(TikaInputStream tis, Metadata metadata, ParseContext parseContext) throws IOException { - return List.of(new EncodingResult(charset, EncodingResult.CONFIDENCE_DEFINITIVE)); + return List.of(new EncodingResult(charset, 1.0f, charset.name(), + EncodingResult.ResultType.DECLARATIVE)); } public Charset getCharset() { diff --git 
a/tika-core/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector b/tika-core/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector index 4a812de77e..1c321a2921 100644 --- a/tika-core/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector +++ b/tika-core/src/main/resources/META-INF/services/org.apache.tika.detect.EncodingDetector @@ -13,3 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Both detectors are in the org.apache.tika.detect.* namespace and sort before +# all other detectors, ensuring DECLARATIVE evidence from in-band signals (BOM) +# and out-of-band declarations (HTTP/MIME headers) reaches CharSoupEncodingDetector +# before any statistical detector runs. +# +# Within the namespace, class-name order is: BOMDetector < MetadataCharsetDetector +# so BOM evidence (highest confidence) is recorded first. +org.apache.tika.detect.BOMDetector +org.apache.tika.detect.MetadataCharsetDetector + diff --git a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java index 3a60e551ba..35ead7bb8a 100644 --- a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java @@ -71,8 +71,8 @@ public class CharSoupEncodingDetector implements MetaEncodingDetector { * Symmetric confusable peer groups: within each group, encoding variants * (e.g. ISO-8859-6 vs windows-1256) produce different decoded text for the * same byte sequence (unlike ISO-8859-1 vs windows-1252 which are functional - * supersets). 
When the language-quality winner and a CONFIDENCE_DEFINITIVE - * declaration are in the same peer group, the language model cannot reliably + * supersets). When the language-quality winner and a DECLARATIVE result + * are in the same peer group, the language model cannot reliably * distinguish them — it merely reflects which variant happens to produce * Arabic (or Cyrillic, …) n-grams its training data favoured. * In that case we prefer the explicit declaration. @@ -158,25 +158,31 @@ public class CharSoupEncodingDetector implements MetaEncodingDetector { CharSoupLanguageDetector langDetector = new CharSoupLanguageDetector(); Charset bestCharset = langDetector.compareLanguageSignal(candidates); if (bestCharset == null) { + // Language signal inconclusive. When a DECLARATIVE result (HTML meta charset, + // BOM, HTTP Content-Type) exists and decodes the bytes at least as cleanly as + // the statistical fallback, trust the declaration. This covers: + // • Pure-ASCII probe (both decodings identical) — prefer the declared charset. + // • Probe with high bytes that are valid in BOTH charsets (e.g. Cyrillic in a + // page that starts with ASCII JavaScript) — the bytes look "clean" in both + // windows-1252 (decoded as Latin Extended) and windows-1251 (decoded as + // Cyrillic), so junkRatio cannot distinguish them; trust the declaration. Charset fallback = firstResult.getCharset(); String fallbackDecoded = candidates.get(fallback); float fallbackJunk = fallbackDecoded != null ? CharSoupLanguageDetector.junkRatio(fallbackDecoded) : 1f; - // If the fallback charset produces garbled output (replacement chars) but - // a definitive declaration decodes the bytes cleanly, the probe was likely - // too short or ASCII-only. Trust the explicit declaration in that case. 
Charset cleanerDeclared = null; - if (fallbackJunk > 0f) { - for (EncodingDetectorContext.Result r : context.getResults()) { - if (r.getConfidence() >= EncodingResult.CONFIDENCE_DEFINITIVE) { - String declaredDecoded = candidates.get(r.getCharset()); - float declaredJunk = declaredDecoded != null - ? CharSoupLanguageDetector.junkRatio(declaredDecoded) : 1f; - if (declaredJunk < fallbackJunk / 2) { - cleanerDeclared = r.getCharset(); - break; - } + for (EncodingDetectorContext.Result r : context.getResults()) { + if (r.getResultType() == EncodingResult.ResultType.DECLARATIVE) { + String declaredDecoded = candidates.get(r.getCharset()); + float declaredJunk = declaredDecoded != null + ? CharSoupLanguageDetector.junkRatio(declaredDecoded) : 1f; + // Trust the declaration when it decodes at least as cleanly as + // the statistical fallback (≤ junk). A declaration that produces + // MORE junk than the fallback is likely wrong (e.g. a lying BOM). + if (declaredJunk <= fallbackJunk) { + cleanerDeclared = r.getCharset(); + break; } } } @@ -188,22 +194,22 @@ public class CharSoupEncodingDetector implements MetaEncodingDetector { bestCharset = fallback; } - // If a structurally-declared charset (CONFIDENCE_DEFINITIVE, e.g. HTML meta tag) - // decodes the bytes to the same string as the language-quality winner, prefer - // the declaration. This validates the HTML header against the actual bytes: - // if they are functionally equivalent, trust the author's stated encoding. - // If they produce different text (a real conflict), the bytes win. + // If a DECLARATIVE result (e.g. HTML meta charset) decodes the bytes to the same + // string as the language-quality winner, prefer the declaration. This validates the + // declared encoding against the actual bytes: if they are functionally equivalent, + // trust the author's stated encoding. If they produce different text (a real conflict + // — e.g. 
a lying BOM or a wrong meta tag), the bytes win and the language scorer's + // choice stands. // - // Additionally, when the winner and the declared charset are in the same - // confusable peer group (e.g. ISO-8859-6 vs windows-1256) and the declared - // charset decodes cleanly (low junk ratio), the language model cannot - // reliably distinguish them — they both produce valid same-script text. - // In that case, prefer the explicit declaration over the model's guess. + // Additionally, when the winner and a DECLARATIVE charset are in the same confusable + // peer group (e.g. ISO-8859-6 vs windows-1256) and the declared charset decodes + // cleanly (low junk ratio), the language model cannot reliably distinguish them — + // they both produce valid same-script text. Prefer the explicit declaration. String winnerDecoded = candidates.get(bestCharset); float winnerJunk = winnerDecoded != null ? CharSoupLanguageDetector.junkRatio(winnerDecoded) : 1f; if (winnerDecoded != null) { for (EncodingDetectorContext.Result r : context.getResults()) { - if (r.getConfidence() >= EncodingResult.CONFIDENCE_DEFINITIVE + if (r.getResultType() == EncodingResult.ResultType.DECLARATIVE && !r.getCharset().equals(bestCharset)) { Charset declared = r.getCharset(); String declaredDecoded = candidates.get(declared); @@ -214,11 +220,9 @@ public class CharSoupEncodingDetector implements MetaEncodingDetector { context.setArbitrationInfo("scored-prefer-declared"); return declared; } - // When the winner and the declared charset are in the same confusable - // peer group (e.g. ISO-8859-6 vs windows-1256), and the declared - // charset decodes at least as cleanly as the winner (not junkier), - // prefer the explicit declaration — the language model cannot reliably - // distinguish same-script encoding variants. + // Same-script peer group: language model cannot distinguish variants + // (e.g. ISO-8859-6 vs windows-1256 both produce valid Arabic text). 
+ // Prefer the declaration when it decodes at least as cleanly as the winner. float declaredJunk = CharSoupLanguageDetector.junkRatio(declaredDecoded); if (arePeers(bestCharset, declared) && declaredJunk <= winnerJunk) { context.setArbitrationInfo("scored-prefer-declared-peer"); diff --git a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java index 832e195d07..f4f24307cf 100644 --- a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java +++ b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetectorTest.java @@ -191,10 +191,14 @@ public class CharSoupEncodingDetectorTest { assertEquals(0.25f, CharSoupLanguageDetector.junkRatio("abc\u0080"), 0.001f); - // Mixed: \r\n are control chars too - assertEquals(2f / 13f, + // \r and \n are ordinary whitespace — not junk + assertEquals(0f, CharSoupLanguageDetector.junkRatio("hello world\r\n"), 0.001f); + // Non-whitespace C1 control char mixed with ordinary whitespace + assertEquals(1f / 14f, + CharSoupLanguageDetector.junkRatio("hello world\r\n\u0080"), 0.001f); + // Empty/null assertEquals(0f, CharSoupLanguageDetector.junkRatio(""), 0.001f); assertEquals(0f, CharSoupLanguageDetector.junkRatio(null), 0.001f); diff --git a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java index 9487cad414..c2fe6dac76 100644 --- a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java +++ 
b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/HtmlEncodingDetector.java @@ -171,7 +171,8 @@ public class HtmlEncodingDetector implements EncodingDetector { if (charset == null) { return Collections.emptyList(); } - return List.of(new EncodingResult(charset, EncodingResult.CONFIDENCE_DEFINITIVE)); + return List.of(new EncodingResult(charset, 1.0f, charset.name(), + EncodingResult.ResultType.DECLARATIVE)); } //returns null if no charset was found diff --git a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java index 9c7ff115eb..8c7ca1536f 100644 --- a/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-html/src/main/java/org/apache/tika/parser/html/charsetdetector/StandardHtmlEncodingDetector.java @@ -16,8 +16,6 @@ */ package org.apache.tika.parser.html.charsetdetector; -import static org.apache.tika.parser.html.charsetdetector.CharsetAliases.getCharsetByLabel; - import java.io.IOException; import java.io.InputStream; import java.nio.charset.Charset; @@ -29,54 +27,53 @@ import org.apache.commons.io.input.BoundedInputStream; import org.apache.tika.config.TikaComponent; import org.apache.tika.detect.EncodingDetector; import org.apache.tika.detect.EncodingResult; +import org.apache.tika.detect.MetadataCharsetDetector; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; /** - * An encoding detector that tries to respect the spirit of the HTML spec - * part 12.2.3 "The input byte stream", or at least the part that is compatible with 
- * the implementation of tika. - * <p> - * https://html.spec.whatwg.org/multipage/parsing.html#the-input-byte-stream - * <p> - * If a resource was fetched over HTTP, then HTTP headers should be added to tika metadata - * when using {@link #detect}, especially {@link Metadata#CONTENT_TYPE}, as it may contain - * charset information. - * <p> - * This encoding detector may return null if no encoding is detected. - * It is meant to be used inside a {@link org.apache.tika.detect.CompositeEncodingDetector}. - * For instance: - * <pre> {@code - * EncodingDetector detector = new CompositeEncodingDetector( - * Arrays.asList( - * new StandardHtmlEncodingDetector(), - * new Icu4jEncodingDetector())); - * }</pre> - * <p> + * An encoding detector that respects the HTML5 encoding-sniff algorithm + * (https://html.spec.whatwg.org/multipage/parsing.html#the-input-byte-stream): + * BOM → HTTP Content-Type header → {@code <meta charset>} / {@code <meta http-equiv>} tag. + * + * <p>When used standalone (outside a {@link org.apache.tika.detect.CompositeEncodingDetector} + * chain) this detector handles the full spec algorithm including BOM detection. + * + * <p>When used inside the default Tika chain (with {@code BOMDetector} and + * {@code MetadataCharsetDetector} already present), set {@code skipBOM=true} so that + * this detector focuses exclusively on the HTML {@code <meta>} scan. That lets + * {@code CharSoupEncodingDetector} arbitrate between a BOM declaration and a + * contradicting {@code <meta>} declaration instead of silently suppressing one. + * + * <p>HTTP/MIME Content-Type and Content-Encoding metadata are always read here for + * standalone compatibility; in the chain they will already have been returned by + * {@code MetadataCharsetDetector} and {@code CharSoup} will handle the duplication + * gracefully (identical DECLARATIVE results agree, so no harm done). 
*/ @TikaComponent(name = "standard-html-encoding-detector") public final class StandardHtmlEncodingDetector implements EncodingDetector { - private static final int META_TAG_BUFFER_SIZE = 8192; + /** + * Default number of bytes to scan for a {@code <meta charset>} declaration. + * 65536 is large enough to cover typical {@code <script>} or {@code <style>} + * blocks in the {@code <head>} without significant overhead (encoding detection + * already buffers the stream). Users who need to handle even deeper declarations + * can raise this via {@link #setMarkLimit(int)}. + */ + private static final int META_TAG_BUFFER_SIZE = 65536; private int markLimit = META_TAG_BUFFER_SIZE; /** - * Extracts a charset from a Content-Type HTTP header. + * When {@code true}, the BOM check is skipped and the detector goes directly to + * the Content-Type header and {@code <meta>} scan. Use this when + * {@code BOMDetector} is already present in the chain so that + * {@code CharSoupEncodingDetector} can arbitrate between a BOM declaration and a + * contradicting {@code <meta charset>} rather than having the BOM silently win. * - * @param metadata parser metadata - * @return a charset if there is one specified, or null + * <p>Default: {@code false} (HTML5 spec-compliant standalone behaviour).</p> */ - private static Charset charsetFromContentType(Metadata metadata) { - String contentType = metadata.get(Metadata.CONTENT_TYPE); - MediaType mediatype = MediaType.parse(contentType); - if (mediatype == null) { - return null; - } - String charsetLabel = mediatype.getParameters().get("charset"); - return getCharsetByLabel(charsetLabel); - } + private boolean skipBOM = false; @Override public List<EncodingResult> detect(TikaInputStream tis, Metadata metadata, @@ -87,10 +84,19 @@ public final class StandardHtmlEncodingDetector implements EncodingDetector { .setInputStream(tis).setMaxCount(limit).get(); PreScanner preScanner = new PreScanner(limitedStream); - // Priority: 1. BOM 2. 
Content-Type HTTP header 3. HTML <meta> tag - Charset detectedCharset = preScanner.detectBOM(); + Charset detectedCharset = null; + + if (!skipBOM) { + // HTML5 spec: BOM overrides everything. When used standalone this + // detector is responsible for BOM detection; when used in the chain with + // BOMDetector, setting skipBOM=true lets CharSoup arbitrate. + detectedCharset = preScanner.detectBOM(); + } + if (detectedCharset == null) { + detectedCharset = MetadataCharsetDetector.charsetFromContentType(metadata); + } if (detectedCharset == null) { - detectedCharset = charsetFromContentType(metadata); + detectedCharset = MetadataCharsetDetector.charsetFromContentEncoding(metadata); } if (detectedCharset == null) { detectedCharset = preScanner.scan(); @@ -100,7 +106,8 @@ public final class StandardHtmlEncodingDetector implements EncodingDetector { if (detectedCharset == null) { return Collections.emptyList(); } - return List.of(new EncodingResult(detectedCharset, EncodingResult.CONFIDENCE_DEFINITIVE)); + return List.of(new EncodingResult(detectedCharset, 1.0f, + detectedCharset.name(), EncodingResult.ResultType.DECLARATIVE)); } public int getMarkLimit() { @@ -108,10 +115,24 @@ public final class StandardHtmlEncodingDetector implements EncodingDetector { } /** - * How far into the stream to read for charset detection. - * Default is 8192. + * How far into the stream to scan for a {@code <meta charset>} declaration. + * Default is {@value #META_TAG_BUFFER_SIZE} bytes. */ public void setMarkLimit(int markLimit) { this.markLimit = markLimit; } + + public boolean isSkipBOM() { + return skipBOM; + } + + /** + * When {@code true}, skip the BOM check and rely on {@code BOMDetector} in the + * chain. This allows {@code CharSoupEncodingDetector} to arbitrate between a + * BOM and a contradicting {@code <meta charset>} declaration. + * Default is {@code false}. 
+ */ + public void setSkipBOM(boolean skipBOM) { + this.skipBOM = skipBOM; + } } diff --git a/tika-encoding-detectors/tika-encoding-detector-icu4j/pom.xml b/tika-encoding-detectors/tika-encoding-detector-icu4j/pom.xml index c96b3a7ff0..985202bf36 100644 --- a/tika-encoding-detectors/tika-encoding-detector-icu4j/pom.xml +++ b/tika-encoding-detectors/tika-encoding-detector-icu4j/pom.xml @@ -41,6 +41,12 @@ <artifactId>tika-core</artifactId> <version>${revision}</version> </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-annotation-processor</artifactId> + <version>${revision}</version> + <scope>provided</scope> + </dependency> <dependency> <groupId>org.apache.tika</groupId> <artifactId>tika-core</artifactId> @@ -48,6 +54,12 @@ <type>test-jar</type> <scope>test</scope> </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-encoding-detector-universal</artifactId> + <version>${revision}</version> + <scope>test</scope> + </dependency> <dependency> <groupId>org.apache.tika</groupId> <artifactId>tika-serialization</artifactId> diff --git a/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java index 3cf4b435c7..04666b1726 100644 --- a/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java +++ b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/java/org/apache/tika/parser/txt/CharsetDetectorTest.java @@ -129,6 +129,7 @@ public class CharsetDetectorTest extends TikaTest { assertEquals("UTF-8", detector.detect().getName()); } + @org.junit.jupiter.api.Disabled("Integration test requiring TXT parser — run via tika-parser-text-module") @Test public void testIgnoreCharset() throws Exception { //TIKA-3516, TIKA-3525, TIKA-1236 diff --git 
a/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/configs/tika-config-ignore-charset.json b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/configs/tika-config-ignore-charset.json new file mode 100644 index 0000000000..82442e13a2 --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/configs/tika-config-ignore-charset.json @@ -0,0 +1,13 @@ +{ + "parsers": [ + "default-parser" + ], + "encoding-detectors": [ + { + "icu4j-encoding-detector": { + "ignoreCharsets": ["IBM420", "IBM424"] + } + }, + "universal-encoding-detector" + ] +} diff --git a/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/multi-language.txt b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/multi-language.txt new file mode 100644 index 0000000000..ab78d1c2f6 --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/multi-language.txt @@ -0,0 +1,58 @@ +English: ABC Category + +en ABC adhoc internship +en ABC agent uno well connected +en ABC agent uno good connections +en ABC really need uno agent +en ABC special retainer +en ABC let's get this through +en ABC let's push this through +en ABC Token of gratitude +en ABC do not expense +en ABC don't expense +en ABC want to keep you happy +en ABC win win payment +en ABC win win hire +en ABC donation uno business + +Portuguese: ABC Category + +Port ABC pagamento de sucesso +Port ABC pagamento de sucesso +Port ABC presentinho uno agradecimento +Port ABC taxa especial +Port ABC taxa especial +Port ABC realmente preciso uno agente +Port ABC vamos empurrar isso +Port ABC vamos empurrar isso +Port ABC Vamos acabar com isso +Port ABC te deixar feliz +Port ABC te deixar feliz +Port ABC nao contabilize +Port ABC n�o contabilize +Port ABC nao contabilize +Port ABC n�o contabilize +Port ABC doa��o uno neg�cios +Port ABC doacao uno negocios +Port ABC agente 
uno bem conectado +Port ABC agente uno bons contatos +Port ABC estagio adhoc +Port ABC contratacao de sucesso� +Port ABC contrata��o de sucesso� + +Spanish: ABC Category + +espa�ol ABC adhoc pr�cticas +espa�ol ABC intermediario uno bien conectado +espa�ol ABC agente uno buenas conecciones +espa�ol ABC realmente necesito uno un agente +espa�ol ABC anticipo especial +espa�ol ABC hay que pasar +espa�ol ABC hay que forzar +espa�ol ABC muestra de gratitud +espa�ol ABC no registre gasto +espa�ol ABC no reporte gasto +espa�ol ABC mantenerte contento +espa�ol ABC todos ganan con el anticipo +espa�ol ABC donacion uno negocios +espa�ol ABC todos ganan con la contratacion diff --git a/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/resume.html b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/resume.html new file mode 100644 index 0000000000..3e55dcd116 --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/resume.html @@ -0,0 +1,99 @@ +<div class="js-helper"> + <style type="text/css">#style_13209008630000000884_BODY{background-color:#FFFFFF;color:#000000;MARGIN:0px 1px;font-family:Tahoma,Arial,Verdana,Sans-Serif}#style_13209008630000000884 TD{font-size:13px;font-family:Tahoma,Arial,Verdana,Sans-Serif;vertical-align:top}#style_13209008630000000884 CAPTION{font-size:13px;font-weight:bold;text-align:left}#style_13209008630000000884 TR.style_13209008630000000884thead TD{font-weight:bold;text-align:center; padding-bottom:6px;padding-top:6px;padd [...] 
+ + </style> + <div class="mr_read__body" id="style_13209008630000000884"> + <base href="http://e.mail.ru/cgi-bin/" target="_self"/> + + <div id="style_13209008630000000884_BODY"> + + + <style type="text/css"></style> + + + <table border="0" cellpadding="0" cellspacing="0" height="100%" width="100%"> + <tr> + <td> + + </td> + </tr> + <tr> + <td height="100%" style="padding:5px"> + Здравствуйте, !<br> + <br> + Предлагаем Вам ознакомиться со списком зарегистрированных компаний, представители которых + просмотрели Ваше резюме за последние сутки.<br> + <br> + <li><a href="/cgi-bin/link?check=1&cnf=710139&url=http%3A%2F%2;0,0" target="_blank">Компании, + просмотревшие резюме № .</a> Новые: <b>1.</b></li> + <br> + <br> + Эти сведения предоставляются Вам исключительно для информации. Вы можете оперативно отслеживать, + какие именно компании нашли в базе данных Superjob Ваше резюме и заинтересовались им.<br> + <br> + Если Ваше резюме размещено в закрытом доступе, то его могут просматривать только те + работодатели, которым Вы отправили его самостоятельно.<br> + Историю отправки своего резюме Вы можете посмотреть по ссылке «История рассылки резюме».<br> + <br> + <br> + <b>Внимание!</b><br> + В процессе поиска работы Вы можете столкнуться с такими предложениями работодателей или кадровых + агентств, в которых Вас будут просить внести оплату (за предварительное обучение, за оформление + документов, за оформление обязательной страховки, на закупку первой партии продукции компании, + предназначенной для продажи и т.п.) или предоставить отсканированные копии документов (паспорта, + военного билета, трудовой книжки, водительских прав, пенсионного удостоверния и т.п.) для якобы + предварительного оформления или подтверждения данных, указанных в Вашем резюме.<br> + Это один из признаков мошенничества! 
Мы рекомендуем Вам очень осторожно относиться к таким + предложениям и по возможности избегать собеседований с подобными работодателями.<br> + <br> + Также мы настоятельно не рекомендуем отправлять платные SMS-сообщения на короткие номера для + получения контактов или другой информации о вакансии или же для получения результатов + тестирования. С организациями, которые оказывают подобные услуги, мы не сотрудничаем и + предупреждаем, что это тоже один из приемов мошенничества.<br> + <br> + <br> + <em>x</em> <a href="/cgi-bin/link?check=1&cnf=8d972a&url=http%3A%2F%2Fwww.sup;0,0" + target="_blank">Отключить + уведомления о новых просмотрах моих резюме</a><br> + <br> + По ссылкам в этом письме можно войти в систему без ввода пароля. + <br><br> + </td> + </tr> + <tr> + <td> + <span class="style_13209008630000000884noprint"><br><br>Если у Вас есть пожелания и идеи по улучшению сервиса Superjob, пожалуйста, <a + href="/cgi-bin/link?check=1;0,0" target="_blank">напишите нам</a>.<br><br></span> + <table border="0" cellpadding="10" cellspacing="0" class="style_13209008630000000884noprint" + width="100%"> + <tr> + <td align="center" style="border-top:1px solid #BACBD7;"> + <a href="/cgi-bin/link?check=1&cnf=8fa2f9&url=http%3A%2F%2Fwww.;0,0" + target="_blank"><big>Superjob — + Работа должна доставлять удовольствие!</big></a> + </td> + </tr> + </table> + <table border="0" cellpadding="0" cellspacing="1" class="style_13209008630000000884noprint" + width="100%"> + <tr> + <td align="center" style="padding:5px"> + <span style="color:#999999;font-size:8pt;">Письмо отправлено: xx.xx.xxxx xx:xx:xx</span> + </td> + </tr> + </table> + + </td> + </tr> + </table> + + + </div> + + + <base href="http://e.mail.ru/cgi-bin/" target="_self"/> + </div> +</div> + + + diff --git a/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/testIgnoreCharset.txt b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/testIgnoreCharset.txt 
new file mode 100644 index 0000000000..4673e04852 --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/testIgnoreCharset.txt @@ -0,0 +1,4 @@ + +ACTIVE AGE + +BALM diff --git a/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/testTXT_win-1252.txt b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/testTXT_win-1252.txt new file mode 100644 index 0000000000..519c95565a --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/testTXT_win-1252.txt @@ -0,0 +1 @@ +These smart quotes are the trigger for CharsetRecog_sbcs to think this is a �windows� encoding \ No newline at end of file diff --git a/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/test_ignore_IBM420.html b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/test_ignore_IBM420.html new file mode 100644 index 0000000000..2aecab221d Binary files /dev/null and b/tika-encoding-detectors/tika-encoding-detector-icu4j/src/test/resources/test-documents/test_ignore_IBM420.html differ diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java index 7a2b434d63..d920388072 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/MojibusterEncodingDetector.java @@ -94,7 +94,23 @@ public class MojibusterEncodingDetector implements EncodingDetector { * 0x40–0xFE), so their presence is definitive proof that a GB18030 codec * is required to avoid replacement 
characters. */ - GB_FOUR_BYTE_UPGRADE + GB_FOUR_BYTE_UPGRADE, + /** + * Upgrade an ISO-8859-X result to its Windows-12XX equivalent when the + * probe contains at least one CRLF pair ({@code 0x0D 0x0A}) but no C1 + * bytes ({@code 0x80–0x9F}). + * + * <p>Files originating on Windows use CRLF line endings. The presence + * of a {@code \r\n} pair in a probe that is otherwise 7-bit ASCII (or + * has only high bytes above {@code 0x9F}) is weak evidence of Windows + * origin and therefore of a Windows code page. {@link Rule#ISO_TO_WINDOWS} + * already handles the C1-byte case definitively; this rule covers the + * weaker case where C1 bytes have not been seen but CRLF line endings + * suggest Windows origin. A bare {@code 0x0D} (old Mac Classic CR-only + * line ending) does <em>not</em> trigger this rule. Mirrors the legacy + * {@code UniversalEncodingListener.report()} heuristic.</p> + */ + CRLF_TO_WINDOWS } private static final long serialVersionUID = 1L; @@ -278,28 +294,28 @@ public class MojibusterEncodingDetector implements EncodingDetector { } byte[] probe = readProbe(input, maxProbeBytes); + // Strip BOM bytes before feature extraction. BOM detection is handled + // by BOMDetector (which runs earlier in the chain and returns DECLARATIVE). + // BOMs are excluded from training data; stripping ensures consistent + // model inference. A lying BOM (e.g. UTF-8 BOM on a Windows-1252 file) + // is caught by CharSoup comparing BOMDetector's DECLARATIVE claim against + // Mojibuster's byte-content analysis. probe = stripBom(probe); - if (probe.length == 0) { - return singleResult(StandardCharsets.UTF_8.name(), EncodingResult.CONFIDENCE_DEFINITIVE, Integer.MAX_VALUE); - } - + // An empty probe (e.g. empty file, or a file that was only a BOM) falls + // through to detectAll where isPureAscii returns true for a zero-length + // array, yielding the same windows-1252 default as any other pure-ASCII probe. 
return detectAll(probe, Integer.MAX_VALUE); } /** - * Structural gates: deterministic early exits before the general model runs. - * Only bulletproof, zero-false-positive checks belong here. - * EBCDIC discrimination is intentionally absent — it is handled by the sub-model. - */ - /** - * Applies structural encoding rules that produce CONFIDENCE_DEFINITIVE results. - * Returns non-null only when a byte-level pattern unambiguously identifies the charset - * (e.g. ISO-2022 escape sequences, sparse valid UTF-8 multibyte sequences). - * Pure ASCII is deliberately excluded here — ASCII is compatible with virtually all - * single-byte encodings, so it is NOT definitive. Use {@link #applyAsciiHeuristic} - * for the ASCII case. + * Applies structural encoding rules that produce {@link EncodingResult.ResultType#STRUCTURAL} + * results. Returns non-null only when a byte-level pattern unambiguously identifies the + * charset (ISO-2022 escape sequences, sparse valid UTF-8 multibyte sequences). + * + * Pure ASCII is deliberately excluded — ASCII is compatible with virtually all + * single-byte encodings and is not structurally definitive. */ - private Charset applyDefinitiveStructuralRules(byte[] probe) { + private Charset applyStructuralRules(byte[] probe) { // ISO-2022 before ASCII: all three variants are 7-bit so checkAscii fires first. Charset iso2022 = StructuralEncodingRules.detectIso2022(probe); if (iso2022 != null) { @@ -327,11 +343,11 @@ public class MojibusterEncodingDetector implements EncodingDetector { } /** - * Returns UTF-8 if the probe is pure 7-bit ASCII (no bytes ≥ 0x80, no null bytes). - * ASCII is a strict subset of UTF-8 and of every single-byte encoding, so this is - * a heuristic only — the confidence returned by the caller must be below - * CONFIDENCE_DEFINITIVE to allow downstream detectors (e.g. HTML meta charset) to - * override it without ambiguity. + * Returns true if the probe is pure 7-bit ASCII (no bytes ≥ 0x80, no null bytes). 
+ * ASCII is compatible with virtually every single-byte encoding, so this is a + * heuristic — we report US-ASCII to honestly reflect what the probe showed. + * CharSoup will upgrade to a declared encoding (e.g. ISO-8859-15) when the document + * contains an explicit declaration consistent with the ASCII bytes. */ private static boolean isPureAscii(byte[] probe) { return StructuralEncodingRules.checkAscii(probe) && !hasNullBytes(probe); @@ -396,17 +412,24 @@ public class MojibusterEncodingDetector implements EncodingDetector { boolean gates = enabledRules.contains(Rule.STRUCTURAL_GATES); if (gates) { - // Definitive structural rules (BOM, ISO-2022, sparse UTF-8) → CONFIDENCE_DEFINITIVE. - Charset definitive = applyDefinitiveStructuralRules(probe); - if (definitive != null) { - return singleResult(definitive.name(), EncodingResult.CONFIDENCE_DEFINITIVE, topN); + // Structural rules: byte-grammar proof (ISO-2022, sparse UTF-8). + Charset structural = applyStructuralRules(probe); + if (structural != null) { + return singleResult(structural.name(), 1.0f, + EncodingResult.ResultType.STRUCTURAL, topN); } - // ASCII heuristic: pure 7-bit ASCII is valid in virtually every single-byte - // encoding, so we report UTF-8 with a sub-definitive confidence. This allows - // an HTML meta-charset declaration (which IS CONFIDENCE_DEFINITIVE) to override - // the ASCII heuristic without non-deterministic tie-breaking. + // Pure ASCII: no high bytes seen in the probe. We default to windows-1252 — + // the WHATWG-canonical "Western Latin, I saw only ASCII bytes" encoding. + // HTML5 explicitly defines ISO-8859-1 as an alias for windows-1252, making + // windows-1252 the right default: it is the correct superset, it avoids the + // ambiguity between ISO-8859-1 and windows-1252 in the 0x80–0x9F range, and + // it keeps the no-hint path consistent with the HTML-spec path (where a stated + // "charset=iso-8859-1" is normalized to windows-1252 by StandardHtmlEncodingDetector). 
+ // CharSoup will further upgrade to any compatible DECLARATIVE encoding + // (e.g. an HTML meta charset=UTF-8) when one is present and consistent. if (isPureAscii(probe)) { - return singleResult(StandardCharsets.UTF_8.name(), 0.75f, topN); + return singleResult("windows-1252", 0.5f, + EncodingResult.ResultType.STATISTICAL, topN); } } @@ -420,14 +443,12 @@ public class MojibusterEncodingDetector implements EncodingDetector { return runEbcdicSubModel(probe, topN); } - // Grammar filtering can leave the list empty on very short probes (e.g. a single - // high byte that is a valid CJK lead but has an invalid trail byte). Fall back to - // UTF-8 rather than returning empty and causing AutoDetectReader to throw. + // If the model had no evidence (probe too short or all tokens filtered), fall back to + // windows-1252 at very low confidence rather than returning empty and letting + // AutoDetectReader throw. CharSoup will override this with any DECLARATIVE hint. if (results.isEmpty()) { - return singleResult(StandardCharsets.UTF_8.name(), - EncodingResult.CONFIDENCE_DEFINITIVE / 2, Integer.MAX_VALUE); + return singleResult("windows-1252", 0.1f, EncodingResult.ResultType.STATISTICAL, topN); } - return results; } @@ -443,11 +464,29 @@ public class MojibusterEncodingDetector implements EncodingDetector { } } - List<EncodingResult> results = selectByLogitGap(model, logits, topN); + // For short probes the model has limited signal; widen the candidate set so + // that CharSoup's language arbitration can rescue the correct answer even when + // the gap between competitors exceeds LOGIT_GAP. 
+ List<EncodingResult> results; + if (probe.length < 50) { + results = selectTopN(model, logits, 3); + } else if (probe.length < 100) { + results = selectTopN(model, logits, 2); + } else { + results = selectByLogitGap(model, logits, topN); + } if (enabledRules.contains(Rule.ISO_TO_WINDOWS) && StructuralEncodingRules.hasC1Bytes(probe)) { results = upgradeIsoToWindows(results); } + // CRLF_TO_WINDOWS: when C1 bytes were absent (ISO_TO_WINDOWS didn't fire) but + // CRLF pairs suggest Windows line endings, apply the same ISO→Windows upgrade as + // weak evidence of Windows file origin. If ISO_TO_WINDOWS already fired, the + // results are already Windows-12XX and upgradeIsoToWindows is a no-op. + // Bare CR (old Mac Classic line endings) does NOT trigger this rule. + if (enabledRules.contains(Rule.CRLF_TO_WINDOWS) && StructuralEncodingRules.hasCrlfBytes(probe)) { + results = upgradeIsoToWindows(results); + } if (enabledRules.contains(Rule.CJK_GRAMMAR)) { results = refineCjkResults(probe, results); } @@ -475,6 +514,78 @@ public class MojibusterEncodingDetector implements EncodingDetector { * hiding plausible alternatives that downstream arbitrators (e.g. CharSoup) * should evaluate. Linear confidence within the gap preserves the signal.</p> */ + /** + * Maximum confidence assigned to a STATISTICAL model result. Kept strictly + * below 1.0 so that statistical results are never mistaken for STRUCTURAL or + * DECLARATIVE evidence by downstream arbitrators (e.g. CharSoupEncodingDetector). + * The top result from the logit-gap window always maps to this value. + */ + private static final float MAX_STATISTICAL_CONFIDENCE = 0.99f; + + /** + * Return the top {@code n} single-byte/CJK candidates by logit rank, regardless of gap. + * Used for short probes where the model has limited signal and we want + * CharSoup's language arbitration to have multiple candidates to compare. 
+ * <p> + * Wide encodings (UTF-16/32) are excluded: stride-2 features can spuriously + * boost them on very short probes that lack the null-byte density that + * genuinely characterises UTF-16/32. A 9-byte filename without a BOM is + * never UTF-16/32 in practice. + * <p> + * Confidence is still scaled relative to the logit-gap window so that + * results remain in the statistical range below DECLARATIVE/STRUCTURAL. + */ + private static List<EncodingResult> selectTopN(LinearModel m, float[] logits, int n) { + // Collect all positive-logit, non-excluded candidates with their array index. + // We sort by RAW LOGIT (not sigmoid) so that the model's actual ranking is + // preserved even when all logits are large-positive (sigmoid ≈ 1.0 for all). + // Example: GB18030=43, EUC-JP=28, Big5=21 — all sigmoid≈0.99 — would tie on + // sigmoid and fall back to label-insertion order; sorting by logit keeps GB18030 first. + List<int[]> candidates = new ArrayList<>(); // [label-index] + for (int i = 0; i < logits.length; i++) { + // logit ≤ 0 means sigmoid ≤ 0.5 — the model actively disfavours this encoding. + if (logits[i] <= 0) { + continue; + } + String lbl = m.getLabel(i); + if (isExcludedFromShortProbe(lbl)) { + continue; + } + if (labelToCharset(lbl) == null) { + continue; + } + candidates.add(new int[]{i}); + } + // Sort descending by logit so model ranking is preserved. + candidates.sort((a, b) -> Float.compare(logits[b[0]], logits[a[0]])); + + // Take the top N and assign sigmoid confidence so downstream code has a meaningful score. 
+ List<EncodingResult> result = new ArrayList<>(Math.min(n, candidates.size())); + for (int rank = 0; rank < Math.min(n, candidates.size()); rank++) { + int i = candidates.get(rank)[0]; + String lbl = m.getLabel(i); + Charset cs = labelToCharset(lbl); + float conf = (1f / (1f + (float) Math.exp(-logits[i]))) * MAX_STATISTICAL_CONFIDENCE; + result.add(new EncodingResult(cs, conf, lbl, EncodingResult.ResultType.STATISTICAL)); + } + return result; + } + + /** + * Returns true for encodings that should be excluded from short-probe top-N selection. + * <ul> + * <li>Wide encodings (UTF-16/32): stride-2 features spuriously boost them on short + * probes that lack the null-byte density that genuinely characterises UTF-16/32.</li> + * <li>EBCDIC family (IBM4xx, IBM500, "EBCDIC" routing label): EBCDIC has its own + * dedicated sub-model pipeline and should never surface as a candidate for + * short single-byte or CJK content.</li> + * </ul> + */ + private static boolean isExcludedFromShortProbe(String label) { + return label.startsWith("UTF-16") || label.startsWith("UTF-32") + || label.startsWith("IBM") || label.equals("EBCDIC"); + } + private static List<EncodingResult> selectByLogitGap(LinearModel m, float[] logits, int topN) { float maxLogit = Float.NEGATIVE_INFINITY; for (float l : logits) { @@ -486,11 +597,14 @@ public class MojibusterEncodingDetector implements EncodingDetector { List<EncodingResult> results = new ArrayList<>(); for (int i = 0; i < logits.length; i++) { if (logits[i] >= floor) { - float conf = (logits[i] - floor) / LOGIT_GAP; + // Scale to [0, MAX_STATISTICAL_CONFIDENCE] so no statistical result + // reaches 1.0, keeping the range unambiguously below STRUCTURAL/DECLARATIVE. 
+ float conf = ((logits[i] - floor) / LOGIT_GAP) * MAX_STATISTICAL_CONFIDENCE; String lbl = m.getLabel(i); Charset cs = labelToCharset(lbl); if (cs != null) { - results.add(new EncodingResult(cs, conf, lbl)); + results.add(new EncodingResult(cs, conf, lbl, + EncodingResult.ResultType.STATISTICAL)); } } } @@ -552,7 +666,11 @@ public class MojibusterEncodingDetector implements EncodingDetector { return results; } - // Score every CJK charset in the result list. + // Grammar-filter CJK charsets: drop those that produce invalid byte sequences + // (score == 0 means the grammar walker found bad bytes — the model was wrong). + // Charsets that pass grammar keep their model confidence unchanged so that + // all candidates remain on the same sigmoid scale for CharSoup to compare. + // Non-CJK charsets pass through unchanged. List<EncodingResult> refined = new ArrayList<>(results.size()); for (EncodingResult er : results) { if (!CjkEncodingRules.isCjk(er.getCharset())) { @@ -561,17 +679,13 @@ public class MojibusterEncodingDetector implements EncodingDetector { } int score = CjkEncodingRules.match(probe, er.getCharset()); if (score == 0) { - // grammar rejects this charset — drop entirely - } else if (score >= CjkEncodingRules.CLEAN_SHORT_PROBE_CONFIDENCE) { - // structurally clean (bad == 0) — use grammar confidence - refined.add(new EncodingResult(er.getCharset(), score / 100f)); - } else { - // some bad bytes within tolerance — keep model confidence - refined.add(er); + // grammar rejects this charset entirely — drop it + continue; } + // Grammar passes: keep the model's sigmoid confidence so everything + // is on the same scale when CharSoup compares candidates. 
+ refined.add(er); } - - refined.sort((a, b) -> Float.compare(b.getConfidence(), a.getConfidence())); return refined; } @@ -617,7 +731,8 @@ public class MojibusterEncodingDetector implements EncodingDetector { return upgraded; } - private static List<EncodingResult> singleResult(String label, float confidence, int topN) { + private static List<EncodingResult> singleResult(String label, float confidence, + EncodingResult.ResultType type, int topN) { if (topN <= 0) { return Collections.emptyList(); } @@ -625,7 +740,7 @@ public class MojibusterEncodingDetector implements EncodingDetector { if (cs == null) { return Collections.emptyList(); } - return List.of(new EncodingResult(cs, confidence, label)); + return List.of(new EncodingResult(cs, confidence, label, type)); } /** @@ -669,6 +784,10 @@ public class MojibusterEncodingDetector implements EncodingDetector { return model; } + public LinearModel getEbcdicModel() { + return ebcdicModel; + } + public EnumSet<Rule> getEnabledRules() { return EnumSet.copyOf(enabledRules.isEmpty() ? EnumSet.noneOf(Rule.class) : enabledRules); } diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java index a87dd71410..40d409cae6 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/StructuralEncodingRules.java @@ -373,6 +373,35 @@ public final class StructuralEncodingRules { */ private static final double IBM500_LATIN_THRESHOLD = 0.25; + /** + * Returns {@code true} if the probe contains at least one CRLF pair + * ({@code 0x0D 0x0A}). + * + * <p>Files originating on Windows use CRLF as the line separator. 
+ * The presence of a {@code 0x0D 0x0A} pair in a probe that is otherwise + * 7-bit ASCII is weak evidence that the file was created on Windows and + * therefore more likely to use a Windows code page (e.g. windows-1252) + * than a Unix-origin ISO-8859-X encoding for any high-byte content + * beyond the probe window.</p> + * + * <p>A bare {@code 0x0D} without a following {@code 0x0A} is <em>not</em> + * counted: classic Mac OS used bare CR as its line ending, and that is a + * different case that does not imply Windows origin.</p> + */ + public static boolean hasCrlfBytes(byte[] bytes) { + return hasCrlfBytes(bytes, 0, bytes.length); + } + + public static boolean hasCrlfBytes(byte[] bytes, int offset, int length) { + int end = offset + length; + for (int i = offset; i < end - 1; i++) { + if (bytes[i] == 0x0D && bytes[i + 1] == 0x0A) { + return true; + } + } + return false; + } + /** * Returns {@code true} if the probe contains any byte in the C1 control * range {@code 0x80–0x9F}. diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect-ebcdic.bin b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect-ebcdic.bin index f76be8560d..191cdaeedf 100644 Binary files a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect-ebcdic.bin and b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect-ebcdic.bin differ diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect.bin b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect.bin index 31dd657c92..70421e6bb8 100644 Binary files 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect.bin and b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/resources/org/apache/tika/ml/chardetect/chardetect.bin differ diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/EbcdicRoutingTest.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/EbcdicRoutingTest.java new file mode 100644 index 0000000000..60c0803d43 --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/EbcdicRoutingTest.java @@ -0,0 +1,132 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.ml.chardetect; + +import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; + +import java.nio.charset.Charset; +import java.util.Arrays; +import java.util.List; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Test; + +import org.apache.tika.detect.EncodingResult; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.ml.LinearModel; +import org.apache.tika.parser.ParseContext; + +/** + * Verifies the two-phase EBCDIC detection pipeline: + * <ol> + * <li>General model emits {@code "EBCDIC"} routing label for EBCDIC-family bytes.</li> + * <li>{@code MojibusterEncodingDetector} routes to the EBCDIC sub-model, which + * returns a specific IBM variant (IBM500, IBM420, IBM855, etc.) — never the + * bare {@code "EBCDIC"} routing label.</li> + * </ol> + */ +public class EbcdicRoutingTest { + + private static MojibusterEncodingDetector detector; + + // Representative English prose encoded in IBM500 (International EBCDIC). + // Generated via: text.getBytes(Charset.forName("IBM500")) + private static final byte[] IBM500_BYTES = makeEbcdic("IBM500", + "The quick brown fox jumps over the lazy dog. " + + "This sentence contains every letter of the English alphabet. " + + "EBCDIC encoding is used on IBM mainframe systems. " + + "Fields are often fixed-width and space-padded in EBCDIC files."); + + // Russian text encoded in IBM855 (Cyrillic EBCDIC). + private static final byte[] IBM855_BYTES = makeEbcdic("IBM855", + "\u041f\u0440\u0438\u0432\u0435\u0442 \u043c\u0438\u0440! " + // Привет мир! + "\u042d\u0442\u043e \u0442\u0435\u043a\u0441\u0442 \u043d\u0430 " + // Это текст на + "\u0440\u0443\u0441\u0441\u043a\u043e\u043c \u044f\u0437\u044b\u043a\u0435. " + // русском языке. 
+ "\u041a\u043e\u0434\u0438\u0440\u043e\u0432\u043a\u0430 IBM855 " + // Кодировка IBM855 + "\u0438\u0441\u043f\u043e\u043b\u044c\u0437\u0443\u0435\u0442\u0441\u044f " + // используется + "\u043d\u0430 \u043c\u0435\u0439\u043d\u0444\u0440\u0435\u0439\u043c\u0430\u0445."); // на мейнфреймах. + + private static byte[] makeEbcdic(String charsetName, String text) { + try { + return text.getBytes(Charset.forName(charsetName)); + } catch (Exception e) { + throw new RuntimeException("Cannot encode test data as " + charsetName, e); + } + } + + @BeforeAll + static void setUp() { + detector = new MojibusterEncodingDetector(); + } + + /** + * The general model must have exactly one EBCDIC routing label. + * Individual IBM variants must NOT appear as top-level labels — they live + * only in the EBCDIC sub-model. + */ + @Test + public void generalModelHasSingleEbcdicRoutingLabel() { + LinearModel general = detector.getModel(); + String[] labels = general.getLabels(); + + assertTrue(Arrays.asList(labels).contains("EBCDIC"), + "General model must have an 'EBCDIC' routing label"); + + // No individual IBM variant should appear as a direct label in the general model — + // they live only in the EBCDIC sub-model + for (String label : labels) { + assertFalse(label.startsWith("IBM"), + "General model must not contain individual IBM variant: " + label); + } + } + + /** + * IBM500 bytes must route through the sub-model and return a specific IBM variant, + * not the bare "EBCDIC" routing label. 
+ */ + @Test + public void ibm500RoutesToSubModel() throws Exception { + try (TikaInputStream tis = TikaInputStream.get(IBM500_BYTES)) { + List<EncodingResult> results = detector.detect(tis, new Metadata(), new ParseContext()); + assertFalse(results.isEmpty(), "Should detect something for IBM500 bytes"); + String topLabel = results.get(0).getLabel(); + assertNotEquals("EBCDIC", topLabel, + "Result must be a specific IBM variant, not the routing label"); + assertTrue(topLabel.startsWith("IBM"), + "Result should be an IBM variant, got: " + topLabel); + } + } + + /** + * IBM855 (Cyrillic EBCDIC) bytes must similarly route through the sub-model. + */ + @Test + public void ibm855RoutesToSubModel() throws Exception { + try (TikaInputStream tis = TikaInputStream.get(IBM855_BYTES)) { + List<EncodingResult> results = detector.detect(tis, new Metadata(), new ParseContext()); + assertFalse(results.isEmpty(), "Should detect something for IBM855 bytes"); + String topLabel = results.get(0).getLabel(); + assertNotEquals("EBCDIC", topLabel, + "Result must be a specific IBM variant, not the routing label"); + assertTrue(topLabel.startsWith("IBM"), + "Result should be an IBM variant, got: " + topLabel); + } + } +} diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java index 8c94ec1e62..e5ff43e10e 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/ZipFilenameDetectionTest.java @@ -20,31 +20,34 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertTrue; import java.nio.charset.Charset; -import 
java.util.Arrays; import java.util.List; -import java.util.Locale; import org.junit.jupiter.api.Test; +import org.apache.tika.detect.DefaultEncodingDetector; import org.apache.tika.detect.EncodingDetectorContext; import org.apache.tika.detect.EncodingResult; import org.apache.tika.io.TikaInputStream; import org.apache.tika.langdetect.charsoup.CharSoupEncodingDetector; import org.apache.tika.metadata.Metadata; -import org.apache.tika.ml.LinearModel; import org.apache.tika.parser.ParseContext; /** - * Diagnostic: raw logits and CharSoup arbitration for Shift-JIS zip entry name bytes. + * Integration tests for charset detection of short byte sequences typical of + * ZIP entry names — a particularly hard case because the probes are tiny (6-23 + * bytes) and structurally valid in several encodings simultaneously. * - * The v2 model (28 classes) removes the UTF-16/32 labels that were confusing the model. - * With v2, Shift-JIS (logit ~10.5) scores clearly above GB18030 (logit ~6.2) for the - * bytes "文章1.txt" in Shift-JIS encoding. + * Detection strategy: Mojibuster ranks candidates by raw logit; CharSoup + * arbitrates using language signal (positive max-logit wins). 
*/ public class ZipFilenameDetectionTest { // 文章1.txt in Shift-JIS (9 raw bytes from a real zip entry) - private static final byte[] SJIS_RAW = hexToBytes("95b68fcd312e747874"); + private static final byte[] SJIS_RAW = hexToBytes("95b68fcd312e747874"); + // 文章2.txt in Shift-JIS (same but '2' instead of '1') + private static final byte[] SJIS_RAW2 = hexToBytes("95b68fcd322e747874"); + // 审计压缩包文件检索测试/ in GBK (23 bytes from gbk.zip) + private static final byte[] GBK_RAW = hexToBytes("c9f3bcc6d1b9cbf5b0fccec4bcfebceccbf7b2e2cad42f"); private static byte[] hexToBytes(String hex) { byte[] b = new byte[hex.length() / 2]; @@ -54,79 +57,21 @@ public class ZipFilenameDetectionTest { return b; } - private boolean isWideUnicode(String label) { - return label.startsWith("UTF-16") || label.startsWith("UTF-32"); - } - - @Test - public void printModelLabels() throws Exception { - LinearModel model = new MojibusterEncodingDetector().getModel(); - String[] labels = model.getLabels(); - System.out.println("Model labels (" + labels.length + "):"); - for (String l : labels) { - System.out.println(" " + l); - } - long wideCount = Arrays.stream(labels).filter(this::isWideUnicode).count(); - System.out.println("Wide-unicode labels in model: " + wideCount - + " (detected natively via stride-2 features)"); - assertTrue(wideCount >= 4, "Model should have UTF-16/32 labels (LE+BE for each)"); - } - - @Test - public void diagnoseLogits() throws Exception { - MojibusterEncodingDetector detector = new MojibusterEncodingDetector(); - LinearModel model = detector.getModel(); - ByteNgramFeatureExtractor extractor = - new ByteNgramFeatureExtractor(model.getNumBuckets()); - String[] labels = model.getLabels(); - - float[] logits = model.predictLogits(extractor.extract(SJIS_RAW)); - - Integer[] idx = new Integer[labels.length]; - for (int i = 0; i < idx.length; i++) { - idx[i] = i; - } - Arrays.sort(idx, (a, b) -> Float.compare(logits[b], logits[a])); - - System.out.printf(Locale.ROOT, "%n=== Raw 
logits for 文章1.txt (9 bytes) ===%n"); - System.out.printf(Locale.ROOT, "%-24s %8s%n", "charset", "logit"); - System.out.println("-".repeat(35)); - float shiftJisLogit = Float.NEGATIVE_INFINITY; - float gb18030Logit = Float.NEGATIVE_INFINITY; - for (int rank = 0; rank < labels.length; rank++) { - int i = idx[rank]; - boolean cjk = labels[i].contains("JIS") || labels[i].contains("GB") - || labels[i].contains("Big5") || labels[i].contains("EUC"); - if (rank < 6 || cjk) { - System.out.printf(Locale.ROOT, " %-24s %8.2f%n", labels[i], logits[i]); - } - if ("Shift_JIS".equals(labels[i])) { - shiftJisLogit = logits[i]; - } else if ("GB18030".equals(labels[i])) { - gb18030Logit = logits[i]; - } - } - // Verify Shift-JIS ranks ahead of GB18030 on raw (un-tiled) bytes. - // ZipParser no longer tiles short filenames, so this is the actual input. - assertTrue(shiftJisLogit > gb18030Logit, - String.format(Locale.ROOT, - "Shift_JIS logit (%.2f) should beat GB18030 logit (%.2f)", - shiftJisLogit, gb18030Logit)); - } - /** - * Verifies CharSoup correctly picks Shift-JIS when it and GB18030 are both candidates. - * With v2 model, Mojibuster already ranks Shift-JIS above GB18030 (logit ~10.5 vs ~6.2). - * This test uses Shift-JIS as the higher-confidence candidate to reflect that reality. + * CharSoup should confirm Shift-JIS even when Mojibuster ranks Big5-HKSCS first, + * because the language model gives a higher logit to the Japanese text decoded + * from the same bytes. 
*/ @Test - public void charSoupPicksShiftJis() throws Exception { + public void charSoupOverridesModelRankingForShiftJis() throws Exception { + Charset big5 = Charset.forName("Big5-HKSCS"); Charset shiftJis = Charset.forName("Shift_JIS"); - Charset gb18030 = Charset.forName("GB18030"); EncodingDetectorContext ctx = new EncodingDetectorContext(); - ctx.addResult(List.of(new EncodingResult(shiftJis, 0.6f)), "MojibusterEncodingDetector"); - ctx.addResult(List.of(new EncodingResult(gb18030, 0.5f)), "MojibusterEncodingDetector"); + ctx.addResult(List.of( + new EncodingResult(big5, 0.9f, "Big5-HKSCS", EncodingResult.ResultType.STATISTICAL), + new EncodingResult(shiftJis, 0.3f, "Shift_JIS", EncodingResult.ResultType.STATISTICAL) + ), "MojibusterEncodingDetector"); ParseContext parseContext = new ParseContext(); parseContext.set(EncodingDetectorContext.class, ctx); @@ -134,17 +79,46 @@ public class ZipFilenameDetectionTest { CharSoupEncodingDetector charSoup = new CharSoupEncodingDetector(); try (TikaInputStream tis = TikaInputStream.get(SJIS_RAW)) { List<EncodingResult> result = charSoup.detect(tis, new Metadata(), parseContext); + assertTrue(!result.isEmpty(), "CharSoup should return a result"); + assertEquals(shiftJis, result.get(0).getCharset(), + "CharSoup should pick Shift-JIS (文章) over Big5-HKSCS via language signal"); + } + } - System.out.println("\n=== CharSoup arbitration: Shift-JIS(0.6) vs GB18030(0.5) ==="); - System.out.println("arbitration: " + ctx.getArbitrationInfo()); - if (!result.isEmpty()) { - System.out.printf(Locale.ROOT, "winner: %s (conf=%.4f)%n", - result.get(0).getCharset().name(), result.get(0).getConfidence()); - assertEquals(shiftJis, result.get(0).getCharset(), - "CharSoup should confirm Shift-JIS (文章) over GB18030"); - } else { - System.out.println("result: empty — CharSoup abstained (Mojibuster winner stands)"); + /** + * Full pipeline (BOM → Metadata → Mojibuster → StandardHtml → CharSoup) run + * sequentially on two entries differing only 
in byte 5 (0x31 vs 0x32), simulating + * what ZipParser does when iterating entries with the same ParseContext. + */ + @Test + public void fullPipelineDetectsBothSjisEntries() throws Exception { + DefaultEncodingDetector detector = new DefaultEncodingDetector(); + Metadata parentMeta = new Metadata(); + ParseContext outerContext = new ParseContext(); + + for (byte[] raw : new byte[][]{SJIS_RAW, SJIS_RAW2}) { + String label = (raw == SJIS_RAW) ? "文章1.txt" : "文章2.txt"; + try (TikaInputStream tis = TikaInputStream.get(raw)) { + List<EncodingResult> results = detector.detect(tis, parentMeta, outerContext); + String charset = results.isEmpty() ? "(empty)" : results.get(0).getCharset().name(); + assertTrue(!results.isEmpty() && "Shift_JIS".equals(results.get(0).getCharset().name()), + label + " should be detected as Shift_JIS, got: " + charset); } } } + + /** + * Full pipeline should detect GBK-encoded entry names as GB18030. + */ + @Test + public void fullPipelineDetectsGbkEntry() throws Exception { + DefaultEncodingDetector detector = new DefaultEncodingDetector(); + Metadata meta = new Metadata(); + try (TikaInputStream tis = TikaInputStream.get(GBK_RAW)) { + List<EncodingResult> results = detector.detect(tis, meta, new ParseContext()); + String charset = results.isEmpty() ? 
"(empty)" : results.get(0).getCharset().name(); + assertTrue(!results.isEmpty() && results.get(0).getCharset().name().startsWith("GB"), + "GBK entry should be detected as GB18030/GBK, got: " + charset); + } + } } diff --git a/tika-encoding-detectors/tika-encoding-detector-universal/pom.xml b/tika-encoding-detectors/tika-encoding-detector-universal/pom.xml index e461556a93..36f0f8fd53 100644 --- a/tika-encoding-detectors/tika-encoding-detector-universal/pom.xml +++ b/tika-encoding-detectors/tika-encoding-detector-universal/pom.xml @@ -39,6 +39,12 @@ <artifactId>tika-core</artifactId> <version>${revision}</version> </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-annotation-processor</artifactId> + <version>${revision}</version> + <scope>provided</scope> + </dependency> <dependency> <groupId>com.github.albfernandez</groupId> <artifactId>juniversalchardet</artifactId> diff --git a/tika-encoding-detectors/tika-encoding-detector-universal/src/main/java/org/apache/tika/parser/txt/UniversalEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-universal/src/main/java/org/apache/tika/parser/txt/UniversalEncodingDetector.java index 86d1780926..4e1c2dc1a1 100644 --- a/tika-encoding-detectors/tika-encoding-detector-universal/src/main/java/org/apache/tika/parser/txt/UniversalEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-universal/src/main/java/org/apache/tika/parser/txt/UniversalEncodingDetector.java @@ -31,7 +31,7 @@ import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; -@TikaComponent(spi = false) +@TikaComponent(spi = false, name = "universal-encoding-detector") public class UniversalEncodingDetector implements EncodingDetector { private static final int BUFSIZE = 1024; diff --git a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java 
b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java index 75fccbb806..b408ee1841 100644 --- a/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java +++ b/tika-langdetect/tika-langdetect-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupLanguageDetector.java @@ -274,19 +274,13 @@ public class CharSoupLanguageDetector extends LanguageDetector { } /** - * Minimum confidence (inverse logit of the max logit) for a candidate to - * be considered a genuine language match. If no candidate exceeds this - * threshold, the comparison is inconclusive and {@code null} is returned. - * <p> - * 0.88 corresponds to a raw logit of ~2.0. Typical values: - * <ul> - * <li>Arabic (windows-1256): 0.9999994 (logit +14.3)</li> - * <li>UTF-8 garbled: 0.97 (logit +3.5)</li> - * <li>EBCDIC garbage: 0.79 (logit +1.3) — below threshold</li> - * <li>Short English: 0.025 (logit -3.7) — well below threshold</li> - * </ul> + * The language model's max pre-softmax logit must be positive (sigmoid > 0.5) + * for a candidate to be considered a genuine language match. A positive logit + * means the model actively predicts some language as more likely than random; + * a negative logit means the text is too short, too junk-heavy, or too ambiguous + * for any language to stand out. When the best candidate's logit is positive, + * we always return it — the model's relative ordering is the signal we trust. */ - private static final float MIN_CONFIDENCE_THRESHOLD = 0.88f; /** * Maximum ratio of junk characters (U+FFFD replacement chars + C0/C1 @@ -307,24 +301,24 @@ public class CharSoupLanguageDetector extends LanguageDetector { * Compare multiple candidate texts and return the key of the one with * the strongest language signal. Candidates with a high ratio of * replacement or control characters are discarded first. 
Remaining - * candidates are scored using the inverse logit (sigmoid) of the - * model's maximum pre-softmax logit. + * candidates are scored using the model's maximum pre-softmax logit. * <p> - * Returns {@code null} if no candidate exceeds the minimum confidence - * threshold, indicating the comparison is inconclusive. + * The winning candidate is returned if its max logit is positive (sigmoid > 0.5), + * meaning the model actively predicts some language as more likely than random. + * Returns {@code null} if the map is empty, all candidates are junk, or the + * best candidate's logit is non-positive (model has no real signal). * * @param candidates map of arbitrary keys to candidate text strings * @param <K> key type (e.g., {@link java.nio.charset.Charset}) * @return the key whose text has the strongest language signal, - * or {@code null} if the map is empty or no candidate is - * confident enough + * or {@code null} if no candidate has a positive language signal */ public <K> K compareLanguageSignal(Map<K, String> candidates) { if (candidates.isEmpty()) { return null; } - float bestConfidence = Float.NEGATIVE_INFINITY; + float bestMaxLogit = Float.NEGATIVE_INFINITY; K bestKey = null; for (Map.Entry<K, String> entry : candidates.entrySet()) { @@ -337,24 +331,22 @@ public class CharSoupLanguageDetector extends LanguageDetector { int[] features = EXTRACTOR.extract(entry.getValue()); float[] logits = MODEL.predictLogits(features); - float confidence = sigmoid(max(logits)); + float maxLogit = max(logits); - LOG.debug("compareLanguageSignal: {} -> confidence={}", - entry.getKey(), confidence); + LOG.debug("compareLanguageSignal: {} -> maxLogit={}", entry.getKey(), maxLogit); - if (confidence > bestConfidence) { - bestConfidence = confidence; + if (maxLogit > bestMaxLogit) { + bestMaxLogit = maxLogit; bestKey = entry.getKey(); } } - if (bestConfidence < MIN_CONFIDENCE_THRESHOLD) { - LOG.debug("compareLanguageSignal: inconclusive (bestConfidence={} < {})", - 
bestConfidence, MIN_CONFIDENCE_THRESHOLD); - return null; + if (bestKey != null && bestMaxLogit > 0) { + return bestKey; } - return bestKey; + LOG.debug("compareLanguageSignal: inconclusive (bestMaxLogit={})", bestMaxLogit); + return null; } /** @@ -362,7 +354,7 @@ public class CharSoupLanguageDetector extends LanguageDetector { * with the highest logit, its raw logit value, and sigmoid(maxLogit). * Package-private for testing. */ - static float[] maxLogitInfo(String text) { + public static float[] maxLogitInfo(String text) { int[] features = EXTRACTOR.extract(text); float[] logits = MODEL.predictLogits(features); int bestIdx = 0; @@ -375,7 +367,7 @@ public class CharSoupLanguageDetector extends LanguageDetector { } /** Returns the label for a class index (for use alongside {@link #maxLogitInfo}). */ - static String labelAt(int idx) { + public static String labelAt(int idx) { return MODEL.getLabel(idx); } @@ -383,6 +375,24 @@ public class CharSoupLanguageDetector extends LanguageDetector { * Ratio of junk characters (U+FFFD replacement + ISO control + C1 * control range U+0080-U+009F) to total characters. High values * indicate a wrong-charset decoding. + * <p> + * TODO: consider also counting non-ASCII, non-alphabetic, non-digit characters + * (e.g. bullet U+2022, pilcrow U+00B6) as fractional junk (weight ~0.3). + * Single-byte encodings like windows-1256 assign punctuation/symbols to byte + * positions like 0x95 and 0xB6 that multi-byte encodings (Shift_JIS, GB18030) + * use as lead bytes for alphabetic characters. When those bytes appear in text + * that should be meaningful (e.g. filenames), the single-byte interpretation + * "wastes" bytes on punctuation while the multi-byte interpretation yields 100% + * alphabetic content. Counting such punctuation as partial junk would lower the + * MAX_JUNK_RATIO gate for those candidates and pass the decision to the language + * model sooner. 
Needs careful tuning: legitimate body text can intentionally + * contain bullet lists, em-dashes, etc. in windows-125x encodings. + * Counter-example: Chinese GB18030 bytes decoded as UTF-16 produce pairs + * interpreted as Unicode code points — many of which happen to be alphabetic + * (Unicode has alphabetic characters scattered throughout the range), so + * alphabetic yield would look high even for complete mojibake. The language + * model already handles this correctly; the alphabetic density heuristic alone + * would not. */ static float junkRatio(String text) { if (text == null || text.isEmpty()) { @@ -394,8 +404,17 @@ int cp = text.codePointAt(i); i += Character.charCount(cp); total++; - if (cp == 0xFFFD || Character.isISOControl(cp)) { + // U+FFFD = replacement char (wrong-charset decode) + // C1 control range 0x80-0x9F = garbage when a byte is decoded as ISO-8859-1 + // (correct Windows-125x decoding almost never produces these code points; only + // the few unassigned windows-1252 bytes, e.g. 0x81/0x8D/0x90/0x9D, map there). + // Ordinary whitespace (tab, LF, CR, FF, VT) is not junk — it appears in + // source code and structured documents regardless of charset. + if (cp == 0xFFFD) { junk++; + } else if (Character.isISOControl(cp)) { + if (cp != 0x09 && cp != 0x0A && cp != 0x0B && cp != 0x0C && cp != 0x0D) { + junk++; + } } } return total == 0 ?
0f : (float) junk / total; diff --git a/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractorTest.java b/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractorTest.java index 80e71a9bdf..949bd0bc05 100644 --- a/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractorTest.java +++ b/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractorTest.java @@ -46,17 +46,19 @@ public class ByteNgramFeatureExtractorTest { } @Test - public void testAsciiOnlyProducesNoFeatures() { + public void testAsciiOnlyProducesStride2Features() { ByteNgramFeatureExtractor ext = new ByteNgramFeatureExtractor(NUM_BUCKETS); - // All bytes < 0x80 are skipped — HTML tags, ASCII text, etc. produce nothing + // Stride-1 skips bytes < 0x80, but stride-2 covers ALL bytes (needed for UTF-16/32 + // null-byte detection). "hello world" (11 bytes) → 5 stride-2 pairs at positions + // 0,2,4,6,8 → 5 features total. 
byte[] ascii = "hello world".getBytes(java.nio.charset.StandardCharsets.US_ASCII); - assertEquals(0, sum(ext.extract(ascii))); + assertEquals(5, sum(ext.extract(ascii))); } @Test public void testSingleHighByteProducesOneUnigram() { ByteNgramFeatureExtractor ext = new ByteNgramFeatureExtractor(NUM_BUCKETS); - // One high byte → one unigram, no bigram (no following byte) + // One high byte, no following byte → 1 stride-1 unigram; no stride-2 pair int[] counts = ext.extract(new byte[]{(byte) 0xE0}); assertEquals(1, sum(counts)); } @@ -64,25 +66,31 @@ public class ByteNgramFeatureExtractorTest { @Test public void testTwoHighBytesProduceUnigramAndBigram() { ByteNgramFeatureExtractor ext = new ByteNgramFeatureExtractor(NUM_BUCKETS); - // 0xE0 → unigram; (0xE0, 0xE1) → bigram; 0xE1 → unigram = 3 features + // Stride-1: unigram(0xE0) + bigram(0xE0,0xE1) + unigram(0xE1) = 3 + // Stride-2: pair(0xE0,0xE1) at position 0 = 1 + // Total = 4 int[] counts = ext.extract(new byte[]{(byte) 0xE0, (byte) 0xE1}); - assertEquals(3, sum(counts)); + assertEquals(4, sum(counts)); } @Test - public void testHighByteFollowedByAsciiProducesUnigramAndBigram() { + public void testHighByteFollowedByAsciiProducesUnigramBigramAndAnchoredBigram() { ByteNgramFeatureExtractor ext = new ByteNgramFeatureExtractor(NUM_BUCKETS); - // 0xE0 → unigram; (0xE0, 0x41) → bigram; 0x41 is ASCII so no further features = 2 + // Stride-1: unigram(0xE0) + bigram(0xE0,0x41) + anchored_bigram(0x41,end) = 3 + // Stride-2: pair(0xE0,0x41) at position 0 = 1 + // Total = 4 int[] counts = ext.extract(new byte[]{(byte) 0xE0, 0x41}); - assertEquals(2, sum(counts)); + assertEquals(4, sum(counts)); } @Test - public void testAsciiFollowedByHighByteProducesUnigramAndBigram() { + public void testAsciiFollowedByHighByteProducesUnigramAndStride2() { ByteNgramFeatureExtractor ext = new ByteNgramFeatureExtractor(NUM_BUCKETS); - // 0x41 skipped; 0xE0 → unigram; no following byte → 1 feature + // Stride-1: 0x41 skipped; unigram(0xE0), no 
following byte = 1 + // Stride-2: pair(0x41,0xE0) at position 0 = 1 + // Total = 2 int[] counts = ext.extract(new byte[]{0x41, (byte) 0xE0}); - assertEquals(1, sum(counts)); + assertEquals(2, sum(counts)); } @Test diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java index 493d47d5a9..b01dedded4 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java @@ -1004,8 +1004,8 @@ public class HtmlParserTest extends TikaTest { } assertEquals("text/html; charset=UTF-ELEVEN", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT)); - // "UTF-ELEVEN" is not a valid charset; ML detection returns UTF-8 for ASCII content. - assertEquals("text/html; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); + // "UTF-ELEVEN" is not a valid charset; no declaration available, ML defaults to windows-1252. + assertEquals("text/html; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE)); test = "<html><head><meta http-equiv=\"content-type\" content=\"application/pdf\">" + "</head><title>title</title><body>body</body></html>"; @@ -1017,8 +1017,8 @@ public class HtmlParserTest extends TikaTest { metadata, new ParseContext()); } assertEquals("application/pdf", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT)); - // No valid charset declaration; ML detection returns UTF-8 for ASCII content. - assertEquals("text/html; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); + // No valid charset declaration; ML defaults to windows-1252 for pure ASCII content. 
+ assertEquals("text/html; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE)); //test two content values test = @@ -1033,8 +1033,8 @@ public class HtmlParserTest extends TikaTest { metadata, new ParseContext()); } assertEquals("application/pdf", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT)); - // No valid charset declaration; ML detection returns UTF-8 for ASCII content. - assertEquals("text/html; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); + // No valid charset declaration; ML defaults to windows-1252 for pure ASCII content. + assertEquals("text/html; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE)); } @Test @@ -1074,8 +1074,8 @@ public class HtmlParserTest extends TikaTest { assertEquals("text/html; charset=iso-NUMBER_SEVEN", metadata.get(TikaCoreProperties.CONTENT_TYPE_HINT)); - // "iso-NUMBER_SEVEN" is not a valid charset; ML detection returns UTF-8 for ASCII content. - assertEquals("application/xhtml+xml; charset=UTF-8", + // "iso-NUMBER_SEVEN" is not a valid charset; ML defaults to windows-1252 for pure ASCII. 
+ assertEquals("application/xhtml+xml; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE)); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java index b0080ee761..883899fa14 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/POIContainerExtractionTest.java @@ -186,7 +186,8 @@ public class POIContainerExtractionTest extends AbstractPOIContainerExtractionTe expected.add("application/vnd.openxmlformats-officedocument.presentationml.presentation"); expected.add("application/pdf"); expected.add("application/xml"); - expected.add("text/plain; charset=ISO-8859-1"); + // CRLF line endings in this embedded text file trigger the ISO→Windows upgrade heuristic + expected.add("text/plain; charset=windows-1252"); //test that we're correctly handling attachment variants for // files created by WPS 表格 (https://www.wps.cn/) for (String suffix : new String[]{ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java index 0f33218e76..62ec3bc8e3 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java +++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/ZipParser.java @@ -109,6 +109,7 @@ public class ZipParser extends AbstractArchiveParser { */ private static final int MAX_INTEGRITY_CHECK_ENTRIES = 100; + private final ZipParserConfig defaultConfig; private static Set<MediaType> loadZipSpecializations() { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/pom.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/pom.xml index 227ebd31cb..bcc316d361 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/pom.xml +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/pom.xml @@ -34,13 +34,11 @@ <groupId>org.apache.tika</groupId> <artifactId>tika-encoding-detector-mojibuster</artifactId> <version>${project.version}</version> - <scope>test</scope> </dependency> <dependency> <groupId>org.apache.tika</groupId> <artifactId>tika-encoding-detector-charsoup</artifactId> <version>${project.version}</version> - <scope>test</scope> </dependency> <dependency> <groupId>commons-codec</groupId> @@ -50,6 +48,18 @@ <groupId>org.apache.commons</groupId> <artifactId>commons-csv</artifactId> </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-encoding-detector-icu4j</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-encoding-detector-universal</artifactId> + <version>${project.version}</version> + <scope>test</scope> + </dependency> <dependency> <groupId>com.google.guava</groupId> <artifactId>guava</artifactId> diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java index 67358c5403..a32d063223 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/csv/TextAndCSVParserTest.java @@ -101,7 +101,7 @@ public class TextAndCSVParserTest extends TikaTest { metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv"); XMLResult xmlResult = getXML(TikaInputStream.get(CSV_UTF8), PARSER, metadata); assertEquals("comma", xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY)); - assertMediaTypeEquals("csv", "UTF-8", "comma", + assertMediaTypeEquals("csv", "windows-1252", "comma", xmlResult.metadata.get(Metadata.CONTENT_TYPE)); assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml); assertEquals(3, metadata.getInt(TextAndCSVParser.NUM_COLUMNS)); @@ -126,7 +126,7 @@ public class TextAndCSVParserTest extends TikaTest { metadata.set(Metadata.CONTENT_TYPE, "text/csv"); XMLResult xmlResult = getXML(TikaInputStream.get(CSV_UTF8), PARSER, metadata); assertEquals("comma", xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY)); - assertMediaTypeEquals("csv", "UTF-8", "comma", + assertMediaTypeEquals("csv", "windows-1252", "comma", xmlResult.metadata.get(Metadata.CONTENT_TYPE)); assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_CSV, xmlResult.xml); } @@ -160,7 +160,7 @@ public class TextAndCSVParserTest extends TikaTest { metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv"); XMLResult xmlResult = getXML(TikaInputStream.get(TSV_UTF8), PARSER, metadata); assertEquals("tab", xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY)); - assertMediaTypeEquals("tsv", "UTF-8", "tab", + assertMediaTypeEquals("tsv", "windows-1252", 
"tab", xmlResult.metadata.get(Metadata.CONTENT_TYPE)); assertContainsIgnoreWhiteSpaceDiffs(EXPECTED_TSV, xmlResult.xml); } @@ -191,7 +191,7 @@ public class TextAndCSVParserTest extends TikaTest { metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, "test.csv"); XMLResult xmlResult = getXML(TikaInputStream.get(csv), PARSER, metadata); assertNull(xmlResult.metadata.get(TextAndCSVParser.DELIMITER_PROPERTY)); - assertEquals("text/plain; charset=UTF-8", + assertEquals("text/plain; charset=windows-1252", xmlResult.metadata.get(Metadata.CONTENT_TYPE)); assertContains("the,quick", xmlResult.xml); } @@ -225,7 +225,7 @@ public class TextAndCSVParserTest extends TikaTest { XMLResult xmlResult = getXML(TikaInputStream.get(sb.toString().getBytes(StandardCharsets.UTF_8)), PARSER, metadata); - assertMediaTypeEquals("csv", "UTF-8", "comma", + assertMediaTypeEquals("csv", "windows-1252", "comma", xmlResult.metadata.get(Metadata.CONTENT_TYPE)); } @@ -233,8 +233,7 @@ public class TextAndCSVParserTest extends TikaTest { @Test public void testSubclassingMimeTypesRemain() throws Exception { XMLResult r = getXML("testVCalendar.vcs"); - // Pure ASCII content — correctly detected as UTF-8 - assertEquals("text/x-vcalendar; charset=UTF-8", r.metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("text/x-vcalendar; charset=windows-1252", r.metadata.get(Metadata.CONTENT_TYPE)); } @Test diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java index a29fb299f1..b02236a4e5 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java +++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/txt/TXTParserTest.java @@ -29,9 +29,11 @@ import org.xml.sax.ContentHandler; import org.xml.sax.helpers.DefaultHandler; import org.apache.tika.TikaTest; +import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; +import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; @@ -54,8 +56,8 @@ public class TXTParserTest extends TikaTest { } String content = writer.toString(); - // Pure ASCII — correctly detected as UTF-8 (ASCII is a subset of UTF-8) - assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); + // Pure ASCII — detected as windows-1252 (the HTML5/WHATWG default for 8-bit Western) + assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE)); // TIKA-501: Remove language detection from TXTParser assertNull(metadata.get(Metadata.CONTENT_LANGUAGE)); @@ -89,7 +91,7 @@ public class TXTParserTest extends TikaTest { try (TikaInputStream tis = TikaInputStream.get(new byte[0])) { parser.parse(tis, handler, metadata, new ParseContext()); } - assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE)); assertEquals("\n", handler.toString()); } @@ -102,9 +104,9 @@ public class TXTParserTest extends TikaTest { */ @Test public void testLatinDetectionHeuristics() throws Exception { - // These were previously testing CR/LF heuristics specific to UniversalEncodingDetector. - // The ML-based detector (MojibusterEncodingDetector + CharSoup) correctly identifies - // pure-ASCII content as UTF-8 and does not rely on line-ending heuristics. 
+ // Previously tested CR/LF heuristics specific to UniversalEncodingDetector. + // The ML-based detector defaults to windows-1252 for pure ASCII regardless of + // line endings (CRLF_TO_WINDOWS is a secondary confirmation, not the primary path). String windows = "test\r\n"; String unix = "test\n"; String euro = "test \u20ac\n"; @@ -115,15 +117,13 @@ public class TXTParserTest extends TikaTest { try (TikaInputStream tis = TikaInputStream.get(windows.getBytes("ISO-8859-15"))) { parser.parse(tis, new DefaultHandler(), metadata, new ParseContext()); } - // Pure ASCII — UTF-8 is correct - assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE)); metadata = new Metadata(); try (TikaInputStream tis = TikaInputStream.get(unix.getBytes("ISO-8859-15"))) { parser.parse(tis, new DefaultHandler(), metadata, new ParseContext()); } - // Pure ASCII — UTF-8 is correct - assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE)); metadata = new Metadata(); try (TikaInputStream tis = TikaInputStream.get(euro.getBytes("ISO-8859-15"))) { @@ -247,8 +247,7 @@ public class TXTParserTest extends TikaTest { parser.parse(tis, new WriteOutContentHandler(writer), metadata, new ParseContext()); } - // Pure ASCII — UTF-8 is correct - assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("text/plain; charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE)); } /** @@ -264,11 +263,10 @@ public class TXTParserTest extends TikaTest { try (TikaInputStream tis = TikaInputStream.get(text.getBytes(UTF_8))) { parser.parse(tis, new BodyContentHandler(), metadata, new ParseContext()); } - // Pure ASCII — UTF-8 is correct - assertEquals("text/plain; charset=UTF-8", metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("text/plain; 
charset=windows-1252", metadata.get(Metadata.CONTENT_TYPE)); - // Now verify that if we tell the parser the encoding is UTF-8, that's what - // we get back (see TIKA-868) + // TIKA-868: MetadataCharsetDetector (tika-core) reads the charset from Content-Type + // and returns it as DECLARATIVE, which CharSoup prefers over the statistical windows-1252. metadata.set(Metadata.CONTENT_TYPE, "application/binary; charset=UTF-8"); try (TikaInputStream tis = TikaInputStream.get(text.getBytes(UTF_8))) { parser.parse(tis, new BodyContentHandler(), metadata, new ParseContext()); @@ -280,8 +278,23 @@ public class TXTParserTest extends TikaTest { @Test public void testSubclassingMimeTypesRemain() throws Exception { XMLResult r = getXML("testVCalendar.vcs"); - // Pure ASCII content — correctly detected as UTF-8 - assertEquals("text/x-vcalendar; charset=UTF-8", r.metadata.get(Metadata.CONTENT_TYPE)); + assertEquals("text/x-vcalendar; charset=windows-1252", r.metadata.get(Metadata.CONTENT_TYPE)); + } + + // TIKA-3516, TIKA-3525, TIKA-1236 + @Test + public void testIgnoreCharset() throws Exception { + AutoDetectParser parser = (AutoDetectParser) TikaLoader.load( + getConfigPath(TXTParserTest.class, "tika-config-ignore-charset.json")) + .loadAutoDetectParser(); + + Metadata m = new Metadata(); + m.set(TikaCoreProperties.RESOURCE_NAME_KEY, "texty-text.txt"); + assertContains("ACTIVE AGE", getXML("testIgnoreCharset.txt", parser, m).xml); + + m = new Metadata(); + m.set(TikaCoreProperties.RESOURCE_NAME_KEY, "texty-text.txt"); + assertContains("Please check your email", getXML("test_ignore_IBM420.html", parser, m).xml); } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java index 73928000bb..31421a12c9 100644 --- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java @@ -32,12 +32,15 @@ import org.junit.jupiter.api.Test; import org.apache.tika.TikaLoaderHelper; import org.apache.tika.TikaTest; import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.detect.BOMDetector; import org.apache.tika.detect.CompositeEncodingDetector; import org.apache.tika.detect.EncodingDetector; import org.apache.tika.detect.MetaEncodingDetector; +import org.apache.tika.detect.MetadataCharsetDetector; import org.apache.tika.detect.OverrideEncodingDetector; import org.apache.tika.exception.TikaConfigException; import org.apache.tika.io.TikaInputStream; +import org.apache.tika.langdetect.charsoup.CharSoupEncodingDetector; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.ml.chardetect.MojibusterEncodingDetector; @@ -56,13 +59,15 @@ public class TikaEncodingDetectorTest extends TikaTest { EncodingDetector detector = TikaLoader.loadDefault().loadEncodingDetectors(); assertTrue(detector instanceof CompositeEncodingDetector); List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors(); - // 2 base detectors (ML, StandardHtml) + CharSoupEncodingDetector (MetaEncodingDetector) - assertEquals(3, detectors.size()); + // 4 base detectors (BOM, Metadata, ML, StandardHtml) + CharSoupEncodingDetector (MetaEncodingDetector) + assertEquals(5, detectors.size()); // meta detector is always last (partitioned by CompositeEncodingDetector) - assertTrue(detectors.get(2) instanceof MetaEncodingDetector); + assertTrue(detectors.get(4) instanceof MetaEncodingDetector); // base detectors — sorted by full class name; check by type - Set<Class<?>> baseClasses = detectors.subList(0, 2).stream() + 
Set<Class<?>> baseClasses = detectors.subList(0, 4).stream() .map(Object::getClass).collect(Collectors.toSet()); + assertTrue(baseClasses.contains(BOMDetector.class)); + assertTrue(baseClasses.contains(MetadataCharsetDetector.class)); assertTrue(baseClasses.contains(MojibusterEncodingDetector.class)); assertTrue(baseClasses.contains(StandardHtmlEncodingDetector.class)); } @@ -81,12 +86,14 @@ public class TikaEncodingDetectorTest extends TikaTest { assertTrue(detector1 instanceof CompositeEncodingDetector); List<EncodingDetector> detectors1Children = ((CompositeEncodingDetector) detector1).getDetectors(); - // ML base detector + CharSoup meta (html excluded) - assertEquals(2, detectors1Children.size()); - Set<Class<?>> innerClasses = detectors1Children.subList(0, 1).stream() + // BOM + Metadata + ML base detectors + CharSoup meta (html excluded) + assertEquals(4, detectors1Children.size()); + Set<Class<?>> innerClasses = detectors1Children.subList(0, 3).stream() .map(Object::getClass).collect(Collectors.toSet()); + assertTrue(innerClasses.contains(BOMDetector.class)); + assertTrue(innerClasses.contains(MetadataCharsetDetector.class)); assertTrue(innerClasses.contains(MojibusterEncodingDetector.class)); - assertTrue(detectors1Children.get(1) instanceof MetaEncodingDetector); + assertTrue(detectors1Children.get(3) instanceof MetaEncodingDetector); assertTrue(detectors.get(1) instanceof OverrideEncodingDetector); @@ -178,9 +185,9 @@ public class TikaEncodingDetectorTest extends TikaTest { ((AbstractEncodingDetectorParser) encodingDetectingParser) .getEncodingDetector(); assertTrue(encodingDetector instanceof CompositeEncodingDetector); - // ML, Html base detectors + CharSoup MetaEncodingDetector + // BOM, Metadata, ML, Html base detectors + CharSoup MetaEncodingDetector // (ICU4J is excluded but was already not in the default chain) - assertEquals(3, ((CompositeEncodingDetector) encodingDetector).getDetectors().size()); + assertEquals(5, ((CompositeEncodingDetector) 
encodingDetector).getDetectors().size()); for (EncodingDetector child : ((CompositeEncodingDetector) encodingDetector) .getDetectors()) { assertNotContained("cu4j", child.getClass().getCanonicalName()); @@ -207,14 +214,17 @@ public class TikaEncodingDetectorTest extends TikaTest { assertTrue(encodingDetector instanceof CompositeEncodingDetector); List<EncodingDetector> children = ((CompositeEncodingDetector) encodingDetector).getDetectors(); - assertEquals(3, children.size(), childParser.getClass().toString()); + // 3 base detectors + 1 MetaEncodingDetector (CharSoup) = 4 total + assertEquals(4, children.size(), childParser.getClass().toString()); assertTrue(children.get(0) instanceof MojibusterEncodingDetector, childParser.getClass().toString()); HtmlEncodingDetector htmlDet = (HtmlEncodingDetector) children.get(1); - assertEquals(64000, htmlDet.getDefaultConfig().getMarkLimit(), + assertEquals(100000, htmlDet.getDefaultConfig().getMarkLimit(), childParser.getClass().toString()); assertTrue(children.get(2) instanceof StandardHtmlEncodingDetector, childParser.getClass().toString()); + assertTrue(children.get(3) instanceof CharSoupEncodingDetector, + childParser.getClass().toString()); } } @@ -222,7 +232,8 @@ public class TikaEncodingDetectorTest extends TikaTest { public void testMarkLimitIntegration() throws Exception { StringBuilder sb = new StringBuilder(); sb.append("<html><head><script>"); - for (int i = 0; i < 4000; i++) { //script length = 20000 + // script length = ~80000 bytes, beyond the default mark limit of 65536 + for (int i = 0; i < 16000; i++) { sb.append("blah "); } sb.append("</script>"); @@ -233,19 +244,23 @@ public class TikaEncodingDetectorTest extends TikaTest { byte[] bytes = sb.toString().getBytes(StandardCharsets.UTF_8); - // The new pipeline (StandardHtmlEncodingDetector reads past the script block - // and finds the meta charset) correctly detects UTF-8 even by default. 
+ // Default: the meta charset is buried at ~byte 80,000, past the default + // mark limit of 65536. The detector falls back to windows-1252 for the + // pure-ASCII probe. HTML entities (ø) render correctly regardless; + // raw UTF-8 multibyte sequences (e.g. ø in "økologisk") are garbled. + // Raise the mark limit via config to fix this (see below). Parser p = AUTO_DETECT_PARSER; Metadata metadata = new Metadata(); String xml = getXML(TikaInputStream.get(bytes), p, metadata).xml; - assertContains("gr\u00F8nd", xml); - assertContains("\u00f8kologisk", xml); - assertContains("gr\u00F8nt", xml); - assertContains("g\u00E5 til", xml); + assertContains("gr\u00F8nd", xml); // ø entity — correct regardless + assertNotContained("\u00f8kologisk", xml); // raw UTF-8 bytes — garbled by default + assertNotContained("gr\u00F8nt", xml); + assertNotContained("g\u00E5 til", xml); - //now test that fix works + // With a raised mark limit the detector reaches the meta charset and + // correctly decodes UTF-8 content. 
p = TikaLoaderHelper.getLoader("TIKA-2485-encoding-detector-mark-limits.json").loadAutoDetectParser(); metadata = new Metadata(); @@ -266,10 +281,12 @@ public class TikaEncodingDetectorTest extends TikaTest { assertTrue(detector instanceof CompositeEncodingDetector); List<EncodingDetector> detectors = ((CompositeEncodingDetector) detector).getDetectors(); - // 2 base detectors (ML + StandardHtml), no MetaEncodingDetector - assertEquals(2, detectors.size()); + // 4 base detectors (BOM + Metadata + ML + StandardHtml), no MetaEncodingDetector + assertEquals(4, detectors.size()); Set<Class<?>> excludedCharSoupClasses = detectors.stream() .map(Object::getClass).collect(Collectors.toSet()); + assertTrue(excludedCharSoupClasses.contains(BOMDetector.class)); + assertTrue(excludedCharSoupClasses.contains(MetadataCharsetDetector.class)); assertTrue(excludedCharSoupClasses.contains(MojibusterEncodingDetector.class)); assertTrue(excludedCharSoupClasses.contains(StandardHtmlEncodingDetector.class)); for (EncodingDetector d : detectors) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java index 3e05518ebc..8f9b957e90 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java @@ -55,7 +55,7 @@ public class AutoDetectParserTest extends TikaTest { // Easy to read constants for the MIME types: private static final String RAW = "application/octet-stream"; private static final String EXCEL = "application/vnd.ms-excel"; - private static final String HTML = "text/html; charset=UTF-8"; + private static final String HTML = "text/html; charset=windows-1252"; 
private static final String PDF = "application/pdf"; private static final String POWERPOINT = "application/vnd.ms-powerpoint"; private static final String KEYNOTE = "application/vnd.apple.keynote"; @@ -63,7 +63,7 @@ public class AutoDetectParserTest extends TikaTest { private static final String NUMBERS = "application/vnd.apple.numbers"; private static final String CHM = "application/vnd.ms-htmlhelp"; private static final String RTF = "application/rtf"; - private static final String PLAINTEXT = "text/plain; charset=UTF-8"; + private static final String PLAINTEXT = "text/plain; charset=windows-1252"; private static final String UTF8TEXT = "text/plain; charset=UTF-8"; private static final String WORD = "application/msword"; private static final String XML = "application/xml"; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java index 1411481f1c..7bd45cf813 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/microsoft/rtf/RTFParserTest.java @@ -46,11 +46,11 @@ public class RTFParserTest extends TikaTest { public void testEmbeddedMonster() throws Exception { Map<Integer, Pair> expected = new HashMap<>(); - expected.put(3, new Pair("Hw.txt", "text/plain; charset=UTF-8")); + expected.put(3, new Pair("Hw.txt", "text/plain; charset=windows-1252")); expected.put(4, new Pair("file_0.doc", "application/msword")); expected.put(7, new Pair("file_1.xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")); - expected.put(10, new Pair("text.html", "text/html; charset=UTF-8")); + expected.put(10, new Pair("text.html", "text/html; 
charset=windows-1252")); expected.put(11, new Pair("html-within-zip.zip", "application/zip")); expected.put(12, new Pair("test-zip-of-zip_\u666E\u6797\u65AF\u987F.zip", "application/zip")); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index b7565f11f2..76910c56ff 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -252,7 +252,7 @@ public class PDFParserTest extends TikaTest { metadatas.get(1).get(Metadata.CONTENT_TYPE)); assertImageContentType("image/tiff", metadatas.get(2).get(Metadata.CONTENT_TYPE)); - assertEquals("text/plain; charset=UTF-8", metadatas.get(3).get(Metadata.CONTENT_TYPE)); + assertEquals("text/plain; charset=windows-1252", metadatas.get(3).get(Metadata.CONTENT_TYPE)); assertEquals(TYPE_DOC.toString(), metadatas.get(4).get(Metadata.CONTENT_TYPE)); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java index 785206f8da..bec188b8d8 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/PackageParserTest.java @@ -18,7 +18,6 @@ package org.apache.tika.parser.pkg; import java.util.List; -import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.apache.tika.TikaTest; @@ -33,7 +32,6 @@ 
public class PackageParserTest extends TikaTest { assertContains("审计压缩", metadataList.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY)); } - @Disabled("TIKA-4662: ML model confuses Shift_JIS with Big5 on 9-byte zip entry name probes; needs model improvement") @Test public void handleEntryNameWithCharsetShiftJIS() throws Exception { List<Metadata> metadataList = getRecursiveMetadata("testZipEntryNameCharsetShiftSJIS.zip"); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2485-encoding-detector-mark-limits.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2485-encoding-detector-mark-limits.json index 6da5365b70..8275da4bfc 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2485-encoding-detector-mark-limits.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2485-encoding-detector-mark-limits.json @@ -5,11 +5,16 @@ }, { "html-encoding-detector": { - "markLimit": 64000 + "markLimit": 100000 } }, { - "standard-html-encoding-detector": {} + "standard-html-encoding-detector": { + "markLimit": 100000 + } + }, + { + "charsoup-encoding-detector": {} } ] }
