(tika) 02/03: split strides into separate model space

tallison Mon, 13 Apr 2026 10:40:23 -0700

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch charset-detection-improvements
in repository https://gitbox.apache.org/repos/asf/tika.git


commit 4b6c3b60e2053681325ccfcca39752b4ceafed82
Author: tallison <[email protected]>
AuthorDate: Mon Apr 13 13:22:32 2026 -0400

    split strides into separate model space
---
 .../ml/chardetect/ByteNgramFeatureExtractor.java   | 145 ++++++-
 .../ml/chardetect/tools/BucketCollisionAudit.java  | 459 +++++++++++++++++++++
 .../ConfigurableByteNgramFeatureExtractor.java     | 105 +++--
 .../ml/chardetect/tools/TraceCharsetLogits.java    | 380 +++++++++++++++++
 .../ml/chardetect/tools/TrainCharsetModel.java     |  14 +-
 .../chardetect/ConfigurableGlobalFeatureTest.java  |  73 ++++
 .../ml/chardetect/FeatureExtractorParityTest.java  |  97 +++++
 7 files changed, 1232 insertions(+), 41 deletions(-)

diff --git 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractor.java
 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractor.java
index 1dfb9cfe23..baa67fbc47 100644
--- 
a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractor.java
+++ 
b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractor.java
@@ -93,22 +93,121 @@ public class ByteNgramFeatureExtractor implements 
FeatureExtractor<byte[]> {
     /** Distinct salt for stride-2 bigrams — prevents collision with stride-1 
hashes. */
     private static final int FNV_STRIDE2_SALT = 0x9e3779b9;
 
+    /**
+     * Number of reserved slots at the high end of the feature vector used for
+     * global (whole-probe) features when {@link #useGlobalFeatures} is 
enabled.
+     * Currently 6 slots hold ASCII-text-density bins (see
+     * {@link #asciiDensityBin(byte[])}).  Must match the training-side
+     * {@code ConfigurableByteNgramFeatureExtractor.GLOBAL_FEATURE_COUNT}.
+     */
+    public static final int GLOBAL_FEATURE_COUNT = 6;
+
     private final int numBuckets;
+    private final int stride1Buckets;
+    private final int stride2Buckets;
+    private final int stride2Base;
+    private final int globalBase;
+    private final boolean useGlobalFeatures;
+    private final boolean useSplitSpaces;
 
     /**
-     * Create an extractor with the production feature set (UBT-: unigrams +
-     * bigrams + trigrams, no anchored bigrams) and the given bucket count.
-     * The bucket count must match the model the extractor will be paired with 
—
-     * in practice this is read from the model binary via
-     * {@link org.apache.tika.ml.LinearModel#getNumBuckets()}.
+     * Legacy constructor: no globals, shared stride-1/stride-2 hash space.
+     * Matches the layout used by the shipped {@code 
chardetect-v6-no-utf32.bin}.
      *
      * @param numBuckets number of hash buckets (feature-vector dimension)
      */
     public ByteNgramFeatureExtractor(int numBuckets) {
+        this(numBuckets, false, false);
+    }
+
+    /**
+     * Create an extractor matching the layout of a trained model.
+     *
+     * @param numBuckets         total feature-vector dimension.
+     * @param useGlobalFeatures  reserve the last {@link #GLOBAL_FEATURE_COUNT}
+     *                           slots for ASCII-density bin features.
+     * @param useSplitSpaces     split the hash space 50/50 between stride-1
+     *                           features (low half) and stride-2 features
+     *                           (high half) so cross-family hash collisions
+     *                           cannot pollute single-byte-charset weights
+     *                           with stride-2 signals.
+     */
+    public ByteNgramFeatureExtractor(int numBuckets,
+                                     boolean useGlobalFeatures,
+                                     boolean useSplitSpaces) {
         if (numBuckets <= 0) {
             throw new IllegalArgumentException("numBuckets must be positive: " 
+ numBuckets);
         }
+        int globalsReserved = useGlobalFeatures ? GLOBAL_FEATURE_COUNT : 0;
+        int hashSpace = numBuckets - globalsReserved;
+        if (hashSpace <= 0) {
+            throw new IllegalArgumentException(
+                    "numBuckets must exceed GLOBAL_FEATURE_COUNT when 
useGlobalFeatures=true: "
+                            + numBuckets);
+        }
+        if (useSplitSpaces && hashSpace < 2) {
+            throw new IllegalArgumentException(
+                    "useSplitSpaces requires hashSpace >= 2: " + hashSpace);
+        }
         this.numBuckets = numBuckets;
+        this.useSplitSpaces = useSplitSpaces;
+        this.useGlobalFeatures = useGlobalFeatures;
+        if (useSplitSpaces) {
+            this.stride1Buckets = hashSpace / 2;
+            this.stride2Buckets = hashSpace - this.stride1Buckets;
+            this.stride2Base = this.stride1Buckets;
+        } else {
+            this.stride1Buckets = hashSpace;
+            this.stride2Buckets = hashSpace;
+            this.stride2Base = 0;
+        }
+        this.globalBase = hashSpace;
+    }
+
+    /**
+     * Returns which ASCII-text-density bin this probe falls into, in [0, 6);
+     * a null or empty probe maps to bin 5.  Must match the training-side
+     * {@code ConfigurableByteNgramFeatureExtractor.asciiDensityBin}.
+     *
+     * <p>Bin layout (fraction of bytes that are ASCII-text: printable
+     * {@code 0x20..0x7E} plus {@code 0x09 0x0A 0x0D}):</p>
+     * <ul>
+     *   <li>0: [0.00, 0.10)</li>
+     *   <li>1: [0.10, 0.50)</li>
+     *   <li>2: [0.50, 0.80)</li>
+     *   <li>3: [0.80, 0.95)</li>
+     *   <li>4: [0.95, 0.99)</li>
+     *   <li>5: [0.99, 1.00]</li>
+     * </ul>
+     */
+    public static int asciiDensityBin(byte[] input) {
+        if (input == null || input.length == 0) {
+            return 5;
+        }
+        int asciiText = 0;
+        for (byte b : input) {
+            int v = b & 0xFF;
+            if ((v >= 0x20 && v <= 0x7E) || v == 0x09 || v == 0x0A || v == 
0x0D) {
+                asciiText++;
+            }
+        }
+        double p = (double) asciiText / input.length;
+        if (p < 0.10) {
+            return 0;
+        }
+        if (p < 0.50) {
+            return 1;
+        }
+        if (p < 0.80) {
+            return 2;
+        }
+        if (p < 0.95) {
+            return 3;
+        }
+        if (p < 0.99) {
+            return 4;
+        }
+        return 5;
     }
 
     @Override
@@ -166,7 +265,7 @@ public class ByteNgramFeatureExtractor implements 
FeatureExtractor<byte[]> {
 
             // Unigram
             int h = (FNV_OFFSET ^ bi) * FNV_PRIME;
-            int bkt = (h & 0x7fffffff) % numBuckets;
+            int bkt = stride1Bucket(h);
             if (dense[bkt] == 0) {
                 touched[n++] = bkt;
             }
@@ -178,7 +277,7 @@ public class ByteNgramFeatureExtractor implements 
FeatureExtractor<byte[]> {
                 // Bigram
                 h = (FNV_OFFSET ^ bi) * FNV_PRIME;
                 h = (h ^ bi1) * FNV_PRIME;
-                bkt = (h & 0x7fffffff) % numBuckets;
+                bkt = stride1Bucket(h);
                 if (dense[bkt] == 0) {
                     touched[n++] = bkt;
                 }
@@ -193,7 +292,16 @@ public class ByteNgramFeatureExtractor implements 
FeatureExtractor<byte[]> {
             int b1 = input[i + 1] & 0xFF;
             int h = (FNV_STRIDE2_SALT ^ b0) * FNV_PRIME;
             h = (h ^ b1) * FNV_PRIME;
-            int bkt = (h & 0x7fffffff) % numBuckets;
+            int bkt = stride2Bucket(h);
+            if (dense[bkt] == 0) {
+                touched[n++] = bkt;
+            }
+            dense[bkt]++;
+        }
+
+        // Global features: fire exactly one ASCII-density bin.
+        if (useGlobalFeatures) {
+            int bkt = globalBase + asciiDensityBin(input);
             if (dense[bkt] == 0) {
                 touched[n++] = bkt;
             }
@@ -212,7 +320,7 @@ public class ByteNgramFeatureExtractor implements 
FeatureExtractor<byte[]> {
             }
 
             // Unigram
-            counts[bucket((FNV_OFFSET ^ bi) * FNV_PRIME)]++;
+            counts[stride1Bucket((FNV_OFFSET ^ bi) * FNV_PRIME)]++;
 
             if (i + 1 < to) {
                 int bi1 = b[i + 1] & 0xFF;
@@ -220,7 +328,7 @@ public class ByteNgramFeatureExtractor implements 
FeatureExtractor<byte[]> {
                 // Bigram
                 int h = (FNV_OFFSET ^ bi) * FNV_PRIME;
                 h = (h ^ bi1) * FNV_PRIME;
-                counts[bucket(h)]++;
+                counts[stride1Bucket(h)]++;
             }
         }
 
@@ -230,12 +338,23 @@ public class ByteNgramFeatureExtractor implements 
FeatureExtractor<byte[]> {
             int b1 = b[i + 1] & 0xFF;
             int h = (FNV_STRIDE2_SALT ^ b0) * FNV_PRIME;
             h = (h ^ b1) * FNV_PRIME;
-            counts[bucket(h)]++;
+            counts[stride2Bucket(h)]++;
         }
+
+        // Global features: fire exactly one ASCII-density bin.
+        if (useGlobalFeatures) {
+            byte[] slice = (from == 0 && to == b.length)
+                    ? b : java.util.Arrays.copyOfRange(b, from, to);
+            counts[globalBase + asciiDensityBin(slice)]++;
+        }
+    }
+
+    private int stride1Bucket(int hash) {
+        return (hash & 0x7fffffff) % stride1Buckets;
     }
 
-    private int bucket(int hash) {
-        return (hash & 0x7fffffff) % numBuckets;
+    private int stride2Bucket(int hash) {
+        return stride2Base + (hash & 0x7fffffff) % stride2Buckets;
     }
 
     @Override
diff --git 
a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BucketCollisionAudit.java
 
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BucketCollisionAudit.java
new file mode 100644
index 0000000000..35a9fcd5cf
--- /dev/null
+++ 
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BucketCollisionAudit.java
@@ -0,0 +1,459 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.chardetect.tools;
+
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+
+import org.apache.tika.ml.LinearModel;
+import org.apache.tika.ml.chardetect.ByteNgramFeatureExtractor;
+
+/**
+ * Audits hash-bucket collisions for the shipped feature extractor in its
+ * legacy layout (shared stride-1/stride-2 hash space, no global features).
+ * For a given probe, shows which n-grams fired the hottest buckets, and for
+ * each such bucket prints its IBM852 / windows-1256 / windows-1250 weights
+ * and up to 8 OTHER n-grams that would hash to the same bucket.
+ *
+ * <p>Usage:
+ * <pre>
+ *   java BucketCollisionAudit --probe &lt;file&gt; [--model &lt;path&gt;]
+ *                             [--max-probe-bytes N] [--top N]
+ * </pre>
+ *
+ * <p>Uses the exact FNV constants from {@link ByteNgramFeatureExtractor}.
+ * Enumerates four feature families:
+ * <ul>
+ *   <li>Unigrams — one byte in 0x80..0xFF (128 entries)</li>
+ *   <li>Bigrams — high byte then any byte (128 * 256 = 32,768 entries)</li>
+ *   <li>Anchored bigrams — one salt, (low-trail, any) byte pairs
+ *       (128 * 256 = 32,768 entries plus 128 no-next variants; only after a high byte)</li>
+ *   <li>Stride-2 bigrams — (any, any) at even positions (256 * 256 = 65,536 
entries)</li>
+ * </ul>
+ */
+public final class BucketCollisionAudit {
+
+    private static final int FNV_PRIME        = 0x01000193;
+    private static final int FNV_OFFSET       = 0x811c9dc5;
+    private static final int FNV_ANCHOR_SALT  = 0x27d4eb2f;
+    private static final int FNV_STRIDE2_SALT = 0x9e3779b9;
+
+    private BucketCollisionAudit() {
+    }
+
+    public static void main(String[] args) throws Exception {
+        Path probePath = null;
+        Path modelPath = null;
+        int maxProbeBytes = 32 * 1024;
+        int topBuckets = 20;
+
+        for (int i = 0; i < args.length; i++) {
+            switch (args[i]) {
+                case "--probe":
+                    probePath = Paths.get(args[++i]);
+                    break;
+                case "--model":
+                    modelPath = Paths.get(args[++i]);
+                    break;
+                case "--max-probe-bytes":
+                    maxProbeBytes = Integer.parseInt(args[++i]);
+                    break;
+                case "--top":
+                    topBuckets = Integer.parseInt(args[++i]);
+                    break;
+                default:
+                    System.err.println("Unknown arg: " + args[i]);
+                    System.exit(1);
+            }
+        }
+        if (probePath == null) {
+            System.err.println("Usage: BucketCollisionAudit --probe <file> 
[--model <path>] "
+                    + "[--max-probe-bytes N] [--top N]");
+            System.exit(1);
+        }
+
+        LinearModel model = loadModel(modelPath);
+        int numBuckets = model.getNumBuckets();
+        ByteNgramFeatureExtractor extractor = new 
ByteNgramFeatureExtractor(numBuckets);
+
+        // Pre-build inverse map: bucket -> list of n-grams that hash to it.
+        System.out.printf(Locale.ROOT,
+                "Building inverse bucket map over %,d buckets (can take a few 
seconds)...%n",
+                numBuckets);
+        List<Ngram>[] inverse = buildInverseMap(numBuckets);
+
+        // Collision-rate summary.
+        int maxSize = 0;
+        long totalNgrams = 0;
+        int populated = 0;
+        for (List<Ngram> l : inverse) {
+            if (l == null || l.isEmpty()) {
+                continue;
+            }
+            populated++;
+            totalNgrams += l.size();
+            if (l.size() > maxSize) {
+                maxSize = l.size();
+            }
+        }
+        double avg = populated > 0 ? (double) totalNgrams / populated : 0;
+        System.out.printf(Locale.ROOT,
+                "n-grams enumerated: %,d   populated buckets: %,d / %,d 
(%.1f%%)   "
+                        + "avg n-grams/bucket: %.2f   max: %d%n%n",
+                totalNgrams, populated, numBuckets,
+                100.0 * populated / numBuckets, avg, maxSize);
+
+        // Load probe, extract features.
+        byte[] all = Files.readAllBytes(probePath);
+        byte[] probe = all.length <= maxProbeBytes ? all : Arrays.copyOf(all, 
maxProbeBytes);
+        int[] features = extractor.extract(probe);
+
+        int nnz = 0;
+        for (int v : features) {
+            if (v != 0) {
+                nnz++;
+            }
+        }
+        System.out.printf(Locale.ROOT,
+                "Probe %s: %,d bytes (probe: %,d), %,d active buckets%n%n",
+                probePath, all.length, probe.length, nnz);
+
+        // For the top-N hottest buckets (by count), show which of this probe's
+        // n-grams fired them, and list every OTHER n-gram that hashes to the
+        // same bucket.
+        Integer[] order = new Integer[numBuckets];
+        for (int i = 0; i < numBuckets; i++) {
+            order[i] = i;
+        }
+        Arrays.sort(order, Comparator.comparingInt((Integer i) -> 
-features[i]));
+
+        // Compute which distinct n-grams from THIS probe fired each bucket.
+        Map<Integer, List<Ngram>> probeFirings = new LinkedHashMap<>();
+        enumerateProbeFirings(probe, numBuckets, probeFirings);
+
+        byte[][] weights = model.getWeights();
+        float[] scales = model.getScales();
+        String[] labels = model.getLabels();
+
+        int ibm852 = indexOf(labels, "IBM852");
+        int win1256 = indexOf(labels, "windows-1256");
+        int win1250 = indexOf(labels, "windows-1250");
+
+        System.out.printf(Locale.ROOT, "Top-%d hottest buckets on this 
probe:%n", topBuckets);
+        
System.out.println("====================================================================");
+        int shown = 0;
+        for (int rank = 0; rank < numBuckets && shown < topBuckets; rank++) {
+            int b = order[rank];
+            if (features[b] == 0) {
+                break;
+            }
+            shown++;
+            String ibm852Col = col(weights, scales, b, ibm852, features[b]);
+            String win1256Col = col(weights, scales, b, win1256, features[b]);
+            String win1250Col = col(weights, scales, b, win1250, features[b]);
+            System.out.printf(Locale.ROOT,
+                    "Bucket %5d   count %3d   IBM852:%s   win-1256:%s   
win-1250:%s%n",
+                    b, features[b], ibm852Col, win1256Col, win1250Col);
+            List<Ngram> fired = probeFirings.getOrDefault(b, new 
ArrayList<>());
+            List<Ngram> allHere = inverse[b];
+            System.out.printf(Locale.ROOT,
+                    "  fired by probe (%d distinct ngram kinds):%n", 
fired.size());
+            for (Ngram ng : fired) {
+                System.out.println("    " + ng.describe());
+            }
+            System.out.printf(Locale.ROOT,
+                    "  other n-grams colliding into this bucket (%d total):%n",
+                    allHere == null ? 0 : allHere.size() - fired.size());
+            if (allHere != null) {
+                int samples = 0;
+                for (Ngram ng : allHere) {
+                    if (containsSame(fired, ng)) {
+                        continue;
+                    }
+                    if (samples++ >= 8) {
+                        break;
+                    }
+                    System.out.println("    " + ng.describe());
+                }
+            }
+            System.out.println();
+        }
+    }
+
+    private static String col(byte[][] weights, float[] scales, int bucket,
+                              int cls, int count) {
+        if (cls < 0) {
+            return "(n/a)";
+        }
+        int w = weights[cls][bucket];
+        float raw = scales[cls] * w * count;
+        return String.format(Locale.ROOT, "w=%+4d raw=%+7.1f", w, raw);
+    }
+
+    private static int indexOf(String[] labels, String target) {
+        for (int i = 0; i < labels.length; i++) {
+            if (labels[i].equalsIgnoreCase(target)) {
+                return i;
+            }
+        }
+        return -1;
+    }
+
+    private static boolean containsSame(List<Ngram> list, Ngram ng) {
+        for (Ngram o : list) {
+            if (o.equalsNgram(ng)) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    private static LinearModel loadModel(Path modelPath) throws Exception {
+        if (modelPath != null) {
+            return LinearModel.loadFromPath(modelPath);
+        }
+        String res = 
"/org/apache/tika/ml/chardetect/chardetect-v6-no-utf32.bin";
+        try (InputStream is = 
BucketCollisionAudit.class.getResourceAsStream(res)) {
+            if (is == null) {
+                throw new IllegalStateException("default model resource not 
found: " + res);
+            }
+            return LinearModel.load(is);
+        }
+    }
+
+    // ----------------------------------------------------------------------
+    // N-gram enumeration and hashing
+    // ----------------------------------------------------------------------
+
+    private static int bucket(int hash, int numBuckets) {
+        return (hash & 0x7fffffff) % numBuckets;
+    }
+
+    private static int hashUnigram(int bi) {
+        return (FNV_OFFSET ^ bi) * FNV_PRIME;
+    }
+
+    private static int hashBigram(int bi, int bi1) {
+        int h = (FNV_OFFSET ^ bi) * FNV_PRIME;
+        return (h ^ bi1) * FNV_PRIME;
+    }
+
+    private static int hashAnchored(int lowTrail, int next) {
+        int h = (FNV_ANCHOR_SALT ^ lowTrail) * FNV_PRIME;
+        return (h ^ next) * FNV_PRIME;
+    }
+
+    private static int hashAnchoredNoTrail(int lowTrail) {
+        // When the low-trail is the last byte in the probe, anchored bigram
+        // has no 'next' — the extractor emits just the hash seeded with 
lowTrail.
+        return (FNV_ANCHOR_SALT ^ lowTrail) * FNV_PRIME;
+    }
+
+    private static int hashStride2(int b0, int b1) {
+        int h = (FNV_STRIDE2_SALT ^ b0) * FNV_PRIME;
+        return (h ^ b1) * FNV_PRIME;
+    }
+
+    @SuppressWarnings("unchecked")
+    private static List<Ngram>[] buildInverseMap(int numBuckets) {
+        List<Ngram>[] inverse = new List[numBuckets];
+
+        // Unigrams: high bytes only.
+        for (int bi = 0x80; bi < 0x100; bi++) {
+            add(inverse, bucket(hashUnigram(bi), numBuckets), 
Ngram.unigram(bi));
+        }
+        // Bigrams: (high, any).
+        for (int bi = 0x80; bi < 0x100; bi++) {
+            for (int bi1 = 0; bi1 < 0x100; bi1++) {
+                add(inverse, bucket(hashBigram(bi, bi1), numBuckets),
+                        Ngram.bigram(bi, bi1));
+            }
+        }
+        // Anchored: (low-trail, any) — only fires when preceded by a high 
byte.
+        // Hash doesn't include the precursor; two variants depending on 
whether
+        // a 'next' byte exists.
+        for (int bi1 = 0; bi1 < 0x80; bi1++) {
+            add(inverse, bucket(hashAnchoredNoTrail(bi1), numBuckets),
+                    Ngram.anchoredNoNext(bi1));
+            for (int bi2 = 0; bi2 < 0x100; bi2++) {
+                add(inverse, bucket(hashAnchored(bi1, bi2), numBuckets),
+                        Ngram.anchored(bi1, bi2));
+            }
+        }
+        // Stride-2: (any, any).
+        for (int b0 = 0; b0 < 0x100; b0++) {
+            for (int b1 = 0; b1 < 0x100; b1++) {
+                add(inverse, bucket(hashStride2(b0, b1), numBuckets),
+                        Ngram.stride2(b0, b1));
+            }
+        }
+        return inverse;
+    }
+
+    private static void add(List<Ngram>[] inv, int b, Ngram ng) {
+        if (inv[b] == null) {
+            inv[b] = new ArrayList<>();
+        }
+        inv[b].add(ng);
+    }
+
+    /**
+     * For a given probe, walk the exact same emission logic as
+     * {@link ByteNgramFeatureExtractor#extractSparseInto} and record, per
+     * bucket, which n-gram(s) fired it.  This is needed because the
+     * inverse map gives us the universe of potentially-colliding n-grams,
+     * and we want to separate "this probe fired it via X" from
+     * "X' is a colliding peer that didn't fire here."
+     */
+    private static void enumerateProbeFirings(byte[] input, int numBuckets,
+                                              Map<Integer, List<Ngram>> 
firings) {
+        // Stride-1
+        for (int i = 0; i < input.length; i++) {
+            int bi = input[i] & 0xFF;
+            if (bi < 0x80) {
+                continue;
+            }
+            addFiring(firings, bucket(hashUnigram(bi), numBuckets), 
Ngram.unigram(bi));
+            if (i + 1 < input.length) {
+                int bi1 = input[i + 1] & 0xFF;
+                addFiring(firings, bucket(hashBigram(bi, bi1), numBuckets),
+                        Ngram.bigram(bi, bi1));
+                if (bi1 < 0x80) {
+                    if (i + 2 < input.length) {
+                        int bi2 = input[i + 2] & 0xFF;
+                        addFiring(firings, bucket(hashAnchored(bi1, bi2), 
numBuckets),
+                                Ngram.anchored(bi1, bi2));
+                    } else {
+                        addFiring(firings, bucket(hashAnchoredNoTrail(bi1), 
numBuckets),
+                                Ngram.anchoredNoNext(bi1));
+                    }
+                }
+            }
+        }
+        // Stride-2
+        for (int i = 0; i + 1 < input.length; i += 2) {
+            int b0 = input[i] & 0xFF;
+            int b1 = input[i + 1] & 0xFF;
+            addFiring(firings, bucket(hashStride2(b0, b1), numBuckets),
+                    Ngram.stride2(b0, b1));
+        }
+    }
+
+    private static void addFiring(Map<Integer, List<Ngram>> firings, int b, 
Ngram ng) {
+        List<Ngram> list = firings.computeIfAbsent(b, k -> new ArrayList<>());
+        for (Ngram o : list) {
+            if (o.equalsNgram(ng)) {
+                return;
+            }
+        }
+        list.add(ng);
+    }
+
+    private static final class Ngram {
+        final char kind;  // 'U' 'B' 'A' 'a' (anchored-no-next) 'S'
+        final int a;
+        final int b;
+
+        Ngram(char kind, int a, int b) {
+            this.kind = kind;
+            this.a = a;
+            this.b = b;
+        }
+
+        static Ngram unigram(int bi) {
+            return new Ngram('U', bi, -1);
+        }
+
+        static Ngram bigram(int bi, int bi1) {
+            return new Ngram('B', bi, bi1);
+        }
+
+        static Ngram anchored(int low, int next) {
+            return new Ngram('A', low, next);
+        }
+
+        static Ngram anchoredNoNext(int low) {
+            return new Ngram('a', low, -1);
+        }
+
+        static Ngram stride2(int b0, int b1) {
+            return new Ngram('S', b0, b1);
+        }
+
+        boolean equalsNgram(Ngram o) {
+            return kind == o.kind && a == o.a && b == o.b;
+        }
+
+        String describe() {
+            switch (kind) {
+                case 'U':
+                    return String.format(Locale.ROOT, "UNIGRAM  0x%02X       
(%s)",
+                            a, letterHint(a));
+                case 'B':
+                    return String.format(Locale.ROOT, "BIGRAM   0x%02X 0x%02X 
(%s, %s)",
+                            a, b, letterHint(a), letterHint(b));
+                case 'A':
+                    return String.format(Locale.ROOT, "ANCHORED 0x%02X 0x%02X 
(%s after high byte)",
+                            a, b, asciiHint(a));
+                case 'a':
+                    return String.format(Locale.ROOT, "ANCHOR-L 0x%02X       
(%s at end after high byte)",
+                            a, asciiHint(a));
+                case 'S':
+                    return String.format(Locale.ROOT, "STRIDE2  0x%02X 0x%02X",
+                            a, b);
+                default:
+                    return "?";
+            }
+        }
+
+        private static String letterHint(int v) {
+            if (v < 0x80) {
+                return asciiHint(v);
+            }
+            if (v == 0xC7) return "alef[1256]/Ă[852]";
+            if (v == 0xE1) return "lam[1256]/ß[852]";
+            if (v == 0xE3) return "meem[1256]/Ń[852]";
+            if (v == 0xCA) return "teh[1256]/╩[852]";
+            if (v == 0xD1) return "reh[1256]/Đ[852]";
+            if (v == 0xED) return "yeh[1256]/ý[852]";
+            if (v == 0xE7) return "ain[1256]/š[852]";
+            if (v == 0xCF) return "ithal[1256]/¤[852]";
+            if (v == 0xE4) return "nun[1256]/ń[852]";
+            if (v == 0xE6) return "waw[1256]/Š[852]";
+            if (v == 0xE9) return "yeh[1256]/Ú[852]";
+            if (v == 0xF4) return "fathaton[1256]/─[852]";
+            return String.format(Locale.ROOT, "hi-%02X", v);
+        }
+
+        private static String asciiHint(int v) {
+            if (v == 0x20) return "SP";
+            if (v == 0x0A) return "LF";
+            if (v == 0x0D) return "CR";
+            if (v >= 0x21 && v <= 0x7E) return "'" + ((char) v) + "'";
+            return String.format(Locale.ROOT, "\\x%02X", v);
+        }
+    }
+}
diff --git 
a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java
 
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java
index c2659396d2..88469abab9 100644
--- 
a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java
+++ 
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java
@@ -65,16 +65,20 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
     public static final int GLOBAL_FEATURE_COUNT = 6;
 
     private final int numBuckets;
-    private final int hashBuckets;
+    private final int stride1Buckets;    // size of the stride-1 hash region
+    private final int stride2Buckets;    // size of the stride-2 hash region
+    private final int stride2Base;       // first slot of the stride-2 region
+    private final int globalBase;        // first slot of the globals region 
(or numBuckets if disabled)
     private final boolean useUnigrams;
     private final boolean useBigrams;
     private final boolean useTrigrams;
     private final boolean useAnchoredBigrams;
     private final boolean useStride2Bigrams;
     private final boolean useGlobalFeatures;
+    private final boolean useSplitSpaces;
 
     /**
-     * Backwards-compatible constructor (no global features).
+     * Backwards-compatible constructor (no global features, no split spaces).
      */
     public ConfigurableByteNgramFeatureExtractor(int numBuckets,
                                                  boolean useUnigrams,
@@ -86,13 +90,31 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
                 useAnchoredBigrams, useStride2Bigrams, false);
     }
 
+    /**
+     * Constructor with globals support, shared hash space (stride-1 and 
stride-2
+     * mod into the same bucket range).
+     */
+    public ConfigurableByteNgramFeatureExtractor(int numBuckets,
+                                                 boolean useUnigrams,
+                                                 boolean useBigrams,
+                                                 boolean useTrigrams,
+                                                 boolean useAnchoredBigrams,
+                                                 boolean useStride2Bigrams,
+                                                 boolean useGlobalFeatures) {
+        this(numBuckets, useUnigrams, useBigrams, useTrigrams,
+                useAnchoredBigrams, useStride2Bigrams, useGlobalFeatures, 
false);
+    }
+
     /**
      * @param numBuckets         total feature-vector dimension.  When
      *                           {@code useGlobalFeatures} is {@code true}, the
      *                           last {@link #GLOBAL_FEATURE_COUNT} slots are
-     *                           reserved for global features and hashed n-gram
-     *                           features mod into the first
-     *                           {@code numBuckets - GLOBAL_FEATURE_COUNT} 
slots.
+     *                           reserved for global features.  When
+     *                           {@code useSplitSpaces} is {@code true}, the
+     *                           remaining hash space is split 50/50 between
+     *                           stride-1 features and stride-2 features so
+     *                           HTML-shaped stride-2 emissions cannot collide
+     *                           with single-byte-charset stride-1 weights.
      * @param useUnigrams        emit unigram for each high byte
      * @param useBigrams         emit bigram anchored on each high byte
      * @param useTrigrams        emit trigram anchored on each high byte
@@ -100,6 +122,8 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
      * @param useStride2Bigrams  emit stride-2 bigrams at even positions (all 
bytes)
      * @param useGlobalFeatures  emit whole-probe global features into the
      *                           reserved tail slots (ASCII-density bins)
+     * @param useSplitSpaces     give stride-1 and stride-2 features disjoint
+     *                           bucket ranges
      */
     public ConfigurableByteNgramFeatureExtractor(int numBuckets,
                                                  boolean useUnigrams,
@@ -107,17 +131,37 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
                                                  boolean useTrigrams,
                                                  boolean useAnchoredBigrams,
                                                  boolean useStride2Bigrams,
-                                                 boolean useGlobalFeatures) {
+                                                 boolean useGlobalFeatures,
+                                                 boolean useSplitSpaces) {
         if (numBuckets <= 0) {
             throw new IllegalArgumentException("numBuckets must be positive: " 
+ numBuckets);
         }
-        if (useGlobalFeatures && numBuckets <= GLOBAL_FEATURE_COUNT) {
+        int globalsReserved = useGlobalFeatures ? GLOBAL_FEATURE_COUNT : 0;
+        int hashSpace = numBuckets - globalsReserved;
+        if (hashSpace <= 0) {
             throw new IllegalArgumentException(
                     "numBuckets must exceed GLOBAL_FEATURE_COUNT (" + 
GLOBAL_FEATURE_COUNT
                             + ") when useGlobalFeatures=true: " + numBuckets);
         }
+        if (useSplitSpaces && hashSpace < 2) {
+            throw new IllegalArgumentException(
+                    "useSplitSpaces requires hashSpace >= 2: " + hashSpace);
+        }
         this.numBuckets = numBuckets;
-        this.hashBuckets = useGlobalFeatures ? numBuckets - 
GLOBAL_FEATURE_COUNT : numBuckets;
+        this.useSplitSpaces = useSplitSpaces;
+        if (useSplitSpaces) {
+            // 50/50 split; stride-1 gets the first half, stride-2 gets the 
second.
+            this.stride1Buckets = hashSpace / 2;
+            this.stride2Buckets = hashSpace - this.stride1Buckets;
+            this.stride2Base = this.stride1Buckets;
+        } else {
+            // Both stride families share the same hash region [0, hashSpace).
+            this.stride1Buckets = hashSpace;
+            this.stride2Buckets = hashSpace;
+            this.stride2Base = 0;
+        }
+        // Globals region always starts immediately after the hash region(s).
+        this.globalBase = hashSpace;
         this.useUnigrams = useUnigrams;
         this.useBigrams = useBigrams;
         this.useTrigrams = useTrigrams;
@@ -211,7 +255,7 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
 
             if (useUnigrams) {
                 int h = (FNV_OFFSET ^ bi) * FNV_PRIME;
-                int bkt = (h & 0x7fffffff) % hashBuckets;
+                int bkt = stride1Bucket(h);
                 if (dense[bkt] == 0) {
                     touched[n++] = bkt;
                 }
@@ -224,7 +268,7 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
                 if (useBigrams) {
                     int h = (FNV_OFFSET ^ bi) * FNV_PRIME;
                     h = (h ^ bi1) * FNV_PRIME;
-                    int bkt = (h & 0x7fffffff) % hashBuckets;
+                    int bkt = stride1Bucket(h);
                     if (dense[bkt] == 0) {
                         touched[n++] = bkt;
                     }
@@ -236,7 +280,7 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
                     if (i + 2 < input.length) {
                         h = (h ^ (input[i + 2] & 0xFF)) * FNV_PRIME;
                     }
-                    int bkt = (h & 0x7fffffff) % hashBuckets;
+                    int bkt = stride1Bucket(h);
                     if (dense[bkt] == 0) {
                         touched[n++] = bkt;
                     }
@@ -248,7 +292,7 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
                     int h = (FNV_OFFSET ^ bi) * FNV_PRIME;
                     h = (h ^ bi1) * FNV_PRIME;
                     h = (h ^ bi2) * FNV_PRIME;
-                    int bkt = (h & 0x7fffffff) % hashBuckets;
+                    int bkt = stride1Bucket(h);
                     if (dense[bkt] == 0) {
                         touched[n++] = bkt;
                     }
@@ -264,7 +308,7 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
                 int b1 = input[i + 1] & 0xFF;
                 int h = (FNV_STRIDE2_SALT ^ b0) * FNV_PRIME;
                 h = (h ^ b1) * FNV_PRIME;
-                int bkt = (h & 0x7fffffff) % hashBuckets;
+                int bkt = stride2Bucket(h);
                 if (dense[bkt] == 0) {
                     touched[n++] = bkt;
                 }
@@ -274,7 +318,7 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
 
         // Global features at reserved tail slots: fire exactly one 
ASCII-density bin.
         if (useGlobalFeatures) {
-            int bkt = hashBuckets + asciiDensityBin(input);
+            int bkt = globalBase + asciiDensityBin(input);
             if (dense[bkt] == 0) {
                 touched[n++] = bkt;
             }
@@ -293,7 +337,7 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
             }
 
             if (useUnigrams) {
-                counts[bucket((FNV_OFFSET ^ bi) * FNV_PRIME)]++;
+                counts[stride1Bucket((FNV_OFFSET ^ bi) * FNV_PRIME)]++;
             }
 
             if (i + 1 < to) {
@@ -302,7 +346,7 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
                 if (useBigrams) {
                     int h = (FNV_OFFSET ^ bi) * FNV_PRIME;
                     h = (h ^ bi1) * FNV_PRIME;
-                    counts[bucket(h)]++;
+                    counts[stride1Bucket(h)]++;
                 }
 
                 if (useAnchoredBigrams && bi1 < 0x80) {
@@ -310,7 +354,7 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
                     if (i + 2 < to) {
                         h = (h ^ (b[i + 2] & 0xFF)) * FNV_PRIME;
                     }
-                    counts[bucket(h)]++;
+                    counts[stride1Bucket(h)]++;
                 }
 
                 if (useTrigrams && i + 2 < to) {
@@ -318,7 +362,7 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
                     int h = (FNV_OFFSET ^ bi) * FNV_PRIME;
                     h = (h ^ bi1) * FNV_PRIME;
                     h = (h ^ bi2) * FNV_PRIME;
-                    counts[bucket(h)]++;
+                    counts[stride1Bucket(h)]++;
                 }
             }
         }
@@ -330,7 +374,7 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
                 int b1 = b[i + 1] & 0xFF;
                 int h = (FNV_STRIDE2_SALT ^ b0) * FNV_PRIME;
                 h = (h ^ b1) * FNV_PRIME;
-                counts[bucket(h)]++;
+                counts[stride2Bucket(h)]++;
             }
         }
 
@@ -338,12 +382,16 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
         if (useGlobalFeatures) {
             byte[] slice = (from == 0 && to == b.length)
                     ? b : java.util.Arrays.copyOfRange(b, from, to);
-            counts[hashBuckets + asciiDensityBin(slice)]++;
+            counts[globalBase + asciiDensityBin(slice)]++;
         }
     }
 
-    private int bucket(int hash) {
-        return (hash & 0x7fffffff) % hashBuckets;
+    private int stride1Bucket(int hash) {
+        return (hash & 0x7fffffff) % stride1Buckets;
+    }
+
+    private int stride2Bucket(int hash) {
+        return stride2Base + (hash & 0x7fffffff) % stride2Buckets;
     }
 
     @Override
@@ -351,11 +399,18 @@ public class ConfigurableByteNgramFeatureExtractor 
implements FeatureExtractor<b
         return numBuckets;
     }
 
+    /**
+     * @return the {@code useSplitSpaces} flag this extractor was constructed with,
+     *         i.e. whether stride-1 and stride-2 features use disjoint bucket ranges
+     */
+    public boolean isUseSplitSpaces() {
+        return useSplitSpaces;
+    }
+
     @Override
     public String toString() {
         return String.format(java.util.Locale.ROOT,
-                "ConfigurableByteNgramFeatureExtractor{buckets=%d, hash=%d, 
uni=%b, bi=%b, tri=%b, anchored=%b, stride2=%b, globals=%b}",
-                numBuckets, hashBuckets, useUnigrams, useBigrams, useTrigrams,
-                useAnchoredBigrams, useStride2Bigrams, useGlobalFeatures);
+                "ConfigurableByteNgramFeatureExtractor{buckets=%d, 
stride1=[0,%d) stride2=[%d,%d) globals=[%d,%d)"
+                        + " uni=%b, bi=%b, tri=%b, anchored=%b, stride2f=%b, 
globalsf=%b, split=%b}",
+                numBuckets, stride1Buckets, stride2Base, stride2Base + 
stride2Buckets,
+                globalBase, numBuckets,
+                useUnigrams, useBigrams, useTrigrams, useAnchoredBigrams,
+                useStride2Bigrams, useGlobalFeatures, useSplitSpaces);
     }
 }
diff --git 
a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TraceCharsetLogits.java
 
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TraceCharsetLogits.java
new file mode 100644
index 0000000000..dfe13b3ade
--- /dev/null
+++ 
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TraceCharsetLogits.java
@@ -0,0 +1,380 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.tika.ml.chardetect.tools;
+
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.List;
+import java.util.Locale;
+
+import org.apache.tika.ml.FeatureExtractor;
+import org.apache.tika.ml.LinearModel;
+import org.apache.tika.ml.chardetect.ByteNgramFeatureExtractor;
+
+/**
+ * Forensic trace for a single probe: top-15 raw logits, per-class bucket
+ * contribution breakdown, and probe statistics.  Helps diagnose cases where
+ * the model is confidently wrong (e.g. the Arabic-vs-IBM852 rank-15 case).
+ *
+ * <p>Usage:
+ * <pre>
+ *   java TraceCharsetLogits --probe &lt;file&gt; [--model &lt;path&gt;]
+ *                           [--focus label1,label2,...] [--top-buckets N]
+ *                           [--max-probe-bytes N] [--no-stride2]
+ * </pre>
+ */
+public final class TraceCharsetLogits {
+
+    /** Not instantiable: this class only exposes the static {@link #main} entry point. */
+    private TraceCharsetLogits() {
+    }
+
+    /**
+     * Command-line entry point.  Parses the options, loads the model, extracts
+     * features from at most {@code --max-probe-bytes} leading bytes of the probe
+     * file, and prints: probe statistics, the top-15 raw logits, a per-bucket
+     * contribution breakdown for the top-1 class and every {@code --focus} class,
+     * and a head-to-head bucket comparison of the first two of those classes.
+     *
+     * <p>Exits with status 1 on an unknown option, an option that is missing its
+     * value, or a missing {@code --probe}.
+     *
+     * @param args command-line options; {@code --probe} is required
+     * @throws Exception if the probe file or the model cannot be read
+     */
+    public static void main(String[] args) throws Exception {
+        Path probePath = null;
+        Path modelPath = null;
+        List<String> focus = new ArrayList<>();
+        int topBuckets = 20;
+        int maxProbeBytes = 32 * 1024;
+        boolean noStride2 = false;
+
+        for (int i = 0; i < args.length; i++) {
+            switch (args[i]) {
+                case "--probe":
+                    probePath = Paths.get(optionValue(args, i));
+                    i++;
+                    break;
+                case "--model":
+                    modelPath = Paths.get(optionValue(args, i));
+                    i++;
+                    break;
+                case "--focus":
+                    for (String s : optionValue(args, i).split(",")) {
+                        focus.add(s.trim());
+                    }
+                    i++;
+                    break;
+                case "--top-buckets":
+                    topBuckets = Integer.parseInt(optionValue(args, i));
+                    i++;
+                    break;
+                case "--max-probe-bytes":
+                    maxProbeBytes = Integer.parseInt(optionValue(args, i));
+                    i++;
+                    break;
+                case "--no-stride2":
+                    noStride2 = true;
+                    break;
+                default:
+                    System.err.println("Unknown arg: " + args[i]);
+                    System.exit(1);
+            }
+        }
+        if (probePath == null) {
+            System.err.println("Usage: TraceCharsetLogits --probe <file> [--model <path>] "
+                    + "[--focus <label1,label2,...>] [--top-buckets N] [--max-probe-bytes N] "
+                    + "[--no-stride2]");
+            System.exit(1);
+        }
+
+        LinearModel model = loadModel(modelPath);
+        FeatureExtractor<byte[]> extractor = noStride2
+                // Production flags minus stride-2, matching FeatureExtractorParityTest
+                // for the stride-1 features (uni + bi, no trigrams, no anchored).
+                ? new ConfigurableByteNgramFeatureExtractor(model.getNumBuckets(),
+                        true, true, false, false, false)
+                : new ByteNgramFeatureExtractor(model.getNumBuckets());
+        if (noStride2) {
+            System.out.println("Stride-2 features suppressed for this run.");
+        }
+
+        // Only the leading maxProbeBytes bytes of the file are traced.
+        byte[] allBytes = Files.readAllBytes(probePath);
+        byte[] probe = allBytes.length <= maxProbeBytes
+                ? allBytes
+                : Arrays.copyOf(allBytes, maxProbeBytes);
+
+        printProbeStats(probePath, allBytes.length, probe);
+
+        int[] features = extractor.extract(probe);
+        float[] logits = model.predictLogits(features);
+
+        String[] labels = model.getLabels();
+        int numClasses = labels.length;
+
+        // Top-15 by raw logit (class indices sorted by descending logit)
+        Integer[] order = new Integer[numClasses];
+        for (int i = 0; i < numClasses; i++) {
+            order[i] = i;
+        }
+        Arrays.sort(order, Comparator.comparingDouble((Integer i) -> -logits[i]));
+
+        System.out.println();
+        System.out.println("Top-15 raw logits:");
+        System.out.println("  rank  label                     logit       gap-from-top");
+        float topLogit = logits[order[0]];
+        for (int r = 0; r < Math.min(15, numClasses); r++) {
+            int c = order[r];
+            System.out.printf(Locale.ROOT,
+                    "  %3d   %-24s  %10.1f  %+10.1f%n",
+                    r + 1, labels[c], logits[c], logits[c] - topLogit);
+        }
+
+        // Per-class bucket contribution breakdown for top-1 and any --focus classes
+        List<String> forensic = new ArrayList<>();
+        forensic.add(labels[order[0]]);
+        for (String f : focus) {
+            if (!forensic.contains(f)) {
+                forensic.add(f);
+            }
+        }
+
+        byte[][] weights = model.getWeights();
+        float[] scales = model.getScales();
+        float[] biases = model.getBiases();
+        int numBuckets = model.getNumBuckets();
+
+        // Per-bucket clip bound; depends only on the probe, so compute it once.
+        // NOTE(review): presumably mirrors the clipping done inside
+        // LinearModel.predictLogits -- confirm against that class.
+        float clip = 1.5f * (float) Math.sqrt(nnz(features));
+
+        for (String label : forensic) {
+            int c = indexOf(labels, label);
+            if (c < 0) {
+                System.out.println();
+                System.out.println("(label '" + label + "' not in model)");
+                continue;
+            }
+            System.out.println();
+            System.out.printf(Locale.ROOT, "Per-bucket contributions for %s (class %d, bias=%.2f, scale=%.4g):%n",
+                    label, c, biases[c], scales[c]);
+
+            BucketContrib[] contribs = new BucketContrib[numBuckets];
+            int nContribs = 0;
+            for (int b = 0; b < numBuckets; b++) {
+                if (features[b] == 0) {
+                    continue;
+                }
+                float raw = scales[c] * weights[c][b] * features[b];
+                float clipped = Math.max(-clip, Math.min(clip, raw));
+                contribs[nContribs++] = new BucketContrib(b, features[b], weights[c][b],
+                        raw, clipped);
+            }
+            // Largest absolute clipped contribution first.
+            BucketContrib[] trim = Arrays.copyOf(contribs, nContribs);
+            Arrays.sort(trim, (a, bb) -> Float.compare(Math.abs(bb.clipped), Math.abs(a.clipped)));
+
+            double sumClipped = 0, sumRaw = 0;
+            for (BucketContrib bc : trim) {
+                sumClipped += bc.clipped;
+                sumRaw += bc.raw;
+            }
+            System.out.printf(Locale.ROOT,
+                    "  active buckets: %d   sum(clipped)=%.1f   sum(raw)=%.1f   bias=%.2f   "
+                            + "logit=%.1f   clip=%.2f%n",
+                    nContribs, sumClipped, sumRaw, biases[c],
+                    sumClipped + biases[c], clip);
+
+            System.out.printf(Locale.ROOT,
+                    "  top-%d buckets by |clipped contribution|:%n", topBuckets);
+            System.out.println("    bucket    count   weight(INT8)   raw          clipped");
+            for (int k = 0; k < Math.min(topBuckets, trim.length); k++) {
+                BucketContrib bc = trim[k];
+                System.out.printf(Locale.ROOT,
+                        "    %7d   %5d   %+5d         %+10.2f  %+10.2f%n",
+                        bc.bucket, bc.count, bc.weight, bc.raw, bc.clipped);
+            }
+        }
+
+        // For any pair of focus classes (or top-1 + first focus), show shared buckets.
+        if (forensic.size() >= 2) {
+            String a = forensic.get(0);
+            String b = forensic.get(1);
+            int ca = indexOf(labels, a);
+            int cb = indexOf(labels, b);
+            if (ca >= 0 && cb >= 0) {
+                System.out.println();
+                System.out.printf(Locale.ROOT,
+                        "Head-to-head bucket comparison: %s vs %s%n", a, b);
+                System.out.println("    bucket    count   wA      wB     raw-diff   "
+                        + "(wA-wB)*scale*count ~ net logit delta for A over B");
+                float scA = scales[ca];
+                float scB = scales[cb];
+                List<BucketDiff> diffs = new ArrayList<>();
+                for (int bk = 0; bk < numBuckets; bk++) {
+                    if (features[bk] == 0) {
+                        continue;
+                    }
+                    float rawA = scA * weights[ca][bk] * features[bk];
+                    float rawB = scB * weights[cb][bk] * features[bk];
+                    float diff = rawA - rawB;
+                    diffs.add(new BucketDiff(bk, features[bk],
+                            weights[ca][bk], weights[cb][bk], rawA, rawB, diff));
+                }
+                // Largest absolute unclipped difference first.
+                diffs.sort((x, y) -> Float.compare(Math.abs(y.diff), Math.abs(x.diff)));
+                for (int k = 0; k < Math.min(topBuckets, diffs.size()); k++) {
+                    BucketDiff d = diffs.get(k);
+                    System.out.printf(Locale.ROOT,
+                            "    %7d   %5d   %+4d    %+4d    %+10.2f   %+10.2f%n",
+                            d.bucket, d.count, d.wA, d.wB, d.rawA - d.rawB, d.diff);
+                }
+            }
+        }
+    }
+
+    /**
+     * Returns the value that follows the option at {@code optionIndex}.  Prints an
+     * error and exits with status 1 when the option is the last argument, instead
+     * of failing with an {@link ArrayIndexOutOfBoundsException}.
+     */
+    private static String optionValue(String[] args, int optionIndex) {
+        if (optionIndex + 1 >= args.length) {
+            System.err.println("Missing value for " + args[optionIndex]);
+            System.exit(1);
+        }
+        return args[optionIndex + 1];
+    }
+
+    /** Counts the non-zero entries of {@code features}. */
+    private static int nnz(int[] features) {
+        int nonZero = 0;
+        for (int i = 0; i < features.length; i++) {
+            nonZero += (features[i] != 0) ? 1 : 0;
+        }
+        return nonZero;
+    }
+
+    /**
+     * Returns the position of the first label equal to {@code target}, ignoring
+     * case, or {@code -1} when no label matches.
+     */
+    private static int indexOf(String[] labels, String target) {
+        int pos = 0;
+        for (String candidate : labels) {
+            if (candidate.equalsIgnoreCase(target)) {
+                return pos;
+            }
+            pos++;
+        }
+        return -1;
+    }
+
+    /**
+     * Loads the model from {@code modelPath}, or from the bundled default
+     * resource when {@code modelPath} is {@code null}.
+     *
+     * @throws IllegalStateException if the default resource is not on the classpath
+     */
+    private static LinearModel loadModel(Path modelPath) throws Exception {
+        if (modelPath == null) {
+            // Default: the model shipped with mojibuster.
+            final String resource = "/org/apache/tika/ml/chardetect/chardetect-v6-no-utf32.bin";
+            try (InputStream in = TraceCharsetLogits.class.getResourceAsStream(resource)) {
+                if (in != null) {
+                    return LinearModel.load(in);
+                }
+                throw new IllegalStateException("default model resource not found: " + resource);
+            }
+        }
+        return LinearModel.loadFromPath(modelPath);
+    }
+
+    /**
+     * Prints byte-level statistics for the probe to standard output: file and
+     * probe size, counts and percentages of high (&gt;= 0x80), ASCII (&lt; 0x80),
+     * printable-ASCII-plus-TAB/LF/CR, C1 (0x80-0x9F) and NUL bytes, the share of
+     * high bytes falling in four value ranges, and the ten most frequent
+     * high-byte values.
+     *
+     * @param p        path of the probe file (printed only)
+     * @param fileSize size of the whole file in bytes (printed only)
+     * @param probe    the bytes actually analysed; may be empty
+     */
+    private static void printProbeStats(Path p, long fileSize, byte[] probe) {
+        int[] hist = new int[256];
+        int high = 0, c1 = 0, nul = 0, ascii = 0, asciiText = 0;
+        for (byte b : probe) {
+            int v = b & 0xFF;
+            hist[v]++;
+            if (v >= 0x80) {
+                high++;
+            }
+            if (v >= 0x80 && v < 0xA0) {
+                c1++;
+            }
+            if (v == 0) {
+                nul++;
+            }
+            if (v < 0x80) {
+                ascii++;
+            }
+            if ((v >= 0x20 && v <= 0x7E) || v == 0x09 || v == 0x0A || v == 0x0D) {
+                asciiText++;
+            }
+        }
+        // Denominator for the percentages below.  An empty probe would otherwise
+        // compute 0.0 / 0 and print "NaN%"; use 1 so it reports 0.00% instead.
+        double total = Math.max(1, probe.length);
+
+        System.out.println("Probe trace");
+        System.out.printf(Locale.ROOT, "  file         : %s%n", p);
+        System.out.printf(Locale.ROOT, "  file size    : %,d bytes (probe: %,d)%n", fileSize, probe.length);
+        System.out.printf(Locale.ROOT,
+                "  high bytes   : %,d (%.2f%%)    ASCII: %,d (%.2f%%)    ASCII-text: %,d (%.2f%%)%n",
+                high, 100.0 * high / total,
+                ascii, 100.0 * ascii / total,
+                asciiText, 100.0 * asciiText / total);
+        System.out.printf(Locale.ROOT,
+                "  C1 (0x80-9F) : %,d (%.2f%%)    NUL: %,d%n",
+                c1, 100.0 * c1 / total, nul);
+
+        // High-byte range distribution
+        int[] ranges = new int[4];  // 0x80-BF, 0xC0-DF, 0xE0-EF, 0xF0-FF
+        for (int v = 0x80; v < 0x100; v++) {
+            int bucket;
+            if (v < 0xC0) {
+                bucket = 0;
+            } else if (v < 0xE0) {
+                bucket = 1;
+            } else if (v < 0xF0) {
+                bucket = 2;
+            } else {
+                bucket = 3;
+            }
+            ranges[bucket] += hist[v];
+        }
+        int highTotal = ranges[0] + ranges[1] + ranges[2] + ranges[3];
+        if (highTotal > 0) {
+            System.out.printf(Locale.ROOT,
+                    "  high ranges  : 0x80-BF=%.1f%%   0xC0-DF=%.1f%%   0xE0-EF=%.1f%%   0xF0-FF=%.1f%%%n",
+                    100.0 * ranges[0] / highTotal,
+                    100.0 * ranges[1] / highTotal,
+                    100.0 * ranges[2] / highTotal,
+                    100.0 * ranges[3] / highTotal);
+        }
+
+        // Top 10 most frequent high-byte values.  Only the 128 high values are
+        // ranked; the sort is stable, so ties stay in ascending byte order.
+        Integer[] highValues = new Integer[0x80];
+        for (int i = 0; i < highValues.length; i++) {
+            highValues[i] = 0x80 + i;
+        }
+        Arrays.sort(highValues, (x, y) -> Integer.compare(hist[y], hist[x]));
+        StringBuilder sb = new StringBuilder("  top high bytes: ");
+        for (int k = 0; k < Math.min(10, highValues.length); k++) {
+            int v = highValues[k];
+            if (hist[v] == 0) {
+                // Sorted by descending count: nothing after this occurs either.
+                break;
+            }
+            sb.append(String.format(Locale.ROOT, "0x%02X(%d) ", v, hist[v]));
+        }
+        System.out.println(sb);
+    }
+
+    /** One active bucket's contribution to a single class's logit, as printed by {@code main}. */
+    private static final class BucketContrib {
+        final int bucket;     // index into the feature vector
+        final int count;      // feature value at that index
+        final byte weight;    // the class's weight for this bucket
+        final float raw;      // scale * weight * count, before clipping
+        final float clipped;  // raw clamped to [-clip, +clip]
+
+        BucketContrib(int bucket, int count, byte weight, float raw, float clipped) {
+            this.bucket = bucket;
+            this.count = count;
+            this.weight = weight;
+            this.raw = raw;
+            this.clipped = clipped;
+        }
+    }
+
+    /** One active bucket's unclipped contributions to two classes A and B, and their difference. */
+    private static final class BucketDiff {
+        final int bucket;   // index into the feature vector
+        final int count;    // feature value at that index
+        final byte wA;      // class A's weight for this bucket
+        final byte wB;      // class B's weight for this bucket
+        final float rawA;   // scaleA * wA * count
+        final float rawB;   // scaleB * wB * count
+        final float diff;   // rawA - rawB
+
+        BucketDiff(int bucket, int count, byte wA, byte wB, float rawA, float rawB, float diff) {
+            this.bucket = bucket;
+            this.count = count;
+            this.wA = wA;
+            this.wB = wB;
+            this.rawA = rawA;
+            this.rawB = rawB;
+            this.diff = diff;
+        }
+    }
+}
diff --git 
a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java
 
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java
index 7a38d3bce9..1e7a7e5cdf 100644
--- 
a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java
+++ 
b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java
@@ -82,6 +82,7 @@ public class TrainCharsetModel {
         boolean useAnchoredBigrams = false;
         boolean useStride2Bigrams = true;
         boolean useGlobalFeatures = false;
+        boolean useSplitSpaces = false;
         // --label-remap src1:dst1,src2:dst2 — merges multiple source labels 
into
         // one target label at training time (e.g. merge script variants into 
one class).
         Map<String, String> labelRemap = new HashMap<>();
@@ -147,6 +148,12 @@ public class TrainCharsetModel {
                 case "--no-globals":
                     useGlobalFeatures = false;
                     break;
+                case "--split-spaces":
+                    useSplitSpaces = true;
+                    break;
+                case "--no-split-spaces":
+                    useSplitSpaces = false;
+                    break;
                 case "--exclude":
                     for (String label : args[++i].split(",")) {
                         excludeLabels.add(label.trim());
@@ -172,6 +179,7 @@ public class TrainCharsetModel {
             System.err.println("  --anchored / --no-anchored  anchored bigrams 
(default: off)");
             System.err.println("  --stride2 / --no-stride2    stride-2 bigrams 
at even positions (default: on)");
             System.err.println("  --globals / --no-globals    emit global 
ASCII-density bin features (default: off)");
+            System.err.println("  --split-spaces / --no-split-spaces  give 
stride-1 and stride-2 features disjoint bucket ranges (default: off)");
             System.err.println("  --exclude cs1,cs2          skip these 
charset labels (e.g. UTF-32-BE,UTF-32-LE)");
             System.exit(1);
         }
@@ -219,14 +227,14 @@ public class TrainCharsetModel {
                 "Buckets: %d  epochs: %d  lr: %.4f  max-samples/class: %d%n",
                 numBuckets, epochs, lr, maxSamplesPerClass);
         System.out.printf(java.util.Locale.ROOT,
-                "Features: uni=%b  bi=%b  tri=%b  anchored=%b  stride2=%b  
globals=%b%n",
+                "Features: uni=%b  bi=%b  tri=%b  anchored=%b  stride2=%b  
globals=%b  split=%b%n",
                 useUnigrams, useBigrams, useTrigrams, useAnchoredBigrams, 
useStride2Bigrams,
-                useGlobalFeatures);
+                useGlobalFeatures, useSplitSpaces);
 
         ConfigurableByteNgramFeatureExtractor extractor =
                 new ConfigurableByteNgramFeatureExtractor(numBuckets,
                         useUnigrams, useBigrams, useTrigrams, 
useAnchoredBigrams,
-                        useStride2Bigrams, useGlobalFeatures);
+                        useStride2Bigrams, useGlobalFeatures, useSplitSpaces);
 
         // Build class index map
         Map<String, Integer> labelIndex = new HashMap<>();
diff --git 
a/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java
 
b/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java
index c40ef78075..3958d86d81 100644
--- 
a/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java
+++ 
b/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java
@@ -157,4 +157,77 @@ public class ConfigurableGlobalFeatureTest {
                     "bucket " + i + " differs between dense and sparse paths");
         }
     }
+
+    // --- split-space layout ---
+
+    private static final int SPLIT_NUM_BUCKETS = 32768 + 
ConfigurableByteNgramFeatureExtractor.GLOBAL_FEATURE_COUNT;
+
+    /**
+     * Builds an extractor over {@link #SPLIT_NUM_BUCKETS} with unigrams, bigrams,
+     * stride-2 bigrams, global features and split spaces ON; trigrams and
+     * anchored bigrams OFF.
+     */
+    private static ConfigurableByteNgramFeatureExtractor withSplitAndGlobals() {
+        return new ConfigurableByteNgramFeatureExtractor(
+                SPLIT_NUM_BUCKETS, true, true, false, false, true, true, true);
+    }
+
+    @Test
+    public void splitSpacesStride1FiresOnlyLowRegion() {
+        // Expected layout: stride-1 in [0, 16384), stride-2 in [16384, 32768),
+        // globals in [32768, 32774).
+        final int stride2Start = 16384;
+        final int globalsStart = 32768;
+
+        // High bytes only — fires stride-1 unigrams + bigrams + stride-2 pairs
+        byte[] highOnly = {(byte) 0xE4, (byte) 0xF6, (byte) 0xFC};
+        int[] counts = new int[SPLIT_NUM_BUCKETS];
+        int[] firedSlots = new int[SPLIT_NUM_BUCKETS];
+        int fired = withSplitAndGlobals().extractSparseInto(highOnly, counts, firedSlots);
+
+        // Tally fired slots per region: [0]=stride-1, [1]=stride-2, [2]=globals.
+        int[] perRegion = new int[3];
+        for (int k = 0; k < fired; k++) {
+            int slot = firedSlots[k];
+            int region = slot < stride2Start ? 0 : (slot < globalsStart ? 1 : 2);
+            perRegion[region]++;
+        }
+        assertTrue(perRegion[0] > 0, "expected stride-1 firings in low region");
+        assertTrue(perRegion[1] > 0, "expected stride-2 firings in high region");
+        assertEquals(1, perRegion[2], "exactly one global bin fires");
+    }
+
+    @Test
+    public void splitSpacesAsciiProbeFiresOnlyStride2AndGlobals() {
+        // Pure ASCII has no high bytes, so nothing may land in the stride-1
+        // region [0, 16384); every fired slot is stride-2 or the globals bin.
+        final int firstNonStride1Slot = 16384;
+        byte[] asciiProbe = "Hello, world! This is ASCII only.\r\n"
+                .getBytes(StandardCharsets.US_ASCII);
+
+        int[] counts = new int[SPLIT_NUM_BUCKETS];
+        int[] firedSlots = new int[SPLIT_NUM_BUCKETS];
+        int fired = withSplitAndGlobals().extractSparseInto(asciiProbe, counts, firedSlots);
+
+        int k = 0;
+        while (k < fired) {
+            int bkt = firedSlots[k++];
+            assertTrue(bkt >= firstNonStride1Slot,
+                    "ASCII probe must NOT fire any stride-1 slot, got bkt=" + bkt);
+        }
+    }
+
+    @Test
+    public void splitSpacesDenseSparseAgree() {
+        // The dense extract() path and the sparse extractSparseInto() path must
+        // produce the same count in every bucket under the split layout.
+        byte[] latin1Probe = "r\u00E9sum\u00E9 caf\u00E9"
+                .getBytes(StandardCharsets.ISO_8859_1);
+        ConfigurableByteNgramFeatureExtractor extractor = withSplitAndGlobals();
+
+        int[] viaDense = extractor.extract(latin1Probe);
+
+        int[] viaSparse = new int[SPLIT_NUM_BUCKETS];
+        int[] firedSlots = new int[SPLIT_NUM_BUCKETS];
+        extractor.extractSparseInto(latin1Probe, viaSparse, firedSlots);
+
+        for (int slot = 0; slot < SPLIT_NUM_BUCKETS; slot++) {
+            assertEquals(viaDense[slot], viaSparse[slot],
+                    "bucket " + slot + " differs between dense and sparse paths (split layout)");
+        }
+    }
 }
diff --git 
a/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/FeatureExtractorParityTest.java
 
b/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/FeatureExtractorParityTest.java
index d2de48f423..900a5dbb5c 100644
--- 
a/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/FeatureExtractorParityTest.java
+++ 
b/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/FeatureExtractorParityTest.java
@@ -254,4 +254,101 @@ public class FeatureExtractorParityTest {
         assertArrayEquals(dense, sparseDense,
                 "ByteNgramFeatureExtractor: extract() vs extractSparseInto() differ");
     }
+
+    // =====================================================================
+    // Parity in the split-spaces + globals layout (next-generation model).
+    // =====================================================================
+
+    private static final int SPLIT_NUM_BUCKETS =
+            32768 + ByteNgramFeatureExtractor.GLOBAL_FEATURE_COUNT;
+
+    private final ByteNgramFeatureExtractor productionSplit =
+            new ByteNgramFeatureExtractor(SPLIT_NUM_BUCKETS, true, true);
+
+    private final ConfigurableByteNgramFeatureExtractor configurableSplit =
+            new ConfigurableByteNgramFeatureExtractor(
+                    SPLIT_NUM_BUCKETS,
+                    true,   // unigrams
+                    true,   // bigrams
+                    false,  // trigrams OFF
+                    false,  // anchored OFF
+                    true,   // stride2 ON
+                    true,   // globals ON
+                    true);  // split spaces ON
+
+    private void assertSplitParity(byte[] probe) {
+        int[] prodFeatures = productionSplit.extract(probe);
+        int[] confFeatures = configurableSplit.extract(probe);
+        assertEquals(prodFeatures.length, confFeatures.length,
+                "split-layout feature vector lengths differ");
+        for (int i = 0; i < prodFeatures.length; i++) {
+            if (prodFeatures[i] != confFeatures[i]) {
+                org.junit.jupiter.api.Assertions.fail(String.format(
+                        "split-layout bucket %d: production=%d, configurable=%d",
+                        i, prodFeatures[i], confFeatures[i]));
+            }
+        }
+    }
+
+    @Test
+    public void splitParityOnPureAscii() {
+        assertSplitParity("Hello, world! This is ASCII text.\r\n"
+                .getBytes(StandardCharsets.US_ASCII));
+    }
+
+    @Test
+    public void splitParityOnHighByteContent() {
+        assertSplitParity(new byte[]{
+                (byte) 0x72, (byte) 0xE9, (byte) 0x73, (byte) 0x75,
+                (byte) 0x6D, (byte) 0xE9, (byte) 0x20,
+                (byte) 0x63, (byte) 0x61, (byte) 0x66, (byte) 0xE9
+        });
+    }
+
+    @Test
+    public void splitParityOnRealUtf16Le() {
+        assertSplitParity("日本語テスト".getBytes(StandardCharsets.UTF_16LE));
+    }
+
+    @Test
+    public void splitParityOnArabicLike() {
+        // Synthesized Arabic-style byte pattern: 0xC7/0xE1/0xE3 alef/lam/meem
+        byte[] probe = new byte[]{
+                (byte) 0xC7, (byte) 0xE1, (byte) 0xE3, 0x20,
+                (byte) 0xD9, (byte) 0xED, (byte) 0xC7, (byte) 0xE1,
+                (byte) 0xCA, (byte) 0xD1, 0x0D, 0x0A
+        };
+        assertSplitParity(probe);
+    }
+
+    @Test
+    public void splitParityOnLongMixedProbe() {
+        byte[] probe = new byte[4096];
+        for (int i = 0; i < probe.length; i++) {
+            probe[i] = (byte) ((i % 3 == 0) ? (0x80 + (i % 128)) : (0x20 + (i % 96)));
+        }
+        assertSplitParity(probe);
+    }
+
+    @Test
+    public void splitLayoutProductionDenseMatchesSparse() {
+        byte[] probe = "日本語テスト résumé".getBytes(StandardCharsets.UTF_16LE);
+        int[] dense = productionSplit.extract(probe);
+        int[] sparseDense = new int[SPLIT_NUM_BUCKETS];
+        int[] touched = new int[SPLIT_NUM_BUCKETS];
+        productionSplit.extractSparseInto(probe, sparseDense, touched);
+        assertArrayEquals(dense, sparseDense,
+                "split layout: production extract() vs extractSparseInto() differ");
+    }
+
+    @Test
+    public void splitLayoutConfigurableDenseMatchesSparse() {
+        byte[] probe = "日本語テスト résumé".getBytes(StandardCharsets.UTF_16LE);
+        int[] dense = configurableSplit.extract(probe);
+        int[] sparseDense = new int[SPLIT_NUM_BUCKETS];
+        int[] touched = new int[SPLIT_NUM_BUCKETS];
+        configurableSplit.extractSparseInto(probe, sparseDense, touched);
+        assertArrayEquals(dense, sparseDense,
+                "split layout: configurable extract() vs extractSparseInto() differ");
+    }
 }

(tika) 02/03: split strides into separate model space

Reply via email to