This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch charset-detection-improvements in repository https://gitbox.apache.org/repos/asf/tika.git
commit 4b6c3b60e2053681325ccfcca39752b4ceafed82 Author: tallison <[email protected]> AuthorDate: Mon Apr 13 13:22:32 2026 -0400 split strides into separate model space --- .../ml/chardetect/ByteNgramFeatureExtractor.java | 145 ++++++- .../ml/chardetect/tools/BucketCollisionAudit.java | 459 +++++++++++++++++++++ .../ConfigurableByteNgramFeatureExtractor.java | 105 +++-- .../ml/chardetect/tools/TraceCharsetLogits.java | 380 +++++++++++++++++ .../ml/chardetect/tools/TrainCharsetModel.java | 14 +- .../chardetect/ConfigurableGlobalFeatureTest.java | 73 ++++ .../ml/chardetect/FeatureExtractorParityTest.java | 97 +++++ 7 files changed, 1232 insertions(+), 41 deletions(-) diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractor.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractor.java index 1dfb9cfe23..baa67fbc47 100644 --- a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractor.java +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/main/java/org/apache/tika/ml/chardetect/ByteNgramFeatureExtractor.java @@ -93,22 +93,121 @@ public class ByteNgramFeatureExtractor implements FeatureExtractor<byte[]> { /** Distinct salt for stride-2 bigrams — prevents collision with stride-1 hashes. */ private static final int FNV_STRIDE2_SALT = 0x9e3779b9; + /** + * Number of reserved slots at the high end of the feature vector used for + * global (whole-probe) features when {@link #useGlobalFeatures} is enabled. + * Currently 6 slots hold ASCII-text-density bins (see + * {@link #asciiDensityBin(byte[])}). Must match the training-side + * {@code ConfigurableByteNgramFeatureExtractor.GLOBAL_FEATURE_COUNT}. + */ + public static final int GLOBAL_FEATURE_COUNT = 6; + private final int numBuckets; + private final int stride1Buckets; + private final int stride2Buckets; + private final int stride2Base; + private final int globalBase; + private final boolean useGlobalFeatures; + private final boolean useSplitSpaces; /** - * Create an extractor with the production feature set (UBT-: unigrams + - * bigrams + trigrams, no anchored bigrams) and the given bucket count. - * The bucket count must match the model the extractor will be paired with — - * in practice this is read from the model binary via - * {@link org.apache.tika.ml.LinearModel#getNumBuckets()}. + * Legacy constructor: no globals, shared stride-1/stride-2 hash space. + * Matches the layout used by the shipped {@code chardetect-v6-no-utf32.bin}. * * @param numBuckets number of hash buckets (feature-vector dimension) */ public ByteNgramFeatureExtractor(int numBuckets) { + this(numBuckets, false, false); + } + + /** + * Create an extractor matching the layout of a trained model. + * + * @param numBuckets total feature-vector dimension. + * @param useGlobalFeatures reserve the last {@link #GLOBAL_FEATURE_COUNT} + * slots for ASCII-density bin features. + * @param useSplitSpaces split the hash space 50/50 between stride-1 + * features (low half) and stride-2 features + * (high half) so cross-family hash collisions + * cannot pollute single-byte-charset weights + * with stride-2 signals. + */ + public ByteNgramFeatureExtractor(int numBuckets, + boolean useGlobalFeatures, + boolean useSplitSpaces) { if (numBuckets <= 0) { throw new IllegalArgumentException("numBuckets must be positive: " + numBuckets); } + int globalsReserved = useGlobalFeatures ? GLOBAL_FEATURE_COUNT : 0; + int hashSpace = numBuckets - globalsReserved; + if (hashSpace <= 0) { + throw new IllegalArgumentException( + "numBuckets must exceed GLOBAL_FEATURE_COUNT when useGlobalFeatures=true: " + + numBuckets); + } + if (useSplitSpaces && hashSpace < 2) { + throw new IllegalArgumentException( + "useSplitSpaces requires hashSpace >= 2: " + hashSpace); + } this.numBuckets = numBuckets; + this.useSplitSpaces = useSplitSpaces; + this.useGlobalFeatures = useGlobalFeatures; + if (useSplitSpaces) { + this.stride1Buckets = hashSpace / 2; + this.stride2Buckets = hashSpace - this.stride1Buckets; + this.stride2Base = this.stride1Buckets; + } else { + this.stride1Buckets = hashSpace; + this.stride2Buckets = hashSpace; + this.stride2Base = 0; + } + this.globalBase = hashSpace; + } + + /** + * Returns which ASCII-text-density bin this probe falls into, in [0, 6). + * Must match the training-side + * {@code ConfigurableByteNgramFeatureExtractor.asciiDensityBin}. + * + * <p>Bin layout (fraction of bytes that are ASCII-text: printable + * {@code 0x20..0x7E} plus {@code 0x09 0x0A 0x0D}):</p> + * <ul> + * <li>0: [0.00, 0.10)</li> + * <li>1: [0.10, 0.50)</li> + * <li>2: [0.50, 0.80)</li> + * <li>3: [0.80, 0.95)</li> + * <li>4: [0.95, 0.99)</li> + * <li>5: [0.99, 1.00]</li> + * </ul> + */ + public static int asciiDensityBin(byte[] input) { + if (input == null || input.length == 0) { + return 5; + } + int asciiText = 0; + for (byte b : input) { + int v = b & 0xFF; + if ((v >= 0x20 && v <= 0x7E) || v == 0x09 || v == 0x0A || v == 0x0D) { + asciiText++; + } + } + double p = (double) asciiText / input.length; + if (p < 0.10) { + return 0; + } + if (p < 0.50) { + return 1; + } + if (p < 0.80) { + return 2; + } + if (p < 0.95) { + return 3; + } + if (p < 0.99) { + return 4; + } + return 5; } @Override @@ -166,7 +265,7 @@ public class ByteNgramFeatureExtractor implements FeatureExtractor<byte[]> { // Unigram int h = (FNV_OFFSET ^ bi) * FNV_PRIME; - int bkt = (h & 0x7fffffff) % numBuckets; + int bkt = stride1Bucket(h); if (dense[bkt] == 0) { touched[n++] = bkt; } @@ -178,7 +277,7 @@ public class ByteNgramFeatureExtractor implements FeatureExtractor<byte[]> { // Bigram h = (FNV_OFFSET ^ bi) * FNV_PRIME; h = (h ^ bi1) * FNV_PRIME; - bkt = (h & 0x7fffffff) % numBuckets; + bkt = stride1Bucket(h); if (dense[bkt] == 0) { touched[n++] = bkt; } @@ -193,7 +292,16 @@ public class ByteNgramFeatureExtractor implements FeatureExtractor<byte[]> { int b1 = input[i + 1] & 0xFF; int h = (FNV_STRIDE2_SALT ^ b0) * FNV_PRIME; h = (h ^ b1) * FNV_PRIME; - int bkt = (h & 0x7fffffff) % numBuckets; + int bkt = stride2Bucket(h); + if (dense[bkt] == 0) { + touched[n++] = bkt; + } + dense[bkt]++; + } + + // Global features: fire exactly one ASCII-density bin. + if (useGlobalFeatures) { + int bkt = globalBase + asciiDensityBin(input); if (dense[bkt] == 0) { touched[n++] = bkt; } @@ -212,7 +320,7 @@ public class ByteNgramFeatureExtractor implements FeatureExtractor<byte[]> { } // Unigram - counts[bucket((FNV_OFFSET ^ bi) * FNV_PRIME)]++; + counts[stride1Bucket((FNV_OFFSET ^ bi) * FNV_PRIME)]++; if (i + 1 < to) { int bi1 = b[i + 1] & 0xFF; @@ -220,7 +328,7 @@ public class ByteNgramFeatureExtractor implements FeatureExtractor<byte[]> { // Bigram int h = (FNV_OFFSET ^ bi) * FNV_PRIME; h = (h ^ bi1) * FNV_PRIME; - counts[bucket(h)]++; + counts[stride1Bucket(h)]++; } } @@ -230,12 +338,23 @@ public class ByteNgramFeatureExtractor implements FeatureExtractor<byte[]> { int b1 = b[i + 1] & 0xFF; int h = (FNV_STRIDE2_SALT ^ b0) * FNV_PRIME; h = (h ^ b1) * FNV_PRIME; - counts[bucket(h)]++; + counts[stride2Bucket(h)]++; } + + // Global features: fire exactly one ASCII-density bin. + if (useGlobalFeatures) { + byte[] slice = (from == 0 && to == b.length) + ? b : java.util.Arrays.copyOfRange(b, from, to); + counts[globalBase + asciiDensityBin(slice)]++; + } + } + + private int stride1Bucket(int hash) { + return (hash & 0x7fffffff) % stride1Buckets; } - private int bucket(int hash) { - return (hash & 0x7fffffff) % numBuckets; + private int stride2Bucket(int hash) { + return stride2Base + (hash & 0x7fffffff) % stride2Buckets; } @Override diff --git a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BucketCollisionAudit.java b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BucketCollisionAudit.java new file mode 100644 index 0000000000..35a9fcd5cf --- /dev/null +++ b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/BucketCollisionAudit.java @@ -0,0 +1,459 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.chardetect.tools; + +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Locale; +import java.util.Map; + +import org.apache.tika.ml.LinearModel; +import org.apache.tika.ml.chardetect.ByteNgramFeatureExtractor; + +/** + * Audits hash-bucket collisions for the shipped feature extractor. For a + * given probe, shows which n-grams fired which buckets, and for each bucket + * lists every OTHER n-gram in the extractor's n-gram space that would hash + * to the same bucket. Optionally restricts the "colliding peers" enumeration + * to specific byte-range classes (Arabic vs Central European letters, etc.). + * + * <p>Usage: + * <pre> + * java BucketCollisionAudit --probe <file> [--model <path>] + * [--max-probe-bytes N] [--top N] + * </pre> + * + * <p>Uses the exact FNV constants from {@link ByteNgramFeatureExtractor}. + * Enumerates four feature families: + * <ul> + * <li>Unigrams — one byte in 0x80..0xFF (128 entries)</li> + * <li>Bigrams — high byte then any byte (128 * 256 = 32,768 entries)</li> + * <li>Anchored bigrams — one salt, (low-trail, any) byte pairs + * (128 * 256 = 32,768 entries, only those following a high byte)</li> + * <li>Stride-2 bigrams — (any, any) at even positions (256 * 256 = 65,536 entries)</li> + * </ul> + */ +public final class BucketCollisionAudit { + + private static final int FNV_PRIME = 0x01000193; + private static final int FNV_OFFSET = 0x811c9dc5; + private static final int FNV_ANCHOR_SALT = 0x27d4eb2f; + private static final int FNV_STRIDE2_SALT = 0x9e3779b9; + + private BucketCollisionAudit() { + } + + public static void main(String[] args) throws Exception { + Path probePath = null; + Path modelPath = null; + int maxProbeBytes = 32 * 1024; + int topBuckets = 20; + + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "--probe": + probePath = Paths.get(args[++i]); + break; + case "--model": + modelPath = Paths.get(args[++i]); + break; + case "--max-probe-bytes": + maxProbeBytes = Integer.parseInt(args[++i]); + break; + case "--top": + topBuckets = Integer.parseInt(args[++i]); + break; + default: + System.err.println("Unknown arg: " + args[i]); + System.exit(1); + } + } + if (probePath == null) { + System.err.println("Usage: BucketCollisionAudit --probe <file> [--model <path>] " + + "[--max-probe-bytes N] [--top N]"); + System.exit(1); + } + + LinearModel model = loadModel(modelPath); + int numBuckets = model.getNumBuckets(); + ByteNgramFeatureExtractor extractor = new ByteNgramFeatureExtractor(numBuckets); + + // Pre-build inverse map: bucket -> list of n-grams that hash to it. + System.out.printf(Locale.ROOT, + "Building inverse bucket map over %,d buckets (can take a few seconds)...%n", + numBuckets); + List<Ngram>[] inverse = buildInverseMap(numBuckets); + + // Collision-rate summary. + int maxSize = 0; + long totalNgrams = 0; + int populated = 0; + for (List<Ngram> l : inverse) { + if (l == null || l.isEmpty()) { + continue; + } + populated++; + totalNgrams += l.size(); + if (l.size() > maxSize) { + maxSize = l.size(); + } + } + double avg = populated > 0 ? (double) totalNgrams / populated : 0; + System.out.printf(Locale.ROOT, + "n-grams enumerated: %,d populated buckets: %,d / %,d (%.1f%%) " + + "avg n-grams/bucket: %.2f max: %d%n%n", + totalNgrams, populated, numBuckets, + 100.0 * populated / numBuckets, avg, maxSize); + + // Load probe, extract features. + byte[] all = Files.readAllBytes(probePath); + byte[] probe = all.length <= maxProbeBytes ? all : Arrays.copyOf(all, maxProbeBytes); + int[] features = extractor.extract(probe); + + int nnz = 0; + for (int v : features) { + if (v != 0) { + nnz++; + } + } + System.out.printf(Locale.ROOT, + "Probe %s: %,d bytes (probe: %,d), %,d active buckets%n%n", + probePath, all.length, probe.length, nnz); + + // For the top-N hottest buckets (by count), show which of this probe's + // n-grams fired them, and list every OTHER n-gram that hashes to the + // same bucket. + Integer[] order = new Integer[numBuckets]; + for (int i = 0; i < numBuckets; i++) { + order[i] = i; + } + Arrays.sort(order, Comparator.comparingInt((Integer i) -> -features[i])); + + // Compute which n-grams from THIS probe fired each bucket (with occurrences). + Map<Integer, List<Ngram>> probeFirings = new LinkedHashMap<>(); + enumerateProbeFirings(probe, numBuckets, probeFirings); + + byte[][] weights = model.getWeights(); + float[] scales = model.getScales(); + String[] labels = model.getLabels(); + + int ibm852 = indexOf(labels, "IBM852"); + int win1256 = indexOf(labels, "windows-1256"); + int win1250 = indexOf(labels, "windows-1250"); + + System.out.printf(Locale.ROOT, "Top-%d hottest buckets on this probe:%n", topBuckets); + System.out.println("===================================================================="); + int shown = 0; + for (int rank = 0; rank < numBuckets && shown < topBuckets; rank++) { + int b = order[rank]; + if (features[b] == 0) { + break; + } + shown++; + String ibm852Col = col(weights, scales, b, ibm852, features[b]); + String win1256Col = col(weights, scales, b, win1256, features[b]); + String win1250Col = col(weights, scales, b, win1250, features[b]); + System.out.printf(Locale.ROOT, + "Bucket %5d count %3d IBM852:%s win-1256:%s win-1250:%s%n", + b, features[b], ibm852Col, win1256Col, win1250Col); + List<Ngram> fired = probeFirings.getOrDefault(b, new ArrayList<>()); + List<Ngram> allHere = inverse[b]; + System.out.printf(Locale.ROOT, + " fired by probe (%d distinct ngram kinds):%n", fired.size()); + for (Ngram ng : fired) { + System.out.println(" " + ng.describe()); + } + System.out.printf(Locale.ROOT, + " other n-grams colliding into this bucket (%d total):%n", + allHere == null ? 0 : allHere.size() - fired.size()); + if (allHere != null) { + int samples = 0; + for (Ngram ng : allHere) { + if (containsSame(fired, ng)) { + continue; + } + if (samples++ >= 8) { + break; + } + System.out.println(" " + ng.describe()); + } + } + System.out.println(); + } + } + + private static String col(byte[][] weights, float[] scales, int bucket, + int cls, int count) { + if (cls < 0) { + return "(n/a)"; + } + int w = weights[cls][bucket]; + float raw = scales[cls] * w * count; + return String.format(Locale.ROOT, "w=%+4d raw=%+7.1f", w, raw); + } + + private static int indexOf(String[] labels, String target) { + for (int i = 0; i < labels.length; i++) { + if (labels[i].equalsIgnoreCase(target)) { + return i; + } + } + return -1; + } + + private static boolean containsSame(List<Ngram> list, Ngram ng) { + for (Ngram o : list) { + if (o.equalsNgram(ng)) { + return true; + } + } + return false; + } + + private static LinearModel loadModel(Path modelPath) throws Exception { + if (modelPath != null) { + return LinearModel.loadFromPath(modelPath); + } + String res = "/org/apache/tika/ml/chardetect/chardetect-v6-no-utf32.bin"; + try (InputStream is = BucketCollisionAudit.class.getResourceAsStream(res)) { + if (is == null) { + throw new IllegalStateException("default model resource not found: " + res); + } + return LinearModel.load(is); + } + } + + // ---------------------------------------------------------------------- + // N-gram enumeration and hashing + // ---------------------------------------------------------------------- + + private static int bucket(int hash, int numBuckets) { + return (hash & 0x7fffffff) % numBuckets; + } + + private static int hashUnigram(int bi) { + return (FNV_OFFSET ^ bi) * FNV_PRIME; + } + + private static int hashBigram(int bi, int bi1) { + int h = (FNV_OFFSET ^ bi) * FNV_PRIME; + return (h ^ bi1) * FNV_PRIME; + } + + private static int hashAnchored(int lowTrail, int next) { + int h = (FNV_ANCHOR_SALT ^ lowTrail) * FNV_PRIME; + return (h ^ next) * FNV_PRIME; + } + + private static int hashAnchoredNoTrail(int lowTrail) { + // When the low-trail is the last byte in the probe, anchored bigram + // has no 'next' — the extractor emits just the hash seeded with lowTrail. + return (FNV_ANCHOR_SALT ^ lowTrail) * FNV_PRIME; + } + + private static int hashStride2(int b0, int b1) { + int h = (FNV_STRIDE2_SALT ^ b0) * FNV_PRIME; + return (h ^ b1) * FNV_PRIME; + } + + @SuppressWarnings("unchecked") + private static List<Ngram>[] buildInverseMap(int numBuckets) { + List<Ngram>[] inverse = new List[numBuckets]; + + // Unigrams: high bytes only. + for (int bi = 0x80; bi < 0x100; bi++) { + add(inverse, bucket(hashUnigram(bi), numBuckets), Ngram.unigram(bi)); + } + // Bigrams: (high, any). + for (int bi = 0x80; bi < 0x100; bi++) { + for (int bi1 = 0; bi1 < 0x100; bi1++) { + add(inverse, bucket(hashBigram(bi, bi1), numBuckets), + Ngram.bigram(bi, bi1)); + } + } + // Anchored: (low-trail, any) — only fires when preceded by a high byte. + // Hash doesn't include the precursor; two variants depending on whether + // a 'next' byte exists. + for (int bi1 = 0; bi1 < 0x80; bi1++) { + add(inverse, bucket(hashAnchoredNoTrail(bi1), numBuckets), + Ngram.anchoredNoNext(bi1)); + for (int bi2 = 0; bi2 < 0x100; bi2++) { + add(inverse, bucket(hashAnchored(bi1, bi2), numBuckets), + Ngram.anchored(bi1, bi2)); + } + } + // Stride-2: (any, any). + for (int b0 = 0; b0 < 0x100; b0++) { + for (int b1 = 0; b1 < 0x100; b1++) { + add(inverse, bucket(hashStride2(b0, b1), numBuckets), + Ngram.stride2(b0, b1)); + } + } + return inverse; + } + + private static void add(List<Ngram>[] inv, int b, Ngram ng) { + if (inv[b] == null) { + inv[b] = new ArrayList<>(); + } + inv[b].add(ng); + } + + /** + * For a given probe, walk the exact same emission logic as + * {@link ByteNgramFeatureExtractor#extractSparseInto} and record, per + * bucket, which n-gram(s) fired it. This is needed because the + * inverse map gives us the universe of potentially-colliding n-grams, + * and we want to separate "this probe fired it via X" from + * "X' is a colliding peer that didn't fire here." + */ + private static void enumerateProbeFirings(byte[] input, int numBuckets, + Map<Integer, List<Ngram>> firings) { + // Stride-1 + for (int i = 0; i < input.length; i++) { + int bi = input[i] & 0xFF; + if (bi < 0x80) { + continue; + } + addFiring(firings, bucket(hashUnigram(bi), numBuckets), Ngram.unigram(bi)); + if (i + 1 < input.length) { + int bi1 = input[i + 1] & 0xFF; + addFiring(firings, bucket(hashBigram(bi, bi1), numBuckets), + Ngram.bigram(bi, bi1)); + if (bi1 < 0x80) { + if (i + 2 < input.length) { + int bi2 = input[i + 2] & 0xFF; + addFiring(firings, bucket(hashAnchored(bi1, bi2), numBuckets), + Ngram.anchored(bi1, bi2)); + } else { + addFiring(firings, bucket(hashAnchoredNoTrail(bi1), numBuckets), + Ngram.anchoredNoNext(bi1)); + } + } + } + } + // Stride-2 + for (int i = 0; i + 1 < input.length; i += 2) { + int b0 = input[i] & 0xFF; + int b1 = input[i + 1] & 0xFF; + addFiring(firings, bucket(hashStride2(b0, b1), numBuckets), + Ngram.stride2(b0, b1)); + } + } + + private static void addFiring(Map<Integer, List<Ngram>> firings, int b, Ngram ng) { + List<Ngram> list = firings.computeIfAbsent(b, k -> new ArrayList<>()); + for (Ngram o : list) { + if (o.equalsNgram(ng)) { + return; + } + } + list.add(ng); + } + + private static final class Ngram { + final char kind; // 'U' 'B' 'A' 'a' (anchored-no-next) 'S' + final int a; + final int b; + + Ngram(char kind, int a, int b) { + this.kind = kind; + this.a = a; + this.b = b; + } + + static Ngram unigram(int bi) { + return new Ngram('U', bi, -1); + } + + static Ngram bigram(int bi, int bi1) { + return new Ngram('B', bi, bi1); + } + + static Ngram anchored(int low, int next) { + return new Ngram('A', low, next); + } + + static Ngram anchoredNoNext(int low) { + return new Ngram('a', low, -1); + } + + static Ngram stride2(int b0, int b1) { + return new Ngram('S', b0, b1); + } + + boolean equalsNgram(Ngram o) { + return kind == o.kind && a == o.a && b == o.b; + } + + String describe() { + switch (kind) { + case 'U': + return String.format(Locale.ROOT, "UNIGRAM 0x%02X (%s)", + a, letterHint(a)); + case 'B': + return String.format(Locale.ROOT, "BIGRAM 0x%02X 0x%02X (%s, %s)", + a, b, letterHint(a), letterHint(b)); + case 'A': + return String.format(Locale.ROOT, "ANCHORED 0x%02X 0x%02X (%s after high byte)", + a, b, asciiHint(a)); + case 'a': + return String.format(Locale.ROOT, "ANCHOR-L 0x%02X (%s at end after high byte)", + a, asciiHint(a)); + case 'S': + return String.format(Locale.ROOT, "STRIDE2 0x%02X 0x%02X", + a, b); + default: + return "?"; + } + } + + private static String letterHint(int v) { + if (v < 0x80) { + return asciiHint(v); + } + if (v == 0xC7) return "alef[1256]/Ă[852]"; + if (v == 0xE1) return "lam[1256]/ß[852]"; + if (v == 0xE3) return "meem[1256]/Ń[852]"; + if (v == 0xCA) return "teh[1256]/╩[852]"; + if (v == 0xD1) return "reh[1256]/Đ[852]"; + if (v == 0xED) return "yeh[1256]/ý[852]"; + if (v == 0xE7) return "ain[1256]/š[852]"; + if (v == 0xCF) return "ithal[1256]/¤[852]"; + if (v == 0xE4) return "nun[1256]/ń[852]"; + if (v == 0xE6) return "waw[1256]/Š[852]"; + if (v == 0xE9) return "yeh[1256]/Ú[852]"; + if (v == 0xF4) return "fathaton[1256]/─[852]"; + return String.format(Locale.ROOT, "hi-%02X", v); + } + + private static String asciiHint(int v) { + if (v == 0x20) return "SP"; + if (v == 0x0A) return "LF"; + if (v == 0x0D) return "CR"; + if (v >= 0x21 && v <= 0x7E) return "'" + ((char) v) + "'"; + return String.format(Locale.ROOT, "\\x%02X", v); + } + } +} diff --git a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java index c2659396d2..88469abab9 100644 --- a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java +++ b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/ConfigurableByteNgramFeatureExtractor.java @@ -65,16 +65,20 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b public static final int GLOBAL_FEATURE_COUNT = 6; private final int numBuckets; - private final int hashBuckets; + private final int stride1Buckets; // size of the stride-1 hash region + private final int stride2Buckets; // size of the stride-2 hash region + private final int stride2Base; // first slot of the stride-2 region + private final int globalBase; // first slot of the globals region (or numBuckets if disabled) private final boolean useUnigrams; private final boolean useBigrams; private final boolean useTrigrams; private final boolean useAnchoredBigrams; private final boolean useStride2Bigrams; private final boolean useGlobalFeatures; + private final boolean useSplitSpaces; /** - * Backwards-compatible constructor (no global features). + * Backwards-compatible constructor (no global features, no split spaces). */ public ConfigurableByteNgramFeatureExtractor(int numBuckets, boolean useUnigrams, @@ -86,13 +90,31 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b useAnchoredBigrams, useStride2Bigrams, false); } + /** + * Constructor with globals support, shared hash space (stride-1 and stride-2 + * mod into the same bucket range). + */ + public ConfigurableByteNgramFeatureExtractor(int numBuckets, + boolean useUnigrams, + boolean useBigrams, + boolean useTrigrams, + boolean useAnchoredBigrams, + boolean useStride2Bigrams, + boolean useGlobalFeatures) { + this(numBuckets, useUnigrams, useBigrams, useTrigrams, + useAnchoredBigrams, useStride2Bigrams, useGlobalFeatures, false); + } + /** * @param numBuckets total feature-vector dimension. When * {@code useGlobalFeatures} is {@code true}, the * last {@link #GLOBAL_FEATURE_COUNT} slots are - * reserved for global features and hashed n-gram - * features mod into the first - * {@code numBuckets - GLOBAL_FEATURE_COUNT} slots. + * reserved for global features. When + * {@code useSplitSpaces} is {@code true}, the + * remaining hash space is split 50/50 between + * stride-1 features and stride-2 features so + * HTML-shaped stride-2 emissions cannot collide + * with single-byte-charset stride-1 weights. * @param useUnigrams emit unigram for each high byte * @param useBigrams emit bigram anchored on each high byte * @param useTrigrams emit trigram anchored on each high byte @@ -100,6 +122,8 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b * @param useStride2Bigrams emit stride-2 bigrams at even positions (all bytes) * @param useGlobalFeatures emit whole-probe global features into the * reserved tail slots (ASCII-density bins) + * @param useSplitSpaces give stride-1 and stride-2 features disjoint + * bucket ranges */ public ConfigurableByteNgramFeatureExtractor(int numBuckets, boolean useUnigrams, @@ -107,17 +131,37 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b boolean useTrigrams, boolean useAnchoredBigrams, boolean useStride2Bigrams, - boolean useGlobalFeatures) { + boolean useGlobalFeatures, + boolean useSplitSpaces) { if (numBuckets <= 0) { throw new IllegalArgumentException("numBuckets must be positive: " + numBuckets); } - if (useGlobalFeatures && numBuckets <= GLOBAL_FEATURE_COUNT) { + int globalsReserved = useGlobalFeatures ? GLOBAL_FEATURE_COUNT : 0; + int hashSpace = numBuckets - globalsReserved; + if (hashSpace <= 0) { throw new IllegalArgumentException( "numBuckets must exceed GLOBAL_FEATURE_COUNT (" + GLOBAL_FEATURE_COUNT + ") when useGlobalFeatures=true: " + numBuckets); } + if (useSplitSpaces && hashSpace < 2) { + throw new IllegalArgumentException( + "useSplitSpaces requires hashSpace >= 2: " + hashSpace); + } this.numBuckets = numBuckets; - this.hashBuckets = useGlobalFeatures ? numBuckets - GLOBAL_FEATURE_COUNT : numBuckets; + this.useSplitSpaces = useSplitSpaces; + if (useSplitSpaces) { + // 50/50 split; stride-1 gets the first half, stride-2 gets the second. + this.stride1Buckets = hashSpace / 2; + this.stride2Buckets = hashSpace - this.stride1Buckets; + this.stride2Base = this.stride1Buckets; + } else { + // Both stride families share the same hash region [0, hashSpace). + this.stride1Buckets = hashSpace; + this.stride2Buckets = hashSpace; + this.stride2Base = 0; + } + // Globals region always starts immediately after the hash region(s). + this.globalBase = hashSpace; this.useUnigrams = useUnigrams; this.useBigrams = useBigrams; this.useTrigrams = useTrigrams; @@ -211,7 +255,7 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b if (useUnigrams) { int h = (FNV_OFFSET ^ bi) * FNV_PRIME; - int bkt = (h & 0x7fffffff) % hashBuckets; + int bkt = stride1Bucket(h); if (dense[bkt] == 0) { touched[n++] = bkt; } @@ -224,7 +268,7 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b if (useBigrams) { int h = (FNV_OFFSET ^ bi) * FNV_PRIME; h = (h ^ bi1) * FNV_PRIME; - int bkt = (h & 0x7fffffff) % hashBuckets; + int bkt = stride1Bucket(h); if (dense[bkt] == 0) { touched[n++] = bkt; } @@ -236,7 +280,7 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b if (i + 2 < input.length) { h = (h ^ (input[i + 2] & 0xFF)) * FNV_PRIME; } - int bkt = (h & 0x7fffffff) % hashBuckets; + int bkt = stride1Bucket(h); if (dense[bkt] == 0) { touched[n++] = bkt; } @@ -248,7 +292,7 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b int h = (FNV_OFFSET ^ bi) * FNV_PRIME; h = (h ^ bi1) * FNV_PRIME; h = (h ^ bi2) * FNV_PRIME; - int bkt = (h & 0x7fffffff) % hashBuckets; + int bkt = stride1Bucket(h); if (dense[bkt] == 0) { touched[n++] = bkt; } @@ -264,7 +308,7 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b int b1 = input[i + 1] & 0xFF; int h = (FNV_STRIDE2_SALT ^ b0) * FNV_PRIME; h = (h ^ b1) * FNV_PRIME; - int bkt = (h & 0x7fffffff) % hashBuckets; + int bkt = stride2Bucket(h); if (dense[bkt] == 0) { touched[n++] = bkt; } @@ -274,7 +318,7 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b // Global features at reserved tail slots: fire exactly one ASCII-density bin. if (useGlobalFeatures) { - int bkt = hashBuckets + asciiDensityBin(input); + int bkt = globalBase + asciiDensityBin(input); if (dense[bkt] == 0) { touched[n++] = bkt; } @@ -293,7 +337,7 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b } if (useUnigrams) { - counts[bucket((FNV_OFFSET ^ bi) * FNV_PRIME)]++; + counts[stride1Bucket((FNV_OFFSET ^ bi) * FNV_PRIME)]++; } if (i + 1 < to) { @@ -302,7 +346,7 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b if (useBigrams) { int h = (FNV_OFFSET ^ bi) * FNV_PRIME; h = (h ^ bi1) * FNV_PRIME; - counts[bucket(h)]++; + counts[stride1Bucket(h)]++; } if (useAnchoredBigrams && bi1 < 0x80) { @@ -310,7 +354,7 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b if (i + 2 < to) { h = (h ^ (b[i + 2] & 0xFF)) * FNV_PRIME; } - counts[bucket(h)]++; + counts[stride1Bucket(h)]++; } if (useTrigrams && i + 2 < to) { @@ -318,7 +362,7 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b int h = (FNV_OFFSET ^ bi) * FNV_PRIME; h = (h ^ bi1) * FNV_PRIME; h = (h ^ bi2) * FNV_PRIME; - counts[bucket(h)]++; + counts[stride1Bucket(h)]++; } } } @@ -330,7 +374,7 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b int b1 = b[i + 1] & 0xFF; int h = (FNV_STRIDE2_SALT ^ b0) * FNV_PRIME; h = (h ^ b1) * FNV_PRIME; - counts[bucket(h)]++; + counts[stride2Bucket(h)]++; } } @@ -338,12 +382,16 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b if (useGlobalFeatures) { byte[] slice = (from == 0 && to == b.length) ? b : java.util.Arrays.copyOfRange(b, from, to); - counts[hashBuckets + asciiDensityBin(slice)]++; + counts[globalBase + asciiDensityBin(slice)]++; } } - private int bucket(int hash) { - return (hash & 0x7fffffff) % hashBuckets; + private int stride1Bucket(int hash) { + return (hash & 0x7fffffff) % stride1Buckets; + } + + private int stride2Bucket(int hash) { + return stride2Base + (hash & 0x7fffffff) % stride2Buckets; } @Override @@ -351,11 +399,18 @@ public class ConfigurableByteNgramFeatureExtractor implements FeatureExtractor<b return numBuckets; } + public boolean isUseSplitSpaces() { + return useSplitSpaces; + } + @Override public String toString() { return String.format(java.util.Locale.ROOT, - "ConfigurableByteNgramFeatureExtractor{buckets=%d, hash=%d, uni=%b, bi=%b, tri=%b, anchored=%b, stride2=%b, globals=%b}", - numBuckets, hashBuckets, useUnigrams, useBigrams, useTrigrams, - useAnchoredBigrams, useStride2Bigrams, useGlobalFeatures); + "ConfigurableByteNgramFeatureExtractor{buckets=%d, stride1=[0,%d) stride2=[%d,%d) globals=[%d,%d)" + + " uni=%b, bi=%b, tri=%b, anchored=%b, stride2f=%b, globalsf=%b, split=%b}", + numBuckets, stride1Buckets, stride2Base, stride2Base + stride2Buckets, + globalBase, numBuckets, + useUnigrams, useBigrams, useTrigrams, useAnchoredBigrams, + useStride2Bigrams, useGlobalFeatures, useSplitSpaces); } } diff --git a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TraceCharsetLogits.java b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TraceCharsetLogits.java new file mode 100644 index 0000000000..dfe13b3ade --- /dev/null +++ b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TraceCharsetLogits.java @@ -0,0 +1,380 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.chardetect.tools; + +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Comparator; +import java.util.List; +import java.util.Locale; + +import org.apache.tika.ml.FeatureExtractor; +import org.apache.tika.ml.LinearModel; +import org.apache.tika.ml.chardetect.ByteNgramFeatureExtractor; + +/** + * Forensic trace for a single probe: top-15 raw logits, per-class bucket + * contribution breakdown, and probe statistics. Helps diagnose cases where + * the model is confidently wrong (e.g. the Arabic-vs-IBM852 rank-15 case). + * + * <p>Usage: + * <pre> + * java TraceCharsetLogits --probe <file> [--model <path>] + * [--focus label1,label2,...] [--top-buckets N] + * [--max-probe-bytes N] + * </pre> + */ +public final class TraceCharsetLogits { + + private TraceCharsetLogits() { + } + + public static void main(String[] args) throws Exception { + Path probePath = null; + Path modelPath = null; + List<String> focus = new ArrayList<>(); + int topBuckets = 20; + int maxProbeBytes = 32 * 1024; + boolean noStride2 = false; + + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "--probe": + probePath = Paths.get(args[++i]); + break; + case "--model": + modelPath = Paths.get(args[++i]); + break; + case "--focus": + for (String s : args[++i].split(",")) { + focus.add(s.trim()); + } + break; + case "--top-buckets": + topBuckets = Integer.parseInt(args[++i]); + break; + case "--max-probe-bytes": + maxProbeBytes = Integer.parseInt(args[++i]); + break; + case "--no-stride2": + noStride2 = true; + break; + default: + System.err.println("Unknown arg: " + args[i]); + System.exit(1); + } + } + if (probePath == null) { + System.err.println("Usage: TraceCharsetLogits --probe <file> [--model <path>] " + + "[--focus <label1,label2,...>] [--top-buckets N] [--max-probe-bytes N]"); + System.exit(1); + } + + LinearModel model = loadModel(modelPath); + FeatureExtractor<byte[]> extractor = noStride2 + // Production flags minus stride-2, matching FeatureExtractorParityTest + // for the stride-1 features (uni + bi, no trigrams, no anchored). + ? new ConfigurableByteNgramFeatureExtractor(model.getNumBuckets(), + true, true, false, false, false) + : new ByteNgramFeatureExtractor(model.getNumBuckets()); + if (noStride2) { + System.out.println("Stride-2 features suppressed for this run."); + } + + byte[] allBytes = Files.readAllBytes(probePath); + byte[] probe = allBytes.length <= maxProbeBytes + ? allBytes + : Arrays.copyOf(allBytes, maxProbeBytes); + + printProbeStats(probePath, allBytes.length, probe); + + int[] features = extractor.extract(probe); + float[] logits = model.predictLogits(features); + + String[] labels = model.getLabels(); + int numClasses = labels.length; + + // Top-15 by raw logit + Integer[] order = new Integer[numClasses]; + for (int i = 0; i < numClasses; i++) { + order[i] = i; + } + Arrays.sort(order, Comparator.comparingDouble((Integer i) -> -logits[i])); + + System.out.println(); + System.out.println("Top-15 raw logits:"); + System.out.println(" rank label logit gap-from-top"); + float topLogit = logits[order[0]]; + for (int r = 0; r < Math.min(15, numClasses); r++) { + int c = order[r]; + System.out.printf(Locale.ROOT, + " %3d %-24s %10.1f %+10.1f%n", + r + 1, labels[c], logits[c], logits[c] - topLogit); + } + + // Per-class bucket contribution breakdown for top-1 and any --focus classes + List<String> forensic = new ArrayList<>(); + forensic.add(labels[order[0]]); + for (String f : focus) { + if (!forensic.contains(f)) { + forensic.add(f); + } + } + + byte[][] weights = model.getWeights(); + float[] scales = model.getScales(); + float[] biases = model.getBiases(); + int numBuckets = model.getNumBuckets(); + + for (String label : forensic) { + int c = indexOf(labels, label); + if (c < 0) { + System.out.println(); + System.out.println("(label '" + label + "' not in model)"); + continue; + } + System.out.println(); + System.out.printf(Locale.ROOT, "Per-bucket contributions for %s (class %d, bias=%.2f, scale=%.4g):%n", + label, c, biases[c], scales[c]); + + float clip = 1.5f * (float) Math.sqrt(nnz(features)); + + BucketContrib[] contribs = new BucketContrib[numBuckets]; + int nContribs = 0; + for (int b = 0; b < numBuckets; b++) { + if (features[b] == 0) { + continue; + } + float raw = scales[c] * weights[c][b] * features[b]; + float clipped = Math.max(-clip, Math.min(clip, raw)); + contribs[nContribs++] = new BucketContrib(b, features[b], weights[c][b], + raw, clipped); + } + BucketContrib[] trim = Arrays.copyOf(contribs, nContribs); + Arrays.sort(trim, (a, bb) -> Float.compare(Math.abs(bb.clipped), Math.abs(a.clipped))); + + double sumClipped = 0, sumRaw = 0; + for (BucketContrib bc : trim) { + sumClipped += bc.clipped; + sumRaw += bc.raw; + } + System.out.printf(Locale.ROOT, + " active buckets: %d sum(clipped)=%.1f sum(raw)=%.1f bias=%.2f " + + "logit=%.1f clip=%.2f%n", + nContribs, sumClipped, sumRaw, biases[c], + sumClipped + biases[c], clip); + + System.out.printf(Locale.ROOT, + " top-%d buckets by |clipped contribution|:%n", topBuckets); + System.out.println(" bucket count weight(INT8) raw clipped"); + for (int k = 0; k < Math.min(topBuckets, trim.length); k++) { + BucketContrib bc = trim[k]; + System.out.printf(Locale.ROOT, + " %7d %5d %+5d %+10.2f %+10.2f%n", + bc.bucket, bc.count, bc.weight, bc.raw, bc.clipped); + } + } + + // For any pair of focus classes (or top-1 + first focus), show shared buckets. + if (forensic.size() >= 2) { + String a = forensic.get(0); + String b = forensic.get(1); + int ca = indexOf(labels, a); + int cb = indexOf(labels, b); + if (ca >= 0 && cb >= 0) { + System.out.println(); + System.out.printf(Locale.ROOT, + "Head-to-head bucket comparison: %s vs %s%n", a, b); + System.out.println(" bucket count wA wB raw-diff " + + "(wA-wB)*scale*count ~ net logit delta for A over B"); + float scA = scales[ca]; + float scB = scales[cb]; + List<BucketDiff> diffs = new ArrayList<>(); + for (int bk = 0; bk < numBuckets; bk++) { + if (features[bk] == 0) { + continue; + } + float rawA = scA * weights[ca][bk] * features[bk]; + float rawB = scB * weights[cb][bk] * features[bk]; + float diff = rawA - rawB; + diffs.add(new BucketDiff(bk, features[bk], + weights[ca][bk], weights[cb][bk], rawA, rawB, diff)); + } + diffs.sort((x, y) -> Float.compare(Math.abs(y.diff), Math.abs(x.diff))); + for (int k = 0; k < Math.min(topBuckets, diffs.size()); k++) { + BucketDiff d = diffs.get(k); + System.out.printf(Locale.ROOT, + " %7d %5d %+4d %+4d %+10.2f %+10.2f%n", + d.bucket, d.count, d.wA, d.wB, d.rawA - d.rawB, d.diff); + } + } + } + } + + private static int nnz(int[] features) { + int n = 0; + for (int v : features) { + if (v != 0) { + n++; + } + } + return n; + } + + private static int indexOf(String[] labels, String target) { + for (int i = 0; i < labels.length; i++) { + if (labels[i].equalsIgnoreCase(target)) { + return i; + } + } + return -1; + } + + private static LinearModel loadModel(Path modelPath) throws Exception { + if (modelPath != null) { + return LinearModel.loadFromPath(modelPath); + } + // Default: the model shipped with mojibuster. + String res = "/org/apache/tika/ml/chardetect/chardetect-v6-no-utf32.bin"; + try (InputStream is = TraceCharsetLogits.class.getResourceAsStream(res)) { + if (is == null) { + throw new IllegalStateException("default model resource not found: " + res); + } + return LinearModel.load(is); + } + } + + private static void printProbeStats(Path p, long fileSize, byte[] probe) { + int[] hist = new int[256]; + int high = 0, c1 = 0, nul = 0, ascii = 0, asciiText = 0; + for (byte b : probe) { + int v = b & 0xFF; + hist[v]++; + if (v >= 0x80) { + high++; + } + if (v >= 0x80 && v < 0xA0) { + c1++; + } + if (v == 0) { + nul++; + } + if (v < 0x80) { + ascii++; + } + if ((v >= 0x20 && v <= 0x7E) || v == 0x09 || v == 0x0A || v == 0x0D) { + asciiText++; + } + } + System.out.println("Probe trace"); + System.out.printf(Locale.ROOT, " file : %s%n", p); + System.out.printf(Locale.ROOT, " file size : %,d bytes (probe: %,d)%n", fileSize, probe.length); + System.out.printf(Locale.ROOT, + " high bytes : %,d (%.2f%%) ASCII: %,d (%.2f%%) ASCII-text: %,d (%.2f%%)%n", + high, 100.0 * high / probe.length, + ascii, 100.0 * ascii / probe.length, + asciiText, 100.0 * asciiText / probe.length); + System.out.printf(Locale.ROOT, + " C1 (0x80-9F) : %,d (%.2f%%) NUL: %,d%n", + c1, 100.0 * c1 / probe.length, nul); + + // High-byte range distribution + int[] ranges = new int[4]; // 0x80-BF, 0xC0-DF, 0xE0-EF, 0xF0-FF + for (int v = 0x80; v < 0x100; v++) { + int bucket; + if (v < 0xC0) { + bucket = 0; + } else if (v < 0xE0) { + bucket = 1; + } else if (v < 0xF0) { + bucket = 2; + } else { + bucket = 3; + } + ranges[bucket] += hist[v]; + } + int highTotal = ranges[0] + ranges[1] + ranges[2] + ranges[3]; + if (highTotal > 0) { + System.out.printf(Locale.ROOT, + " high ranges : 0x80-BF=%.1f%% 0xC0-DF=%.1f%% 0xE0-EF=%.1f%% 0xF0-FF=%.1f%%%n", + 100.0 * ranges[0] / highTotal, + 100.0 * ranges[1] / highTotal, + 100.0 * ranges[2] / highTotal, + 100.0 * ranges[3] / highTotal); + } + + // Top 10 most frequent high-byte values + Integer[] idx = new Integer[256]; + for (int i = 0; i < 256; i++) { + idx[i] = i; + } + Arrays.sort(idx, (a, b) -> Integer.compare(hist[b], hist[a])); + StringBuilder sb = new StringBuilder(" top high bytes: "); + int shown = 0; + for (int i : idx) { + if (shown >= 10 || hist[i] == 0) { + break; + } + if (i < 0x80) { + continue; + } + sb.append(String.format(Locale.ROOT, "0x%02X(%d) ", i, hist[i])); + shown++; + } + System.out.println(sb); + } + + private static final class BucketContrib { + final int bucket; + final int count; + final byte weight; + final float raw; + final float clipped; + + BucketContrib(int bucket, int count, byte weight, float raw, float clipped) { + this.bucket = bucket; + this.count = count; + this.weight = weight; + this.raw = raw; + this.clipped = clipped; + } + } + + private static final class BucketDiff { + final int bucket; + final int count; + final byte wA; + final byte wB; + final float rawA; + final float rawB; + final float diff; + + BucketDiff(int bucket, int count, byte wA, byte wB, float rawA, float rawB, float diff) { + this.bucket = bucket; + this.count = count; + this.wA = wA; + this.wB = wB; + this.rawA = rawA; + this.rawB = rawB; + this.diff = diff; + } + } +} diff --git a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java index 7a38d3bce9..1e7a7e5cdf 100644 --- a/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java +++ b/tika-ml/tika-ml-chardetect/src/main/java/org/apache/tika/ml/chardetect/tools/TrainCharsetModel.java @@ -82,6 +82,7 @@ public class TrainCharsetModel { boolean useAnchoredBigrams = false; boolean useStride2Bigrams = true; boolean useGlobalFeatures = false; + boolean useSplitSpaces = false; // --label-remap src1:dst1,src2:dst2 — merges multiple source labels into // one target label at training time (e.g. merge script variants into one class). Map<String, String> labelRemap = new HashMap<>(); @@ -147,6 +148,12 @@ public class TrainCharsetModel { case "--no-globals": useGlobalFeatures = false; break; + case "--split-spaces": + useSplitSpaces = true; + break; + case "--no-split-spaces": + useSplitSpaces = false; + break; case "--exclude": for (String label : args[++i].split(",")) { excludeLabels.add(label.trim()); @@ -172,6 +179,7 @@ public class TrainCharsetModel { System.err.println(" --anchored / --no-anchored anchored bigrams (default: off)"); System.err.println(" --stride2 / --no-stride2 stride-2 bigrams at even positions (default: on)"); System.err.println(" --globals / --no-globals emit global ASCII-density bin features (default: off)"); + System.err.println(" --split-spaces / --no-split-spaces give stride-1 and stride-2 features disjoint bucket ranges (default: off)"); System.err.println(" --exclude cs1,cs2 skip these charset labels (e.g. UTF-32-BE,UTF-32-LE)"); System.exit(1); } @@ -219,14 +227,14 @@ public class TrainCharsetModel { "Buckets: %d epochs: %d lr: %.4f max-samples/class: %d%n", numBuckets, epochs, lr, maxSamplesPerClass); System.out.printf(java.util.Locale.ROOT, - "Features: uni=%b bi=%b tri=%b anchored=%b stride2=%b globals=%b%n", + "Features: uni=%b bi=%b tri=%b anchored=%b stride2=%b globals=%b split=%b%n", useUnigrams, useBigrams, useTrigrams, useAnchoredBigrams, useStride2Bigrams, - useGlobalFeatures); + useGlobalFeatures, useSplitSpaces); ConfigurableByteNgramFeatureExtractor extractor = new ConfigurableByteNgramFeatureExtractor(numBuckets, useUnigrams, useBigrams, useTrigrams, useAnchoredBigrams, - useStride2Bigrams, useGlobalFeatures); + useStride2Bigrams, useGlobalFeatures, useSplitSpaces); // Build class index map Map<String, Integer> labelIndex = new HashMap<>(); diff --git a/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java b/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java index c40ef78075..3958d86d81 100644 --- a/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java +++ b/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/ConfigurableGlobalFeatureTest.java @@ -157,4 +157,77 @@ public class ConfigurableGlobalFeatureTest { "bucket " + i + " differs between dense and sparse paths"); } } + + // --- split-space layout --- + + private static final int SPLIT_NUM_BUCKETS = 32768 + ConfigurableByteNgramFeatureExtractor.GLOBAL_FEATURE_COUNT; + + private static ConfigurableByteNgramFeatureExtractor withSplitAndGlobals() { + return new ConfigurableByteNgramFeatureExtractor( + SPLIT_NUM_BUCKETS, true, true, false, false, true, true, true); + } + + @Test + public void splitSpacesStride1FiresOnlyLowRegion() { + ConfigurableByteNgramFeatureExtractor ext = withSplitAndGlobals(); + int[] dense = new int[SPLIT_NUM_BUCKETS]; + int[] touched = new int[SPLIT_NUM_BUCKETS]; + // High bytes only — fires stride-1 unigrams + bigrams + stride-2 pairs + byte[] probe = new byte[]{(byte) 0xE4, (byte) 0xF6, (byte) 0xFC}; + int n = ext.extractSparseInto(probe, dense, touched); + + // stride-1 firings must be in [0, 16384), stride-2 in [16384, 32768), + // globals in [32768, 32774). + int stride1Count = 0; + int stride2Count = 0; + int globalCount = 0; + for (int i = 0; i < n; i++) { + int bkt = touched[i]; + if (bkt < 16384) { + stride1Count++; + } else if (bkt < 32768) { + stride2Count++; + } else { + globalCount++; + } + } + assertTrue(stride1Count > 0, "expected stride-1 firings in low region"); + assertTrue(stride2Count > 0, "expected stride-2 firings in high region"); + assertEquals(1, globalCount, "exactly one global bin fires"); + } + + @Test + public void splitSpacesAsciiProbeFiresOnlyStride2AndGlobals() { + ConfigurableByteNgramFeatureExtractor ext = withSplitAndGlobals(); + int[] dense = new int[SPLIT_NUM_BUCKETS]; + int[] touched = new int[SPLIT_NUM_BUCKETS]; + // Pure ASCII — no stride-1 firings (no high bytes), all firings are + // stride-2 (HTML markup-shaped pairs) + the globals bin. + byte[] probe = "Hello, world! This is ASCII only.\r\n" + .getBytes(StandardCharsets.US_ASCII); + int n = ext.extractSparseInto(probe, dense, touched); + + for (int i = 0; i < n; i++) { + int bkt = touched[i]; + assertTrue(bkt >= 16384, + "ASCII probe must NOT fire any stride-1 slot, got bkt=" + bkt); + } + } + + @Test + public void splitSpacesDenseSparseAgree() { + ConfigurableByteNgramFeatureExtractor ext = withSplitAndGlobals(); + byte[] probe = "r\u00E9sum\u00E9 caf\u00E9" + .getBytes(StandardCharsets.ISO_8859_1); + + int[] dense = ext.extract(probe); + int[] sparseDense = new int[SPLIT_NUM_BUCKETS]; + int[] touched = new int[SPLIT_NUM_BUCKETS]; + ext.extractSparseInto(probe, sparseDense, touched); + + for (int i = 0; i < SPLIT_NUM_BUCKETS; i++) { + assertEquals(dense[i], sparseDense[i], + "bucket " + i + " differs between dense and sparse paths (split layout)"); + } + } } diff --git a/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/FeatureExtractorParityTest.java b/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/FeatureExtractorParityTest.java index d2de48f423..900a5dbb5c 100644 --- a/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/FeatureExtractorParityTest.java +++ b/tika-ml/tika-ml-chardetect/src/test/java/org/apache/tika/ml/chardetect/FeatureExtractorParityTest.java @@ -254,4 +254,101 @@ public class FeatureExtractorParityTest { assertArrayEquals(dense, sparseDense, "ByteNgramFeatureExtractor: extract() vs extractSparseInto() differ"); } + + // ===================================================================== + // Parity in the split-spaces + globals layout (next-generation model). + // ===================================================================== + + private static final int SPLIT_NUM_BUCKETS = + 32768 + ByteNgramFeatureExtractor.GLOBAL_FEATURE_COUNT; + + private final ByteNgramFeatureExtractor productionSplit = + new ByteNgramFeatureExtractor(SPLIT_NUM_BUCKETS, true, true); + + private final ConfigurableByteNgramFeatureExtractor configurableSplit = + new ConfigurableByteNgramFeatureExtractor( + SPLIT_NUM_BUCKETS, + true, // unigrams + true, // bigrams + false, // trigrams OFF + false, // anchored OFF + true, // stride2 ON + true, // globals ON + true); // split spaces ON + + private void assertSplitParity(byte[] probe) { + int[] prodFeatures = productionSplit.extract(probe); + int[] confFeatures = configurableSplit.extract(probe); + assertEquals(prodFeatures.length, confFeatures.length, + "split-layout feature vector lengths differ"); + for (int i = 0; i < prodFeatures.length; i++) { + if (prodFeatures[i] != confFeatures[i]) { + org.junit.jupiter.api.Assertions.fail(String.format( + "split-layout bucket %d: production=%d, configurable=%d", + i, prodFeatures[i], confFeatures[i])); + } + } + } + + @Test + public void splitParityOnPureAscii() { + assertSplitParity("Hello, world! This is ASCII text.\r\n" + .getBytes(StandardCharsets.US_ASCII)); + } + + @Test + public void splitParityOnHighByteContent() { + assertSplitParity(new byte[]{ + (byte) 0x72, (byte) 0xE9, (byte) 0x73, (byte) 0x75, + (byte) 0x6D, (byte) 0xE9, (byte) 0x20, + (byte) 0x63, (byte) 0x61, (byte) 0x66, (byte) 0xE9 + }); + } + + @Test + public void splitParityOnRealUtf16Le() { + assertSplitParity("日本語テスト".getBytes(StandardCharsets.UTF_16LE)); + } + + @Test + public void splitParityOnArabicLike() { + // Synthesized Arabic-style byte pattern: 0xC7/0xE1/0xE3 alef/lam/meem + byte[] probe = new byte[]{ + (byte) 0xC7, (byte) 0xE1, (byte) 0xE3, 0x20, + (byte) 0xD9, (byte) 0xED, (byte) 0xC7, (byte) 0xE1, + (byte) 0xCA, (byte) 0xD1, 0x0D, 0x0A + }; + assertSplitParity(probe); + } + + @Test + public void splitParityOnLongMixedProbe() { + byte[] probe = new byte[4096]; + for (int i = 0; i < probe.length; i++) { + probe[i] = (byte) ((i % 3 == 0) ? (0x80 + (i % 128)) : (0x20 + (i % 96))); + } + assertSplitParity(probe); + } + + @Test + public void splitLayoutProductionDenseMatchesSparse() { + byte[] probe = "日本語テスト résumé".getBytes(StandardCharsets.UTF_16LE); + int[] dense = productionSplit.extract(probe); + int[] sparseDense = new int[SPLIT_NUM_BUCKETS]; + int[] touched = new int[SPLIT_NUM_BUCKETS]; + productionSplit.extractSparseInto(probe, sparseDense, touched); + assertArrayEquals(dense, sparseDense, + "split layout: production extract() vs extractSparseInto() differ"); + } + + @Test + public void splitLayoutConfigurableDenseMatchesSparse() { + byte[] probe = "日本語テスト résumé".getBytes(StandardCharsets.UTF_16LE); + int[] dense = configurableSplit.extract(probe); + int[] sparseDense = new int[SPLIT_NUM_BUCKETS]; + int[] touched = new int[SPLIT_NUM_BUCKETS]; + configurableSplit.extractSparseInto(probe, sparseDense, touched); + assertArrayEquals(dense, sparseDense, + "split layout: configurable extract() vs extractSparseInto() differ"); + } }
