This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4690-add-generative-models in repository https://gitbox.apache.org/repos/asf/tika.git
commit 975db37821b4701669a747ef6caf5a453f4dd5e8 Author: tballison <[email protected]> AuthorDate: Tue Mar 10 14:38:29 2026 -0400 wip initial commit --- .../ROOT/pages/advanced/charset-detection-eval.txt | 260 ++++++++ .../chardetect/CharsetDetectionRegressionTest.java | 162 +++++ .../charsoup/GenerativeLanguageModel.java | 708 +++++++++++++++++++++ .../charsoup/ScriptAwareFeatureExtractor.java | 2 +- .../charsoup/tools/CorpusDiversityAnalyzer.java | 274 ++++++++ .../charsoup/tools/CorpusFilterReport.java | 253 ++++++++ .../charsoup/tools/EvalGenerativeModel.java | 365 +++++++++++ .../langdetect/charsoup/tools/FilterBenchmark.java | 228 +++++++ .../charsoup/tools/LengthCalibrationReport.java | 171 +++++ .../tools/TrainGenerativeLanguageModel.java | 407 ++++++++++++ .../tika/config/TikaEncodingDetectorTest.java | 30 + 11 files changed, 2859 insertions(+), 1 deletion(-) diff --git a/docs/modules/ROOT/pages/advanced/charset-detection-eval.txt b/docs/modules/ROOT/pages/advanced/charset-detection-eval.txt new file mode 100644 index 0000000000..2a71a78a28 --- /dev/null +++ b/docs/modules/ROOT/pages/advanced/charset-detection-eval.txt @@ -0,0 +1,260 @@ + +=== Probe length: 20B === + N | --- ML ablation --------------------------------- | --- Baselines ----------------------- | +Charset | Stat R% S% | +ISO R% S% | +CJK R% S% | All R% S% | ICU4J R% S% | juniv R% S% | +---------------------------------------------------------------------------------------------------------------------------------- +Big5-HKSCS 5000 | 98.0 98.0 | 97.9 97.9 | 97.9 97.9 | 97.9 97.9 | 0.0 9.8 | 0.0 37.9 | +EUC-JP 5000 | 54.5 54.5 | 54.6 54.6 | 54.7 54.7 | 54.7 54.7 | 0.0 0.0 | 69.7 69.7 | +EUC-KR 5000 | 66.4 66.4 | 67.6 67.6 | 67.6 67.6 | 67.6 67.6 | 0.0 0.0 | 78.3 78.3 | +GB18030 5000 | 29.8 29.8 | 30.7 30.7 | 30.8 30.8 | 30.8 30.8 | 0.1 0.1 | 40.6 40.6 | +IBM1047 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 86.8 | 0.0 0.0 | +IBM420-ltr 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 
80.9 | 0.0 0.0 | +IBM420-rtl 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 84.2 | 0.0 0.0 | +IBM424-ltr 6195 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 69.1 | 0.0 0.0 | +IBM424-rtl 4717 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 69.6 | 0.0 0.0 | +IBM437 7516 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +IBM500 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 87.7 87.7 | 0.0 0.0 | +IBM850 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +IBM852 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +IBM855 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 90.7 90.7 | +IBM866 8442 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 43.0 43.0 | 93.2 93.2 | +ISO-2022-CN 5000 | 0.0 0.0 | 94.6 94.6 | 94.6 94.6 | 94.6 94.6 | 87.7 87.7 | 0.0 0.0 | +ISO-2022-JP 5000 | 0.0 0.0 | 94.0 94.0 | 94.0 94.0 | 94.0 94.0 | 77.9 77.9 | 93.9 93.9 | +ISO-2022-KR 5000 | 0.0 0.0 | 94.4 94.4 | 94.4 94.4 | 94.4 94.4 | 92.6 92.6 | 94.4 94.4 | +ISO-8859-16 10000 | 38.3 38.3 | 31.6 31.6 | 31.7 31.7 | 31.7 31.7 | 0.0 0.0 | 0.0 0.0 | +ISO-8859-3 5195 | 10.0 10.0 | 10.3 10.3 | 10.5 10.5 | 10.5 10.5 | 0.0 0.0 | 0.0 0.0 | +KOI8-R 8411 | 57.0 66.6 | 57.0 66.6 | 57.1 66.7 | 57.1 66.7 | 59.7 59.7 | 94.6 94.6 | +KOI8-U 5921 | 71.2 79.3 | 71.3 79.5 | 71.3 79.6 | 71.3 79.6 | 0.0 56.6 | 0.0 95.4 | +Shift_JIS 5000 | 84.3 84.3 | 86.6 86.6 | 86.6 86.6 | 86.6 86.6 | 0.0 0.0 | 71.6 71.6 | +US-ASCII 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 2.3 | 0.0 0.0 | +UTF-16-BE 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 68.7 68.7 | 0.0 0.0 | +UTF-16-LE 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 69.4 69.4 | 0.0 0.0 | +UTF-32-BE 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 100.0 100.0 | 0.0 0.0 | +UTF-32-LE 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 100.0 100.0 | 0.0 0.0 | +UTF-8 5000 | 61.4 61.4 | 61.1 61.1 | 61.5 61.5 | 61.5 61.5 | 77.6 77.6 | 78.2 78.2 | +windows-1250 10000 | 4.7 4.7 | 4.8 4.8 | 4.9 4.9 | 4.9 4.9 | 9.3 48.3 | 0.0 0.0 | 
+windows-1251 10000 | 82.4 82.4 | 82.5 82.5 | 82.6 82.6 | 82.6 82.6 | 56.1 56.4 | 71.4 71.5 | +windows-1252 10000 | 2.6 2.6 | 50.0 50.0 | 50.1 50.1 | 50.1 50.1 | 4.5 70.0 | 0.0 98.7 | +windows-1253 10000 | 55.2 55.2 | 55.4 55.4 | 55.5 55.5 | 55.5 55.5 | 1.9 63.3 | 0.1 80.8 | +windows-1254 10000 | 35.8 35.8 | 35.8 35.8 | 35.9 35.9 | 35.9 35.9 | 4.9 52.1 | 0.0 0.0 | +windows-1255 10000 | 86.7 86.7 | 86.7 86.7 | 86.7 86.7 | 86.7 86.7 | 3.5 27.2 | 93.9 96.2 | +windows-1256 10000 | 94.1 94.1 | 94.2 94.2 | 94.2 94.2 | 94.2 94.2 | 70.2 78.6 | 0.0 0.0 | +windows-1257 10000 | 18.8 18.8 | 17.0 17.0 | 17.1 17.1 | 17.1 17.1 | 0.0 0.0 | 0.0 0.0 | +windows-1258 10000 | 63.0 63.0 | 63.2 63.2 | 63.3 63.3 | 63.3 63.3 | 0.0 0.0 | 0.0 0.0 | +windows-874 10000 | 79.3 79.3 | 80.6 80.6 | 81.4 81.4 | 81.4 81.4 | 0.0 0.0 | 0.0 0.0 | +x-EUC-TW 5000 | 99.4 99.4 | 98.3 98.3 | 98.3 98.3 | 98.3 98.3 | 0.0 0.0 | 38.7 38.7 | +x-ISO-2022-CN-CNS 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +x-MacRoman 10000 | 6.1 6.1 | 6.2 6.2 | 6.3 6.3 | 6.3 6.3 | 0.0 0.0 | 0.0 0.0 | +x-mac-cyrillic 10000 | 68.8 68.8 | 69.2 69.2 | 69.3 69.3 | 69.3 69.3 | 0.0 0.0 | 49.3 49.3 | +------------------------------------------------------------------------------------------------------------------------ +OVERALL 326397 | 30.0 30.4 | 35.6 36.0 | 35.7 36.1 | 35.7 36.1 | 20.3 39.0 | 22.9 30.7 | + Stat=model only | +ISO=+C1-correction | +CJK=+grammar | All=ML+rules | R%=strict | S%=soft + µs/sample | 10.0 | 7.3 | 7.4 | 7.0 | 11.9 | 3.3 | + +=== Probe length: 50B === + N | --- ML ablation --------------------------------- | --- Baselines ----------------------- | +Charset | Stat R% S% | +ISO R% S% | +CJK R% S% | All R% S% | ICU4J R% S% | juniv R% S% | +---------------------------------------------------------------------------------------------------------------------------------- +Big5-HKSCS 5000 | 99.4 99.4 | 99.3 99.3 | 99.3 99.3 | 99.3 99.3 | 0.0 95.9 | 0.0 69.1 | +EUC-JP 5000 | 88.9 88.9 | 89.0 89.0 
| 89.0 89.0 | 89.0 89.0 | 86.7 86.7 | 90.2 90.2 | +EUC-KR 5000 | 92.7 92.7 | 93.3 93.3 | 93.3 93.3 | 93.3 93.3 | 89.4 89.4 | 96.0 96.0 | +GB18030 5000 | 65.0 65.0 | 66.4 66.4 | 66.5 66.5 | 66.5 66.5 | 71.8 71.8 | 83.2 83.2 | +IBM1047 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 80.2 | 0.0 0.0 | +IBM420-ltr 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 93.1 | 0.0 0.0 | +IBM420-rtl 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 94.7 | 0.0 0.0 | +IBM424-ltr 6195 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 86.1 | 0.0 0.0 | +IBM424-rtl 4717 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 87.6 | 0.0 0.0 | +IBM437 7516 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +IBM500 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 80.2 80.2 | 0.0 0.0 | +IBM850 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +IBM852 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +IBM855 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 97.0 97.0 | +IBM866 8442 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 62.0 62.0 | 98.1 98.1 | +ISO-2022-CN 5000 | 0.0 0.0 | 98.6 98.6 | 98.6 98.6 | 98.6 98.6 | 98.5 98.5 | 0.0 0.0 | +ISO-2022-JP 5000 | 0.0 0.0 | 97.4 97.4 | 97.4 97.4 | 97.4 97.4 | 93.9 93.9 | 97.3 97.3 | +ISO-2022-KR 5000 | 0.0 0.0 | 98.9 98.9 | 98.9 98.9 | 98.9 98.9 | 98.8 98.8 | 98.9 98.9 | +ISO-8859-16 10000 | 89.3 89.3 | 80.5 80.5 | 80.7 80.7 | 80.7 80.7 | 0.0 0.0 | 0.0 0.0 | +ISO-8859-3 5195 | 50.8 50.8 | 49.8 49.8 | 49.9 49.9 | 49.9 49.9 | 0.0 0.0 | 0.0 0.0 | +KOI8-R 8411 | 86.2 93.6 | 86.2 93.7 | 86.2 93.7 | 86.2 93.7 | 77.6 77.6 | 98.6 98.6 | +KOI8-U 5921 | 92.7 97.4 | 92.7 97.4 | 92.7 97.5 | 92.7 97.5 | 0.0 74.8 | 0.0 98.2 | +Shift_JIS 5000 | 95.3 95.3 | 96.1 96.1 | 96.1 96.1 | 96.1 96.1 | 88.5 88.5 | 94.5 94.5 | +US-ASCII 5000 | 0.0 0.5 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.3 | 0.0 0.0 | +UTF-16-BE 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 85.7 85.7 | 0.0 0.0 | +UTF-16-LE 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 
0.0 0.0 | 87.6 87.6 | 0.0 0.0 | +UTF-32-BE 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 100.0 100.0 | 0.0 0.0 | +UTF-32-LE 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 100.0 100.0 | 0.0 0.0 | +UTF-8 5000 | 84.4 84.4 | 88.3 88.3 | 88.5 88.5 | 88.5 88.5 | 92.4 92.4 | 93.3 93.3 | +windows-1250 10000 | 31.3 31.3 | 32.3 32.3 | 32.9 32.9 | 32.9 32.9 | 22.3 56.9 | 0.0 0.0 | +windows-1251 10000 | 95.8 95.8 | 95.8 95.8 | 95.9 95.9 | 95.9 95.9 | 76.0 76.1 | 80.9 80.9 | +windows-1252 10000 | 20.8 20.8 | 39.1 39.1 | 39.4 39.4 | 39.4 39.4 | 12.8 86.6 | 0.0 99.1 | +windows-1253 10000 | 90.2 90.2 | 90.3 90.3 | 90.3 90.3 | 90.3 90.3 | 8.2 86.8 | 0.1 91.5 | +windows-1254 10000 | 80.1 80.1 | 80.0 80.0 | 80.0 80.0 | 80.0 80.0 | 12.3 73.1 | 0.0 0.0 | +windows-1255 10000 | 97.8 97.8 | 97.8 97.8 | 97.8 97.8 | 97.8 97.8 | 9.4 40.2 | 98.1 99.4 | +windows-1256 10000 | 99.1 99.1 | 99.2 99.2 | 99.2 99.2 | 99.2 99.2 | 88.1 91.5 | 0.0 0.0 | +windows-1257 10000 | 72.0 72.0 | 67.2 67.2 | 67.3 67.3 | 67.3 67.3 | 0.0 0.0 | 0.0 0.0 | +windows-1258 10000 | 96.1 96.1 | 96.2 96.2 | 96.3 96.3 | 96.3 96.3 | 0.0 0.0 | 0.0 0.0 | +windows-874 10000 | 95.3 95.3 | 95.8 95.8 | 96.0 96.0 | 96.0 96.0 | 0.0 0.0 | 0.0 0.0 | +x-EUC-TW 5000 | 99.5 99.5 | 99.3 99.3 | 99.3 99.3 | 99.3 99.3 | 0.0 0.0 | 67.1 67.1 | +x-ISO-2022-CN-CNS 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +x-MacRoman 10000 | 58.5 58.5 | 56.4 56.4 | 56.6 56.6 | 56.6 56.6 | 0.0 0.0 | 0.0 0.0 | +x-mac-cyrillic 10000 | 81.0 81.0 | 81.2 81.2 | 81.2 81.2 | 81.2 81.2 | 0.0 0.0 | 62.3 62.3 | +------------------------------------------------------------------------------------------------------------------------ +OVERALL 326397 | 45.2 45.4 | 49.9 50.2 | 49.9 50.2 | 49.9 50.2 | 29.8 52.4 | 26.5 35.2 | + Stat=model only | +ISO=+C1-correction | +CJK=+grammar | All=ML+rules | R%=strict | S%=soft + µs/sample | 10.7 | 8.2 | 8.1 | 8.1 | 21.6 | 4.1 | + +=== Probe length: 100B === + N | --- ML ablation --------------------------------- | 
--- Baselines ----------------------- | +Charset | Stat R% S% | +ISO R% S% | +CJK R% S% | All R% S% | ICU4J R% S% | juniv R% S% | +---------------------------------------------------------------------------------------------------------------------------------- +Big5-HKSCS 5000 | 99.7 99.7 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 0.0 97.6 | 0.0 69.7 | +EUC-JP 5000 | 95.8 95.8 | 95.8 95.8 | 95.8 95.8 | 95.8 95.8 | 97.0 97.0 | 95.7 95.7 | +EUC-KR 5000 | 95.3 95.3 | 95.4 95.4 | 95.4 95.4 | 95.4 95.4 | 98.4 98.4 | 99.1 99.1 | +GB18030 5000 | 85.1 85.1 | 85.9 85.9 | 85.9 85.9 | 85.9 85.9 | 95.9 95.9 | 97.9 97.9 | +IBM1047 10000 | 49.5 94.7 | 49.6 94.9 | 49.6 94.9 | 49.6 94.9 | 0.0 83.2 | 0.0 0.0 | +IBM420-ltr 10000 | 92.6 95.0 | 92.6 95.0 | 92.6 95.0 | 92.6 95.0 | 0.0 95.0 | 0.0 0.0 | +IBM420-rtl 10000 | 94.1 94.6 | 94.1 94.6 | 94.1 94.6 | 94.1 94.6 | 0.0 96.3 | 0.0 0.0 | +IBM424-ltr 6195 | 91.6 91.8 | 74.8 75.0 | 74.8 75.0 | 74.8 75.0 | 0.0 92.4 | 0.0 0.0 | +IBM424-rtl 4717 | 93.0 95.1 | 74.2 75.5 | 74.2 75.5 | 74.2 75.5 | 0.0 89.4 | 0.0 0.0 | +IBM437 7516 | 0.0 79.0 | 0.0 77.8 | 0.0 77.8 | 0.0 77.8 | 0.0 0.0 | 0.0 0.0 | +IBM500 10000 | 56.0 94.9 | 56.1 95.1 | 56.1 95.1 | 56.1 95.1 | 83.5 83.5 | 0.0 0.0 | +IBM850 10000 | 79.8 79.8 | 79.5 79.5 | 79.5 79.5 | 79.5 79.5 | 0.0 0.0 | 0.0 0.0 | +IBM852 10000 | 77.4 77.4 | 78.1 78.1 | 78.1 78.1 | 78.1 78.1 | 0.0 0.0 | 0.0 0.0 | +IBM855 10000 | 94.6 94.6 | 94.7 94.7 | 94.7 94.7 | 94.7 94.7 | 0.0 0.0 | 99.0 99.0 | +IBM866 8442 | 95.9 95.9 | 96.0 96.0 | 96.0 96.0 | 96.0 96.0 | 78.8 78.8 | 99.3 99.3 | +ISO-2022-CN 5000 | 0.0 0.0 | 99.1 99.1 | 99.1 99.1 | 99.1 99.1 | 99.0 99.0 | 0.0 0.0 | +ISO-2022-JP 5000 | 0.0 0.0 | 99.1 99.1 | 99.1 99.1 | 99.1 99.1 | 98.8 98.8 | 99.1 99.1 | +ISO-2022-KR 5000 | 0.0 0.0 | 99.6 99.6 | 99.6 99.6 | 99.6 99.6 | 99.6 99.6 | 99.6 99.6 | +ISO-8859-16 10000 | 92.0 92.0 | 89.1 89.1 | 89.1 89.1 | 89.1 89.1 | 0.0 0.0 | 0.0 0.0 | +ISO-8859-3 5195 | 78.4 78.4 | 78.5 78.5 | 78.5 78.5 | 78.5 78.5 | 0.0 0.0 | 0.0 
0.0 | +KOI8-R 8411 | 95.7 99.0 | 95.7 99.0 | 95.7 99.0 | 95.7 99.0 | 90.5 90.5 | 99.4 99.4 | +KOI8-U 5921 | 96.9 99.5 | 96.9 99.5 | 96.9 99.5 | 96.9 99.5 | 0.0 85.4 | 0.0 98.9 | +Shift_JIS 5000 | 97.7 97.7 | 98.0 98.0 | 98.0 98.0 | 98.0 98.0 | 97.8 97.8 | 98.7 98.7 | +US-ASCII 5000 | 0.0 0.8 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +UTF-16-BE 5000 | 94.0 94.1 | 93.9 93.9 | 93.9 93.9 | 93.9 93.9 | 85.5 85.5 | 0.0 0.0 | +UTF-16-LE 5000 | 94.4 94.4 | 94.4 94.4 | 94.4 94.4 | 94.4 94.4 | 87.0 87.0 | 0.0 0.0 | +UTF-32-BE 5000 | 94.9 94.9 | 94.7 94.7 | 94.7 94.7 | 94.7 94.7 | 100.0 100.0 | 0.0 0.0 | +UTF-32-LE 5000 | 95.5 95.5 | 95.5 95.5 | 95.5 95.5 | 95.5 95.5 | 100.0 100.0 | 0.0 0.0 | +UTF-8 5000 | 90.6 90.6 | 93.2 93.2 | 93.2 93.2 | 93.2 93.2 | 97.3 97.3 | 97.2 97.2 | +windows-1250 10000 | 55.3 55.3 | 56.4 56.4 | 56.5 56.5 | 56.5 56.5 | 34.4 62.5 | 0.0 0.0 | +windows-1251 10000 | 98.8 98.8 | 98.8 98.8 | 98.8 98.8 | 98.8 98.8 | 85.7 85.7 | 86.2 86.2 | +windows-1252 10000 | 34.4 34.4 | 40.0 40.0 | 40.4 40.4 | 40.4 40.4 | 21.3 91.4 | 0.0 99.4 | +windows-1253 10000 | 97.1 97.1 | 97.1 97.1 | 97.1 97.1 | 97.1 97.1 | 15.4 93.7 | 0.2 95.3 | +windows-1254 10000 | 93.1 93.1 | 93.0 93.0 | 93.0 93.0 | 93.0 93.0 | 21.2 83.8 | 0.0 0.0 | +windows-1255 10000 | 99.0 99.0 | 99.0 99.0 | 99.0 99.0 | 99.0 99.0 | 14.6 46.8 | 99.0 99.9 | +windows-1256 10000 | 99.6 99.6 | 99.6 99.6 | 99.6 99.6 | 99.6 99.6 | 94.6 96.1 | 0.0 0.0 | +windows-1257 10000 | 87.1 87.1 | 85.7 85.7 | 85.7 85.7 | 85.7 85.7 | 0.0 0.0 | 0.0 0.0 | +windows-1258 10000 | 97.0 97.0 | 97.0 97.0 | 97.0 97.0 | 97.0 97.0 | 0.0 0.0 | 0.0 0.0 | +windows-874 10000 | 97.7 97.7 | 97.7 97.7 | 97.7 97.7 | 97.7 97.7 | 0.0 0.0 | 0.0 0.0 | +x-EUC-TW 5000 | 99.7 99.7 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 0.0 0.0 | 67.7 67.7 | +x-ISO-2022-CN-CNS 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +x-MacRoman 10000 | 80.0 80.0 | 79.2 79.2 | 79.2 79.2 | 79.2 79.2 | 0.0 0.0 | 0.0 0.0 | +x-mac-cyrillic 10000 | 88.6 
88.6 | 88.7 88.7 | 88.7 88.7 | 88.7 88.7 | 0.0 0.0 | 70.1 70.1 | +------------------------------------------------------------------------------------------------------------------------ +OVERALL 326397 | 78.0 82.6 | 82.1 86.7 | 82.1 86.7 | 82.1 86.7 | 33.4 56.3 | 27.6 36.4 | + Stat=model only | +ISO=+C1-correction | +CJK=+grammar | All=ML+rules | R%=strict | S%=soft + µs/sample | 10.8 | 8.2 | 8.0 | 7.9 | 36.3 | 5.7 | + +=== Probe length: 200B === + N | --- ML ablation --------------------------------- | --- Baselines ----------------------- | +Charset | Stat R% S% | +ISO R% S% | +CJK R% S% | All R% S% | ICU4J R% S% | juniv R% S% | +---------------------------------------------------------------------------------------------------------------------------------- +Big5-HKSCS 5000 | 99.8 99.8 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 0.0 97.8 | 0.0 69.7 | +EUC-JP 5000 | 98.4 98.4 | 98.4 98.4 | 98.4 98.4 | 98.4 98.4 | 99.2 99.2 | 97.9 97.9 | +EUC-KR 5000 | 99.1 99.1 | 99.1 99.1 | 99.1 99.1 | 99.1 99.1 | 99.8 99.8 | 99.8 99.8 | +GB18030 5000 | 94.0 94.0 | 94.5 94.5 | 94.5 94.5 | 94.5 94.5 | 98.4 98.4 | 99.4 99.4 | +IBM1047 10000 | 52.7 95.7 | 52.8 95.8 | 52.8 95.8 | 52.8 95.8 | 0.0 86.8 | 0.0 0.0 | +IBM420-ltr 10000 | 95.3 96.1 | 95.3 96.1 | 95.3 96.1 | 95.3 96.1 | 0.0 96.8 | 0.0 0.0 | +IBM420-rtl 10000 | 95.3 95.4 | 95.3 95.4 | 95.3 95.4 | 95.3 95.4 | 0.0 97.3 | 0.0 0.0 | +IBM424-ltr 6195 | 94.9 94.9 | 87.8 87.8 | 87.8 87.8 | 87.8 87.8 | 0.0 93.4 | 0.0 0.0 | +IBM424-rtl 4717 | 95.6 96.0 | 86.7 87.2 | 86.7 87.2 | 86.7 87.2 | 0.0 87.3 | 0.0 0.0 | +IBM437 7516 | 0.0 92.1 | 0.0 92.0 | 0.0 92.0 | 0.0 92.0 | 0.0 0.0 | 0.0 0.0 | +IBM500 10000 | 56.0 96.0 | 56.0 96.0 | 56.0 96.0 | 56.0 96.0 | 86.8 86.8 | 0.0 0.0 | +IBM850 10000 | 92.2 92.2 | 92.3 92.3 | 92.3 92.3 | 92.3 92.3 | 0.0 0.0 | 0.0 0.0 | +IBM852 10000 | 90.0 90.0 | 90.7 90.7 | 90.7 90.7 | 90.7 90.7 | 0.0 0.0 | 0.0 0.0 | +IBM855 10000 | 95.7 95.7 | 95.7 95.7 | 95.7 95.7 | 95.7 95.7 | 0.0 0.0 | 99.8 99.8 | +IBM866 8442 | 
96.2 96.2 | 96.2 96.2 | 96.2 96.2 | 96.2 96.2 | 89.5 89.5 | 99.7 99.7 | +ISO-2022-CN 5000 | 0.0 0.0 | 99.5 99.5 | 99.5 99.5 | 99.5 99.5 | 99.4 99.4 | 0.0 0.0 | +ISO-2022-JP 5000 | 0.0 0.0 | 99.6 99.6 | 99.6 99.6 | 99.6 99.6 | 99.4 99.4 | 99.6 99.6 | +ISO-2022-KR 5000 | 0.0 0.0 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | +ISO-8859-16 10000 | 95.3 95.3 | 94.9 94.9 | 94.9 94.9 | 94.9 94.9 | 0.0 0.0 | 0.0 0.0 | +ISO-8859-3 5195 | 93.9 93.9 | 94.3 94.3 | 94.3 94.3 | 94.3 94.3 | 0.0 0.0 | 0.0 0.0 | +KOI8-R 8411 | 98.6 99.8 | 98.6 99.8 | 98.6 99.8 | 98.6 99.8 | 95.8 95.8 | 99.7 99.7 | +KOI8-U 5921 | 98.9 99.9 | 98.9 99.9 | 98.9 99.9 | 98.9 99.9 | 0.0 92.5 | 0.0 99.3 | +Shift_JIS 5000 | 99.3 99.3 | 99.4 99.4 | 99.4 99.4 | 99.4 99.4 | 99.7 99.7 | 99.7 99.7 | +US-ASCII 5000 | 0.0 0.5 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +UTF-16-BE 5000 | 94.5 94.5 | 94.4 94.4 | 94.4 94.4 | 94.4 94.4 | 84.9 84.9 | 0.0 0.0 | +UTF-16-LE 5000 | 94.4 94.4 | 94.4 94.4 | 94.4 94.4 | 94.4 94.4 | 86.8 86.8 | 0.0 0.0 | +UTF-32-BE 5000 | 95.2 95.2 | 95.0 95.0 | 95.0 95.0 | 95.0 95.0 | 100.0 100.0 | 0.0 0.0 | +UTF-32-LE 5000 | 95.5 95.5 | 95.5 95.5 | 95.5 95.5 | 95.5 95.5 | 100.0 100.0 | 0.0 0.0 | +UTF-8 5000 | 96.1 96.1 | 96.7 96.7 | 96.7 96.7 | 96.7 96.7 | 99.5 99.5 | 98.1 98.1 | +windows-1250 10000 | 73.6 73.6 | 74.3 74.3 | 74.4 74.4 | 74.4 74.4 | 49.3 70.3 | 0.0 0.0 | +windows-1251 10000 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 92.6 92.6 | 89.1 89.1 | +windows-1252 10000 | 47.7 47.7 | 48.5 48.5 | 48.7 48.7 | 48.7 48.7 | 28.5 94.0 | 0.0 99.5 | +windows-1253 10000 | 99.4 99.4 | 99.4 99.4 | 99.4 99.4 | 99.4 99.4 | 22.0 97.2 | 0.3 97.6 | +windows-1254 10000 | 98.4 98.4 | 98.4 98.4 | 98.4 98.4 | 98.4 98.4 | 33.4 93.5 | 0.0 0.0 | +windows-1255 10000 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 20.2 57.9 | 99.7 99.9 | +windows-1256 10000 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 97.1 98.2 | 0.0 0.0 | +windows-1257 10000 | 95.1 95.1 | 95.1 95.1 | 95.1 
95.1 | 95.1 95.1 | 0.0 0.0 | 0.0 0.0 | +windows-1258 10000 | 99.0 99.0 | 99.0 99.0 | 99.0 99.0 | 99.0 99.0 | 0.0 0.0 | 0.0 0.0 | +windows-874 10000 | 99.4 99.4 | 99.4 99.4 | 99.4 99.4 | 99.4 99.4 | 0.0 0.0 | 0.0 0.0 | +x-EUC-TW 5000 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 0.0 0.0 | 67.7 67.7 | +x-ISO-2022-CN-CNS 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +x-MacRoman 10000 | 93.5 93.5 | 93.5 93.5 | 93.5 93.5 | 93.5 93.5 | 0.0 0.0 | 0.0 0.0 | +x-mac-cyrillic 10000 | 93.5 93.5 | 93.5 93.5 | 93.5 93.5 | 93.5 93.5 | 0.0 0.0 | 74.9 74.9 | +------------------------------------------------------------------------------------------------------------------------ +OVERALL 326397 | 82.1 86.9 | 86.5 91.2 | 86.5 91.3 | 86.5 91.3 | 35.8 58.6 | 28.0 36.9 | + Stat=model only | +ISO=+C1-correction | +CJK=+grammar | All=ML+rules | R%=strict | S%=soft + µs/sample | 12.4 | 9.2 | 8.7 | 8.9 | 60.7 | 8.2 | + +=== Probe length: full === + N | --- ML ablation --------------------------------- | --- Baselines ----------------------- | +Charset | Stat R% S% | +ISO R% S% | +CJK R% S% | All R% S% | ICU4J R% S% | juniv R% S% | +---------------------------------------------------------------------------------------------------------------------------------- +Big5-HKSCS 5000 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 0.0 97.8 | 0.0 69.7 | +EUC-JP 5000 | 99.6 99.6 | 99.6 99.6 | 99.6 99.6 | 99.6 99.6 | 99.9 99.9 | 99.0 99.0 | +EUC-KR 5000 | 99.7 99.7 | 99.7 99.7 | 99.7 99.7 | 99.7 99.7 | 100.0 100.0 | 99.9 99.9 | +GB18030 5000 | 97.1 97.1 | 97.5 97.5 | 97.5 97.5 | 97.5 97.5 | 99.4 99.4 | 99.7 99.7 | +IBM1047 10000 | 59.1 96.0 | 59.1 96.0 | 59.1 96.0 | 59.1 96.0 | 0.0 90.6 | 0.0 0.0 | +IBM420-ltr 10000 | 96.0 96.2 | 96.0 96.2 | 96.0 96.2 | 96.0 96.2 | 0.0 97.8 | 0.0 0.0 | +IBM420-rtl 10000 | 95.6 95.7 | 95.6 95.7 | 95.6 95.7 | 95.6 95.7 | 0.0 98.1 | 0.0 0.0 | +IBM424-ltr 6195 | 95.6 95.6 | 95.5 95.6 | 95.5 95.6 | 95.5 95.6 | 0.0 93.1 | 0.0 0.0 | +IBM424-rtl 
4717 | 96.2 96.3 | 96.2 96.3 | 96.2 96.3 | 96.2 96.3 | 0.0 87.3 | 0.0 0.0 | +IBM437 7516 | 0.0 95.9 | 0.0 96.0 | 0.0 96.0 | 0.0 96.0 | 0.0 0.0 | 0.0 0.0 | +IBM500 10000 | 52.6 96.3 | 52.6 96.3 | 52.6 96.3 | 52.6 96.3 | 90.6 90.6 | 0.0 0.0 | +IBM850 10000 | 95.8 95.8 | 95.9 95.9 | 95.9 95.9 | 95.9 95.9 | 0.0 0.0 | 0.0 0.0 | +IBM852 10000 | 95.2 95.2 | 95.5 95.5 | 95.5 95.5 | 95.5 95.5 | 0.0 0.0 | 0.0 0.0 | +IBM855 10000 | 95.9 95.9 | 95.9 95.9 | 95.9 95.9 | 95.9 95.9 | 0.0 0.0 | 99.9 99.9 | +IBM866 8442 | 96.2 96.2 | 96.2 96.2 | 96.2 96.2 | 96.2 96.2 | 95.8 95.8 | 99.8 99.8 | +ISO-2022-CN 5000 | 0.0 0.0 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 99.7 99.7 | 0.0 0.0 | +ISO-2022-JP 5000 | 0.0 0.0 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 99.7 99.7 | 99.8 99.8 | +ISO-2022-KR 5000 | 0.0 0.0 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | +ISO-8859-16 10000 | 98.5 98.5 | 98.5 98.5 | 98.5 98.5 | 98.5 98.5 | 0.0 0.0 | 0.0 0.0 | +ISO-8859-3 5195 | 98.4 98.4 | 98.7 98.7 | 98.7 98.7 | 98.7 98.7 | 0.0 0.0 | 0.0 0.0 | +KOI8-R 8411 | 99.4 99.8 | 99.4 99.8 | 99.4 99.8 | 99.4 99.8 | 98.0 98.0 | 99.8 99.8 | +KOI8-U 5921 | 99.5 99.9 | 99.5 99.9 | 99.5 99.9 | 99.5 99.9 | 0.0 96.5 | 0.0 99.8 | +Shift_JIS 5000 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 100.0 100.0 | 99.9 99.9 | +US-ASCII 5000 | 0.0 0.2 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +UTF-16-BE 5000 | 94.7 94.7 | 94.7 94.7 | 94.7 94.7 | 94.7 94.7 | 84.6 84.6 | 0.0 0.0 | +UTF-16-LE 5000 | 94.5 94.5 | 94.5 94.5 | 94.5 94.5 | 94.5 94.5 | 86.6 86.6 | 0.0 0.0 | +UTF-32-BE 5000 | 95.2 95.2 | 95.2 95.2 | 95.2 95.2 | 95.2 95.2 | 100.0 100.0 | 0.0 0.0 | +UTF-32-LE 5000 | 95.5 95.5 | 95.5 95.5 | 95.5 95.5 | 95.5 95.5 | 100.0 100.0 | 0.0 0.0 | +UTF-8 5000 | 98.8 98.8 | 98.0 98.0 | 98.1 98.1 | 98.1 98.1 | 99.9 99.9 | 97.7 97.7 | +windows-1250 10000 | 83.5 83.5 | 83.7 83.7 | 83.8 83.8 | 83.8 83.8 | 65.4 78.8 | 0.0 0.0 | +windows-1251 10000 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 96.4 96.4 | 90.7 90.7 | 
+windows-1252 10000 | 63.8 63.8 | 64.0 64.0 | 64.0 64.0 | 64.0 64.0 | 41.4 96.2 | 0.0 99.6 | +windows-1253 10000 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 33.1 98.9 | 0.4 99.0 | +windows-1254 10000 | 99.3 99.3 | 99.3 99.3 | 99.3 99.3 | 99.3 99.3 | 48.7 97.6 | 0.0 0.0 | +windows-1255 10000 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 30.4 71.6 | 99.9 100.0 | +windows-1256 10000 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 98.1 99.0 | 0.0 0.0 | +windows-1257 10000 | 98.5 98.5 | 98.6 98.6 | 98.6 98.6 | 98.6 98.6 | 0.0 0.0 | 0.0 0.0 | +windows-1258 10000 | 99.6 99.6 | 99.6 99.6 | 99.6 99.6 | 99.6 99.6 | 0.0 0.0 | 0.0 0.0 | +windows-874 10000 | 99.7 99.7 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 0.0 0.0 | 0.0 0.0 | +x-EUC-TW 5000 | 99.9 99.9 | 100.0 100.0 | 100.0 100.0 | 100.0 100.0 | 0.0 0.0 | 67.7 67.7 | +x-ISO-2022-CN-CNS 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +x-MacRoman 10000 | 98.5 98.5 | 98.6 98.6 | 98.7 98.7 | 98.7 98.7 | 0.0 0.0 | 0.0 0.0 | +x-mac-cyrillic 10000 | 96.9 96.9 | 96.9 96.9 | 96.9 96.9 | 96.9 96.9 | 0.0 0.0 | 75.0 75.0 | +------------------------------------------------------------------------------------------------------------------------ +OVERALL 326397 | 84.1 88.8 | 88.7 93.4 | 88.7 93.4 | 88.7 93.4 | 38.4 60.3 | 28.1 37.0 | + Stat=model only | +ISO=+C1-correction | +CJK=+grammar | All=ML+rules | R%=strict | S%=soft + µs/sample | 16.4 | 11.5 | 10.8 | 11.4 | 152.6 | 18.0 | diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/CharsetDetectionRegressionTest.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/CharsetDetectionRegressionTest.java new file mode 100644 index 0000000000..1eb5f2b815 --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/CharsetDetectionRegressionTest.java @@ -0,0 +1,162 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.chardetect; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; + +import java.nio.charset.Charset; +import java.util.List; + +import org.junit.jupiter.api.Test; + +import org.apache.tika.detect.DefaultEncodingDetector; +import org.apache.tika.detect.EncodingResult; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; + +/** + * Regression tests for charset detection edge-cases that surfaced during + * integration testing with the CharSoup language-aware detector. + * + * <ul> + * <li><b>ASCII-only HTML</b> (Solr integration test regression): simple + * {@code <html><body>…</body></html>} content written as UTF-8 was + * returned as {@code ISO-8859-1} by the old detector chain. + * The correct answer is {@code UTF-8}.</li> + * <li><b>Short plain-text English</b> (TXTParserTest regression): a short + * English paragraph whose bytes are all in the ASCII range was returned + * as {@code ISO-8859-1} and in some cases as {@code UTF-16}. 
+ * The ML-based chain must not return UTF-16 for ASCII-range input.</li> + * <li><b>Shift-JIS ZIP entry name</b>: 9 raw bytes encoding {@code 文章1.txt} + * in Shift-JIS must be detected as {@code Shift_JIS}, not Big5-HKSCS. + * The raw ML logits favour Big5-HKSCS; the CharSoup language signal must + * override the model ranking.</li> + * </ul> + */ +public class CharsetDetectionRegressionTest { + + // 文章1.txt in Shift-JIS (9 raw bytes from a real zip entry) + private static final byte[] SJIS_RAW = hexToBytes("95b68fcd312e747874"); + + // Pure-ASCII HTML without a meta charset declaration — mirrors what the + // Solr integration test wrote before the meta-tag workaround was added. + // The old detector returned ISO-8859-1 for this without any meta tag. + // The new detector required adding <meta charset="UTF-8"> to avoid + // returning an unexpected charset. + private static final byte[] ASCII_HTML_NO_META = + "<html><body>initial</body></html>".getBytes(UTF_8); + + // English plain text from TXTParserTest — all bytes in the ASCII range + private static final byte[] ENGLISH_TEXT = + ("Hello, World! This is simple UTF-8 text content written" + + " in English to test autodetection of both the character" + + " encoding and the language of the input stream.").getBytes(UTF_8); + + // ----------------------------------------------------------------------- + // Solr integration-test regression + // ----------------------------------------------------------------------- + + /** + * ASCII HTML <em>without</em> a meta charset declaration must not be + * returned as UTF-16. + * + * <p>The old detector returned {@code ISO-8859-1} here without requiring + * any meta tag. The new detector regressed: without a meta tag it started + * returning an unexpected charset, which caused the Solr integration test + * to fail. The workaround was to add {@code <meta charset="UTF-8">} to + * the generated HTML — but we should not need to do that. 
UTF-8, + * US-ASCII, and ISO-8859-1 are all acceptable; UTF-16 is not.</p> + */ + @Test + public void asciiHtmlWithoutMetaIsNotDetectedAsUtf16() throws Exception { + DefaultEncodingDetector detector = new DefaultEncodingDetector(); + try (TikaInputStream tis = TikaInputStream.get(ASCII_HTML_NO_META)) { + List<EncodingResult> results = + detector.detect(tis, new Metadata(), new ParseContext()); + assertFalse(results.isEmpty(), "detector returned no result for ASCII HTML"); + Charset top = results.get(0).getCharset(); + assertFalse(top.name().startsWith("UTF-16"), + "ASCII HTML without meta tag must not be detected as UTF-16, got: " + + top.name()); + } + } + + // ----------------------------------------------------------------------- + // TXTParser regression + // ----------------------------------------------------------------------- + + /** + * A plain-English paragraph whose bytes are all in the ASCII range must + * be returned as {@code windows-1252} — the HTML5/WHATWG default for + * unlabeled 8-bit Western content and the statistical fallback for + * pure-ASCII bytes in the ML-based detector chain. + */ + @Test + public void englishPlainTextIsDetectedAsWindows1252() throws Exception { + DefaultEncodingDetector detector = new DefaultEncodingDetector(); + try (TikaInputStream tis = TikaInputStream.get(ENGLISH_TEXT)) { + List<EncodingResult> results = + detector.detect(tis, new Metadata(), new ParseContext()); + assertFalse(results.isEmpty(), "detector returned no result for English text"); + Charset top = results.get(0).getCharset(); + assertEquals("windows-1252", top.name(), + "Pure-ASCII English text should be detected as windows-1252, got: " + + top.name()); + } + } + + // ----------------------------------------------------------------------- + // Shift-JIS ZIP entry name + // ----------------------------------------------------------------------- + + /** + * 9 raw bytes encoding {@code 文章1.txt} in Shift-JIS must be identified + * as {@code Shift_JIS}. 
+ * + * <p>The same bytes are structurally valid Big5-HKSCS and ranked higher by + * the raw ML logits. CharSoup must override the model ranking using the + * Japanese language signal. ZipParser feeds entry names as raw byte arrays + * to the encoding detector, so a wrong answer here means garbled filenames + * in Japanese zip archives.</p> + */ + @Test + public void sjisZipEntryNameIsDetectedAsShiftJis() throws Exception { + DefaultEncodingDetector detector = new DefaultEncodingDetector(); + try (TikaInputStream tis = TikaInputStream.get(SJIS_RAW)) { + List<EncodingResult> results = + detector.detect(tis, new Metadata(), new ParseContext()); + assertFalse(results.isEmpty(), + "detector returned no result for SJIS filename bytes"); + Charset top = results.get(0).getCharset(); + assertEquals("Shift_JIS", top.name(), + "SJIS zip entry bytes should be detected as Shift_JIS, got: " + top.name()); + } + } + + // ----------------------------------------------------------------------- + + private static byte[] hexToBytes(String hex) { + byte[] b = new byte[hex.length() / 2]; + for (int i = 0; i < b.length; i++) { + b[i] = (byte) Integer.parseInt(hex.substring(i * 2, i * 2 + 2), 16); + } + return b; + } +} diff --git a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java new file mode 100644 index 0000000000..33a6bb38fd --- /dev/null +++ b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java @@ -0,0 +1,708 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.langdetect.charsoup; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +/** + * Dense INT8 generative character n-gram model for languageness scoring. + * + * <p>Computes an approximate per-n-gram average log P(text | language). + * Higher scores indicate the decoded text is more consistent with the named + * language. The score is used to arbitrate between candidate charsets when + * statistical decoders disagree on script or language. + * + * <h3>Feature types</h3> + * <ul> + * <li><b>CJK languages</b> (Han, Hiragana, Katakana): character unigrams + * and bigrams extracted from CJK/kana codepoints.</li> + * <li><b>Non-CJK languages</b>: character unigrams, bigrams (with + * word-boundary sentinels), and trigrams (with sentinels).</li> + * </ul> + * + * <p>Log-probabilities are quantized to unsigned INT8 over the range + * [{@link #LOGP_MIN}, 0] and stored in dense byte arrays. 
+ * + * <h3>Binary format ({@code GLM1} v2)</h3> + * <pre> + * INT magic = 0x474C4D31 + * INT version = 2 + * INT numLangs + * INT cjkUnigramBuckets + * INT cjkBigramBuckets + * INT noncjkUnigramBuckets + * INT noncjkBigramBuckets + * INT noncjkTrigramBuckets + * For each language: + * SHORT codeLen + * BYTES langCode (UTF-8) + * BYTE isCjk (0|1) + * FLOAT scoreMean (μ of score distribution on training data) + * FLOAT scoreStdDev (σ of score distribution on training data) + * BYTES unigramTable [cjkUnigramBuckets | noncjkUnigramBuckets] + * BYTES bigramTable [cjkBigramBuckets | noncjkBigramBuckets] + * BYTES trigramTable [noncjkTrigramBuckets] (absent for CJK) + * </pre> + */ +public class GenerativeLanguageModel { + + // ---- Bucket counts ---- + + public static final int CJK_UNIGRAM_BUCKETS = 8_192; + public static final int CJK_BIGRAM_BUCKETS = 32_768; + public static final int NONCJK_UNIGRAM_BUCKETS = 8_192; + public static final int NONCJK_BIGRAM_BUCKETS = 8_192; + public static final int NONCJK_TRIGRAM_BUCKETS = 16_384; + + /** + * Quantization floor. Log-probabilities below this value are clamped + * before quantizing; values stored in the table never go lower. + */ + public static final float LOGP_MIN = -18.0f; + + private static final int MAGIC = 0x474C4D31; // "GLM1" + private static final int VERSION = 2; + + // ---- FNV-1a basis constants ---- + + /** + * Bigram basis shared with {@link ScriptAwareFeatureExtractor} so that + * identical text produces the same bucket indices for both models. + */ + static final int BIGRAM_BASIS = ScriptAwareFeatureExtractor.BIGRAM_BASIS; + + /** + * CJK unigram basis shared with {@link ScriptAwareFeatureExtractor}. + */ + static final int CJK_UNIGRAM_BASIS = ScriptAwareFeatureExtractor.UNIGRAM_BASIS; + + /** Distinct salt for non-CJK character unigrams (not in discriminative model). */ + static final int NONCJK_UNIGRAM_BASIS = 0x1a3f7c4e; + + /** Distinct salt for character trigrams (not in discriminative model). 
*/ + static final int TRIGRAM_BASIS = 0x7e3d9b21; + + /** Word-boundary sentinel codepoint, matching the discriminative model. */ + static final int SENTINEL = '_'; + + // ---- Model state ---- + + private final List<String> langIds; + private final Map<String, Integer> langIndex; + private final boolean[] isCjk; + private final byte[][] unigramTables; // [langIdx][bucket] + private final byte[][] bigramTables; // [langIdx][bucket] + private final byte[][] trigramTables; // [langIdx][bucket]; null entry for CJK langs + private final float[] scoreMeans; // μ per language (from training data) + private final float[] scoreStdDevs; // σ per language (from training data) + + private GenerativeLanguageModel( + List<String> langIds, + boolean[] isCjk, + byte[][] unigramTables, + byte[][] bigramTables, + byte[][] trigramTables, + float[] scoreMeans, + float[] scoreStdDevs) { + this.langIds = Collections.unmodifiableList(new ArrayList<>(langIds)); + this.isCjk = isCjk; + this.unigramTables = unigramTables; + this.bigramTables = bigramTables; + this.trigramTables = trigramTables; + this.scoreMeans = scoreMeans; + this.scoreStdDevs = scoreStdDevs; + Map<String, Integer> idx = new HashMap<>(langIds.size() * 2); + for (int i = 0; i < langIds.size(); i++) { + idx.put(langIds.get(i), i); + } + this.langIndex = Collections.unmodifiableMap(idx); + } + + // ---- Public API ---- + + public List<String> getLanguages() { + return langIds; + } + + public boolean isCjk(String language) { + Integer i = langIndex.get(language); + return i != null && isCjk[i]; + } + + /** + * Per-n-gram average log-probability of {@code text} under {@code language}. + * + * @return a value in [{@link #LOGP_MIN}, 0], or {@link Float#NaN} if the + * language is unknown or the text yields no scorable n-grams. 
+ */ + public float score(String text, String language) { + if (text == null || text.isEmpty()) { + return Float.NaN; + } + Integer li = langIndex.get(language); + if (li == null) { + return Float.NaN; + } + String preprocessed = CharSoupFeatureExtractor.preprocess(text); + if (preprocessed.isEmpty()) { + return Float.NaN; + } + + double[] sum = {0.0}; + int[] cnt = {0}; + + if (isCjk[li]) { + byte[] uniT = unigramTables[li]; + byte[] biT = bigramTables[li]; + extractCjkNgrams(preprocessed, + h -> { + sum[0] += dequantize(uniT[h % CJK_UNIGRAM_BUCKETS]); + cnt[0]++; + }, + h -> { + sum[0] += dequantize(biT[h % CJK_BIGRAM_BUCKETS]); + cnt[0]++; + }); + } else { + byte[] uniT = unigramTables[li]; + byte[] biT = bigramTables[li]; + byte[] triT = trigramTables[li]; + extractNonCjkNgrams(preprocessed, + h -> { + sum[0] += dequantize(uniT[h % NONCJK_UNIGRAM_BUCKETS]); + cnt[0]++; + }, + h -> { + sum[0] += dequantize(biT[h % NONCJK_BIGRAM_BUCKETS]); + cnt[0]++; + }, + h -> { + sum[0] += dequantize(triT[h % NONCJK_TRIGRAM_BUCKETS]); + cnt[0]++; + }); + } + + return cnt[0] == 0 ? Float.NaN : (float) (sum[0] / cnt[0]); + } + + /** + * Score {@code text} against all languages and return the best match. + * + * @return an entry {@code (languageCode, score)}, or {@code null} if no + * language yields a finite score. + */ + public Map.Entry<String, Float> bestMatch(String text) { + String best = null; + float bestScore = Float.NEGATIVE_INFINITY; + for (String lang : langIds) { + float s = score(text, lang); + if (!Float.isNaN(s) && s > bestScore) { + bestScore = s; + best = lang; + } + } + return best == null ? null : Map.entry(best, bestScore); + } + + /** + * Z-score of {@code text} under {@code language}: + * {@code (score(text, language) - μ) / σ}, where μ and σ were computed + * from the language's training corpus. + * + * <p>Appropriate when the input text is roughly the same length as + * training sentences. 
For short or variable-length text, prefer + * {@link #zScoreLengthAdjusted}. + * + * @return the z-score, or {@link Float#NaN} if the language is unknown, + * the text yields no scorable n-grams, or σ is zero/uncalibrated. + */ + public float zScore(String text, String language) { + Integer li = langIndex.get(language); + if (li == null || scoreStdDevs[li] <= 0.0f) { + return Float.NaN; + } + float s = score(text, language); + if (Float.isNaN(s)) { + return Float.NaN; + } + return (s - scoreMeans[li]) / scoreStdDevs[li]; + } + + /** + * Approximate character length of a typical training sentence. + * Used by {@link #zScoreLengthAdjusted} to inflate σ for short text. + * Empirically derived from the calibration data: score σ scales as + * roughly 1/√(charLen) and stabilises around this length. + */ + static final int CALIBRATION_CHAR_LENGTH = 120; + + /** Floor on text length to avoid extreme σ inflation. */ + static final int MIN_ADJUSTED_CHAR_LENGTH = 10; + + /** + * Length-adjusted z-score of {@code text} under {@code language}. + * + * <p>Score variance scales as approximately 1/√(textLength). The + * stored σ was calibrated on full training sentences (typically + * ~{@value #CALIBRATION_CHAR_LENGTH} characters). For shorter text + * this method inflates σ proportionally, preventing spurious low + * z-scores on short snippets. For text at or above the calibration + * length, the result equals {@link #zScore}. + * + * @return the adjusted z-score, or {@link Float#NaN} if the language + * is unknown, the text yields no scorable n-grams, or σ is + * zero/uncalibrated. 
+ */ + public float zScoreLengthAdjusted(String text, String language) { + Integer li = langIndex.get(language); + if (li == null || scoreStdDevs[li] <= 0.0f) { + return Float.NaN; + } + float s = score(text, language); + if (Float.isNaN(s)) { + return Float.NaN; + } + int textLen = text.length(); + float adjustment = (float) Math.sqrt( + (double) CALIBRATION_CHAR_LENGTH + / Math.max(textLen, MIN_ADJUSTED_CHAR_LENGTH)); + float adjustedSigma = scoreStdDevs[li] * Math.max(1.0f, adjustment); + return (s - scoreMeans[li]) / adjustedSigma; + } + + /** + * Set the calibration statistics for a language. Typically called by + * the training tool after a second pass over the training corpus. + */ + public void setStats(String language, float mean, float stdDev) { + Integer li = langIndex.get(language); + if (li == null) { + throw new IllegalArgumentException("Unknown language: " + language); + } + scoreMeans[li] = mean; + scoreStdDevs[li] = stdDev; + } + + // ---- N-gram extraction (shared by scoring and training) ---- + + /** + * Callback receiving a non-negative raw FNV hash for a single n-gram. + * The caller is responsible for reducing it modulo a table size. + */ + @FunctionalInterface + public interface HashConsumer { + void consume(int hash); + } + + /** + * Extract CJK character unigrams and bigrams from preprocessed text, + * delivering raw (positive) hashes to the supplied sinks. 
+ */ + public static void extractCjkNgrams( + String text, + HashConsumer unigramSink, + HashConsumer bigramSink) { + int prevCp = -1; + int i = 0; + int len = text.length(); + while (i < len) { + int cp = text.codePointAt(i); + i += Character.charCount(cp); + if (!Character.isLetter(cp)) { + prevCp = -1; + continue; + } + int lower = Character.toLowerCase(cp); + if (!ScriptAwareFeatureExtractor.isCjkOrKana(lower)) { + prevCp = -1; + continue; + } + int script = ScriptCategory.of(lower); + unigramSink.consume(cjkUnigramHash(script, lower)); + if (prevCp >= 0) { + bigramSink.consume(bigramHash(script, prevCp, lower)); + } + prevCp = lower; + } + } + + /** + * Extract non-CJK character unigrams, sentinel-padded bigrams, and + * sentinel-padded trigrams from preprocessed text. + * + * <p>A "word" is a maximal run of non-CJK letter codepoints within the + * same script family. Sentinels ({@link #SENTINEL}) pad each word on + * both sides, so a word of length L yields L+1 bigrams and L+2 trigrams. 
+ */ + public static void extractNonCjkNgrams( + String text, + HashConsumer unigramSink, + HashConsumer bigramSink, + HashConsumer trigramSink) { + int prevPrev = SENTINEL; + int prev = SENTINEL; + int prevScript = -1; + boolean inWord = false; + + int i = 0; + int len = text.length(); + while (i < len) { + int cp = text.codePointAt(i); + i += Character.charCount(cp); + + if (cp >= 0x0300 && CharSoupFeatureExtractor.isTransparent(cp)) { + continue; + } + + if (Character.isLetter(cp)) { + int lower = Character.toLowerCase(cp); + if (ScriptAwareFeatureExtractor.isCjkOrKana(lower)) { + if (inWord) { + emitWordEnd(prevScript, prevPrev, prev, bigramSink, trigramSink); + inWord = false; + prevPrev = SENTINEL; + prev = SENTINEL; + prevScript = -1; + } + continue; + } + int script = ScriptCategory.of(lower); + + if (inWord && script != prevScript) { + // Script change is a word boundary + emitWordEnd(prevScript, prevPrev, prev, bigramSink, trigramSink); + inWord = false; + prevPrev = SENTINEL; + prev = SENTINEL; + } + + unigramSink.consume(noncjkUnigramHash(script, lower)); + + if (!inWord) { + // Leading sentinels + bigramSink.consume(bigramHash(script, SENTINEL, lower)); + trigramSink.consume(trigramHash(script, SENTINEL, SENTINEL, lower)); + prevPrev = SENTINEL; + } else { + bigramSink.consume(bigramHash(script, prev, lower)); + trigramSink.consume(trigramHash(script, prevPrev, prev, lower)); + prevPrev = prev; + } + prev = lower; + prevScript = script; + inWord = true; + } else { + if (inWord) { + emitWordEnd(prevScript, prevPrev, prev, bigramSink, trigramSink); + inWord = false; + prevPrev = SENTINEL; + prev = SENTINEL; + prevScript = -1; + } + } + } + + if (inWord) { + emitWordEnd(prevScript, prevPrev, prev, bigramSink, trigramSink); + } + } + + private static void emitWordEnd( + int script, int pp, int p, + HashConsumer bigramSink, HashConsumer trigramSink) { + bigramSink.consume(bigramHash(script, p, SENTINEL)); + trigramSink.consume(trigramHash(script, pp, p, 
SENTINEL)); + trigramSink.consume(trigramHash(script, p, SENTINEL, SENTINEL)); + } + + // ---- Hash functions (FNV-1a) ---- + + static int cjkUnigramHash(int script, int cp) { + int h = CJK_UNIGRAM_BASIS; + h = fnvByte(h, script); + h = fnvInt(h, cp); + return h & 0x7FFFFFFF; + } + + static int noncjkUnigramHash(int script, int cp) { + int h = NONCJK_UNIGRAM_BASIS; + h = fnvByte(h, script); + h = fnvInt(h, cp); + return h & 0x7FFFFFFF; + } + + static int bigramHash(int script, int cp1, int cp2) { + int h = BIGRAM_BASIS; + h = fnvByte(h, script); + h = fnvInt(h, cp1); + h = fnvInt(h, cp2); + return h & 0x7FFFFFFF; + } + + static int trigramHash(int script, int cp1, int cp2, int cp3) { + int h = TRIGRAM_BASIS; + h = fnvByte(h, script); + h = fnvInt(h, cp1); + h = fnvInt(h, cp2); + h = fnvInt(h, cp3); + return h & 0x7FFFFFFF; + } + + private static int fnvByte(int h, int b) { + return (h ^ (b & 0xFF)) * 0x01000193; + } + + private static int fnvInt(int h, int v) { + h = (h ^ (v & 0xFF)) * 0x01000193; + h = (h ^ ((v >>> 8) & 0xFF)) * 0x01000193; + h = (h ^ ((v >>> 16) & 0xFF)) * 0x01000193; + h = (h ^ ((v >>> 24) & 0xFF)) * 0x01000193; + return h; + } + + // ---- Quantization ---- + + /** + * Quantize a log-probability in [{@link #LOGP_MIN}, 0] to an unsigned byte + * value: 0 maps to {@code LOGP_MIN}, 255 maps to 0. + */ + static byte quantize(float logP) { + float clamped = Math.max(LOGP_MIN, Math.min(0.0f, logP)); + return (byte) Math.round((clamped - LOGP_MIN) / (-LOGP_MIN) * 255.0f); + } + + /** Inverse of {@link #quantize}. */ + static float dequantize(byte b) { + return (b & 0xFF) / 255.0f * (-LOGP_MIN) + LOGP_MIN; + } + + // ---- Serialization ---- + + /** + * Deserialize a model from the GLM1 binary format. 
+ */ + public static GenerativeLanguageModel load(InputStream is) throws IOException { + DataInputStream din = new DataInputStream(new BufferedInputStream(is)); + + int magic = din.readInt(); + if (magic != MAGIC) { + throw new IOException("Not a GLM1 file (bad magic)"); + } + int version = din.readInt(); + if (version != 1 && version != VERSION) { + throw new IOException("Unsupported GLM version: " + version); + } + boolean hasStats = version >= 2; + + int numLangs = din.readInt(); + int cjkUni = din.readInt(); + int cjkBi = din.readInt(); + int noncjkUni = din.readInt(); + int noncjkBi = din.readInt(); + int noncjkTri = din.readInt(); + + List<String> langIds = new ArrayList<>(numLangs); + boolean[] isCjk = new boolean[numLangs]; + byte[][] unigramTables = new byte[numLangs][]; + byte[][] bigramTables = new byte[numLangs][]; + byte[][] trigramTables = new byte[numLangs][]; + float[] means = new float[numLangs]; + float[] stdDevs = new float[numLangs]; + + for (int i = 0; i < numLangs; i++) { + int codeLen = din.readUnsignedShort(); + byte[] codeBytes = new byte[codeLen]; + din.readFully(codeBytes); + langIds.add(new String(codeBytes, StandardCharsets.UTF_8)); + + isCjk[i] = din.readByte() != 0; + + if (hasStats) { + means[i] = din.readFloat(); + stdDevs[i] = din.readFloat(); + } + + int uniSize = isCjk[i] ? cjkUni : noncjkUni; + int biSize = isCjk[i] ? cjkBi : noncjkBi; + + unigramTables[i] = new byte[uniSize]; + din.readFully(unigramTables[i]); + + bigramTables[i] = new byte[biSize]; + din.readFully(bigramTables[i]); + + if (!isCjk[i]) { + trigramTables[i] = new byte[noncjkTri]; + din.readFully(trigramTables[i]); + } + } + + return new GenerativeLanguageModel(langIds, isCjk, + unigramTables, bigramTables, trigramTables, + means, stdDevs); + } + + /** + * Serialize this model to the GLM1 binary format. 
+ */ + public void save(OutputStream os) throws IOException { + DataOutputStream dout = new DataOutputStream(new BufferedOutputStream(os)); + + dout.writeInt(MAGIC); + dout.writeInt(VERSION); + dout.writeInt(langIds.size()); + dout.writeInt(CJK_UNIGRAM_BUCKETS); + dout.writeInt(CJK_BIGRAM_BUCKETS); + dout.writeInt(NONCJK_UNIGRAM_BUCKETS); + dout.writeInt(NONCJK_BIGRAM_BUCKETS); + dout.writeInt(NONCJK_TRIGRAM_BUCKETS); + + for (int i = 0; i < langIds.size(); i++) { + byte[] codeBytes = langIds.get(i).getBytes(StandardCharsets.UTF_8); + dout.writeShort(codeBytes.length); + dout.write(codeBytes); + dout.writeByte(isCjk[i] ? 1 : 0); + dout.writeFloat(scoreMeans[i]); + dout.writeFloat(scoreStdDevs[i]); + dout.write(unigramTables[i]); + dout.write(bigramTables[i]); + if (!isCjk[i]) { + dout.write(trigramTables[i]); + } + } + dout.flush(); + } + + // ---- Builder ---- + + public static Builder builder() { + return new Builder(); + } + + /** + * Accumulates training samples per language and produces a + * {@link GenerativeLanguageModel} via add-k smoothing. + */ + public static class Builder { + + private final Map<String, Boolean> cjkFlags = new LinkedHashMap<>(); + private final Map<String, long[]> unigramCounts = new HashMap<>(); + private final Map<String, long[]> bigramCounts = new HashMap<>(); + private final Map<String, long[]> trigramCounts = new HashMap<>(); + + /** + * Register a language before feeding it samples. Must be called + * before {@link #addSample(String, String)}. + */ + public Builder registerLanguage(String langCode, boolean isCjk) { + cjkFlags.put(langCode, isCjk); + unigramCounts.put(langCode, + new long[isCjk ? CJK_UNIGRAM_BUCKETS : NONCJK_UNIGRAM_BUCKETS]); + bigramCounts.put(langCode, + new long[isCjk ? CJK_BIGRAM_BUCKETS : NONCJK_BIGRAM_BUCKETS]); + if (!isCjk) { + trigramCounts.put(langCode, new long[NONCJK_TRIGRAM_BUCKETS]); + } + return this; + } + + /** + * Add a text sample for the named language. 
The language must have + * been registered via {@link #registerLanguage} first. + */ + public Builder addSample(String langCode, String text) { + Boolean cjk = cjkFlags.get(langCode); + if (cjk == null) { + throw new IllegalArgumentException("Unknown language: " + langCode); + } + String pp = CharSoupFeatureExtractor.preprocess(text); + if (pp.isEmpty()) { + return this; + } + + long[] ug = unigramCounts.get(langCode); + long[] bg = bigramCounts.get(langCode); + + if (cjk) { + extractCjkNgrams(pp, + h -> ug[h % CJK_UNIGRAM_BUCKETS]++, + h -> bg[h % CJK_BIGRAM_BUCKETS]++); + } else { + long[] tg = trigramCounts.get(langCode); + extractNonCjkNgrams(pp, + h -> ug[h % NONCJK_UNIGRAM_BUCKETS]++, + h -> bg[h % NONCJK_BIGRAM_BUCKETS]++, + h -> tg[h % NONCJK_TRIGRAM_BUCKETS]++); + } + return this; + } + + /** + * Finalize training with add-{@code k} smoothing and return the model. + * + * @param addK smoothing constant; 0.01 is a reasonable default + */ + public GenerativeLanguageModel build(float addK) { + List<String> ids = new ArrayList<>(cjkFlags.keySet()); + int n = ids.size(); + + boolean[] cjkArr = new boolean[n]; + byte[][] uniTables = new byte[n][]; + byte[][] biTables = new byte[n][]; + byte[][] triTables = new byte[n][]; + + for (int i = 0; i < n; i++) { + String lang = ids.get(i); + cjkArr[i] = cjkFlags.get(lang); + uniTables[i] = toLogProbTable(unigramCounts.get(lang), addK); + biTables[i] = toLogProbTable(bigramCounts.get(lang), addK); + if (!cjkArr[i]) { + triTables[i] = toLogProbTable(trigramCounts.get(lang), addK); + } + } + return new GenerativeLanguageModel(ids, cjkArr, uniTables, biTables, triTables, + new float[n], new float[n]); + } + + private static byte[] toLogProbTable(long[] counts, float addK) { + long total = 0; + for (long c : counts) { + total += c; + } + double denom = total + (double) addK * counts.length; + byte[] table = new byte[counts.length]; + for (int i = 0; i < counts.length; i++) { + double p = (counts[i] + addK) / denom; + 
table[i] = quantize((float) Math.log(p)); + } + return table; + } + } +} diff --git a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java index 78758a62d6..2680c11a70 100644 --- a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java +++ b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java @@ -278,7 +278,7 @@ public class ScriptAwareFeatureExtractor implements FeatureExtractor { == Character.SPACE_SEPARATOR; } - static boolean isCjkOrKana(int cp) { + public static boolean isCjkOrKana(int cp) { if (Character.isIdeographic(cp)) { return true; } diff --git a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CorpusDiversityAnalyzer.java b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CorpusDiversityAnalyzer.java new file mode 100644 index 0000000000..b6e44eacc6 --- /dev/null +++ b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CorpusDiversityAnalyzer.java @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.langdetect.charsoup.tools; + +import java.io.BufferedReader; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.DirectoryStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Set; + +import org.apache.tika.langdetect.charsoup.CharSoupFeatureExtractor; +import org.apache.tika.langdetect.charsoup.GenerativeLanguageModel; +import org.apache.tika.langdetect.charsoup.ScriptAwareFeatureExtractor; + +/** + * Measures corpus diversity for each language in a flat-file corpus directory. + * + * <p>Three complementary metrics are computed entirely from the training + * sentences — no external evaluation set required: + * + * <ol> + * <li><b>Bigram bucket fill %</b>: fraction of the bigram hash table that + * has at least one count after seeing all training sentences. A corpus + * of near-identical stubs reuses the same n-grams over and over and + * fills a small fraction of buckets regardless of corpus size.</li> + * <li><b>Normalised bigram entropy</b>: Shannon entropy of the bigram count + * distribution divided by log2(filled buckets). A perfectly uniform + * distribution scores 1.0; a corpus dominated by a handful of repeated + * patterns scores near 0.</li> + * <li><b>Unique sentence %</b>: fraction of distinct lines. 
Templated + * corpora have many near- or exact-duplicate sentences.</li> + * </ol> + * + * <p>Languages whose fill% and entropy fall far below the median are flagged + * as potentially low-quality. + * + * <h3>Usage</h3> + * <pre> + * java CorpusDiversityAnalyzer \ + * --corpus /path/to/pool_filtered \ + * [--max-per-lang 100000] \ + * [--flag-below 0.5] + * </pre> + */ +public class CorpusDiversityAnalyzer { + + private static final int DEFAULT_MAX_PER_LANG = 100_000; + private static final double DEFAULT_FLAG_BELOW = 0.5; + + public static void main(String[] args) throws Exception { + Path corpus = null; + int maxPerLang = DEFAULT_MAX_PER_LANG; + double flagBelow = DEFAULT_FLAG_BELOW; + + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "--corpus": + corpus = Paths.get(args[++i]); + break; + case "--max-per-lang": + maxPerLang = Integer.parseInt(args[++i]); + break; + case "--flag-below": + flagBelow = Double.parseDouble(args[++i]); + break; + default: + System.err.println("Unknown option: " + args[i]); + System.exit(1); + } + } + if (corpus == null) { + System.err.println("Usage: CorpusDiversityAnalyzer --corpus <dir> " + + "[--max-per-lang N] [--flag-below 0.5]"); + System.exit(1); + } + + List<Path> langPaths = listRegularFiles(corpus); + System.out.printf(Locale.US, "Analysing %d languages in %s " + + "(max %,d sentences each)%n%n", + langPaths.size(), corpus, maxPerLang); + + System.out.printf(Locale.US, + "%-14s %10s %10s %8s %10s %10s %s%n", + "Language", "Sentences", "Unique%", + "Fill%", "Entropy", "NormEntropy", "Flag"); + System.out.println("-".repeat(80)); + + List<LangStats> stats = new ArrayList<>(); + for (Path p : langPaths) { + LangStats s = analyze(p, maxPerLang); + stats.add(s); + } + + // Sort by normalised entropy ascending (worst first) + stats.sort((a, b) -> Double.compare(a.normEntropy, b.normEntropy)); + + for (LangStats s : stats) { + String flag = (s.fillPct < flagBelow * 100 + || s.normEntropy < flagBelow) ? 
" <<< LOW DIVERSITY" : ""; + System.out.printf(Locale.US, + "%-14s %,10d %9.1f%% %7.1f%% %9.3f %11.3f %s%n", + s.lang, s.sentences, s.uniquePct, + s.fillPct, s.entropy, s.normEntropy, flag); + } + } + + // ---- Analysis ---- + + static LangStats analyze(Path langFile, int maxPerLang) throws IOException { + String lang = langFile.getFileName().toString(); + + // Determine CJK by probing first 200 sentences + boolean cjk = probeCjk(langFile, 200); + + int numBuckets = cjk + ? GenerativeLanguageModel.CJK_BIGRAM_BUCKETS + : GenerativeLanguageModel.NONCJK_BIGRAM_BUCKETS; + + long[] bigramCounts = new long[numBuckets]; + Set<String> seen = new HashSet<>(); + long sentences = 0; + long uniqueSentences = 0; + + try (BufferedReader reader = Files.newBufferedReader( + langFile, StandardCharsets.UTF_8)) { + String line; + while ((line = reader.readLine()) != null) { + String text = line.trim(); + if (text.isEmpty()) continue; + + String pp = CharSoupFeatureExtractor.preprocess(text); + if (pp.isEmpty()) continue; + + if (seen.add(text)) { + uniqueSentences++; + } + sentences++; + + if (cjk) { + GenerativeLanguageModel.extractCjkNgrams(pp, + h -> { /* skip unigrams */ }, + h -> bigramCounts[h + % GenerativeLanguageModel.CJK_BIGRAM_BUCKETS]++); + } else { + GenerativeLanguageModel.extractNonCjkNgrams(pp, + h -> { /* skip unigrams */ }, + h -> bigramCounts[h + % GenerativeLanguageModel.NONCJK_BIGRAM_BUCKETS]++, + h -> { /* skip trigrams */ }); + } + + if (maxPerLang > 0 && sentences >= maxPerLang) { + break; + } + } + } + + // Metrics + long filledBuckets = 0; + long total = 0; + for (long c : bigramCounts) { + if (c > 0) { + filledBuckets++; + total += c; + } + } + + double fillPct = 100.0 * filledBuckets / numBuckets; + + // Shannon entropy over filled buckets (bits) + double entropy = 0.0; + if (total > 0) { + for (long c : bigramCounts) { + if (c > 0) { + double p = (double) c / total; + entropy -= p * (Math.log(p) / Math.log(2)); + } + } + } + + // Normalised entropy: H / 
log2(filledBuckets) ∈ [0, 1] + double normEntropy = filledBuckets > 1 + ? entropy / (Math.log(filledBuckets) / Math.log(2)) : 0.0; + + double uniquePct = sentences > 0 + ? 100.0 * uniqueSentences / sentences : 0.0; + + return new LangStats(lang, sentences, uniquePct, + fillPct, entropy, normEntropy); + } + + // ---- Helpers ---- + + private static boolean probeCjk(Path file, int maxLines) throws IOException { + long cjk = 0; + long total = 0; + int lines = 0; + try (BufferedReader reader = Files.newBufferedReader( + file, StandardCharsets.UTF_8)) { + String line; + while ((line = reader.readLine()) != null && lines < maxLines) { + int i = 0; + while (i < line.length()) { + int cp = line.codePointAt(i); + i += Character.charCount(cp); + if (Character.isLetter(cp)) { + total++; + if (ScriptAwareFeatureExtractor.isCjkOrKana( + Character.toLowerCase(cp))) { + cjk++; + } + } + } + lines++; + } + } + return total > 0 && (double) cjk / total >= 0.60; + } + + private static List<Path> listRegularFiles(Path dir) throws IOException { + List<Path> files = new ArrayList<>(); + try (DirectoryStream<Path> stream = Files.newDirectoryStream( + dir, Files::isRegularFile)) { + for (Path p : stream) { + files.add(p); + } + } + Collections.sort(files); + return files; + } + + // ---- Result record ---- + + static class LangStats { + final String lang; + final long sentences; + final double uniquePct; + final double fillPct; + final double entropy; + final double normEntropy; + + LangStats(String lang, long sentences, double uniquePct, + double fillPct, double entropy, double normEntropy) { + this.lang = lang; + this.sentences = sentences; + this.uniquePct = uniquePct; + this.fillPct = fillPct; + this.entropy = entropy; + this.normEntropy = normEntropy; + } + } +} diff --git a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CorpusFilterReport.java 
b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CorpusFilterReport.java new file mode 100644 index 0000000000..6acd5573a9 --- /dev/null +++ b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CorpusFilterReport.java @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.langdetect.charsoup.tools; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.DirectoryStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Locale; + +import org.apache.tika.langdetect.charsoup.GenerativeLanguageModel; + +/** + * Scores every sentence in a training corpus against its own language model + * and reports how many would be dropped at various z-score thresholds. 
+ * + * <h3>Usage</h3> + * <pre> + * java CorpusFilterReport \ + * --model generative.bin \ + * --corpus /path/to/pool_filtered \ + * [--max-per-lang 500000] \ + * [--show-drops 10] (print N worst-scoring sentences per language) + * </pre> + */ +public class CorpusFilterReport { + + private static final int DEFAULT_MAX_PER_LANG = 500_000; + private static final int DEFAULT_SHOW_DROPS = 0; + + public static void main(String[] args) throws Exception { + Path modelPath = null; + Path corpusPath = null; + int maxPerLang = DEFAULT_MAX_PER_LANG; + int showDrops = DEFAULT_SHOW_DROPS; + + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "--model": + modelPath = Paths.get(args[++i]); + break; + case "--corpus": + corpusPath = Paths.get(args[++i]); + break; + case "--max-per-lang": + maxPerLang = Integer.parseInt(args[++i]); + break; + case "--show-drops": + showDrops = Integer.parseInt(args[++i]); + break; + default: + System.err.println("Unknown option: " + args[i]); + System.exit(1); + } + } + + if (modelPath == null || corpusPath == null) { + System.err.println( + "Usage: CorpusFilterReport --model <bin> --corpus <dir> " + + "[--max-per-lang N] [--show-drops N]"); + System.exit(1); + } + + GenerativeLanguageModel model; + try (InputStream is = new FileInputStream(modelPath.toFile())) { + model = GenerativeLanguageModel.load(is); + } + + boolean flat = isFlatLayout(corpusPath); + List<Path> langPaths = listLangPaths(corpusPath, flat); + + System.out.printf(Locale.US, + "%-14s %8s %8s %8s %8s %8s %8s%n", + "Language", "Total", "z<-2", "z<-3", "z<-4", "z<-2%", "z<-3%"); + System.out.println("-".repeat(80)); + + long grandTotal = 0; + long grandDrop2 = 0; + long grandDrop3 = 0; + long grandDrop4 = 0; + + for (Path langPath : langPaths) { + String lang = langPath.getFileName().toString(); + if (!model.getLanguages().contains(lang)) { + continue; + } + + List<ScoredLine> scored = scoreLang( + model, lang, langPath, flat, maxPerLang); + + long total = 
scored.size(); + long drop2 = 0; + long drop3 = 0; + long drop4 = 0; + for (ScoredLine sl : scored) { + if (sl.z < -2) { + drop2++; + } + if (sl.z < -3) { + drop3++; + } + if (sl.z < -4) { + drop4++; + } + } + + System.out.printf(Locale.US, + "%-14s %,8d %,8d %,8d %,8d %7.2f%% %7.2f%%%n", + lang, total, drop2, drop3, drop4, + 100.0 * drop2 / total, 100.0 * drop3 / total); + + if (showDrops > 0 && drop3 > 0) { + scored.sort((a, b) -> Float.compare(a.z, b.z)); + int n = (int) Math.min(showDrops, drop3); + for (int i = 0; i < n; i++) { + ScoredLine sl = scored.get(i); + String preview = sl.text.length() > 80 + ? sl.text.substring(0, 80) + "…" : sl.text; + System.out.printf(Locale.US, + " z=%6.2f %s%n", sl.z, preview); + } + } + + grandTotal += total; + grandDrop2 += drop2; + grandDrop3 += drop3; + grandDrop4 += drop4; + } + + System.out.println("-".repeat(80)); + System.out.printf(Locale.US, + "%-14s %,8d %,8d %,8d %,8d %7.2f%% %7.2f%%%n", + "TOTAL", grandTotal, grandDrop2, grandDrop3, grandDrop4, + 100.0 * grandDrop2 / grandTotal, 100.0 * grandDrop3 / grandTotal); + } + + private static List<ScoredLine> scoreLang( + GenerativeLanguageModel model, String lang, + Path langPath, boolean flat, int maxPerLang) throws Exception { + + List<ScoredLine> result = new ArrayList<>(); + + if (flat) { + try (BufferedReader reader = Files.newBufferedReader( + langPath, StandardCharsets.UTF_8)) { + String line; + while ((line = reader.readLine()) != null) { + String text = line.trim(); + if (text.isEmpty()) { + continue; + } + float z = model.zScore(text, lang); + if (!Float.isNaN(z)) { + result.add(new ScoredLine(text, z)); + } + if (maxPerLang > 0 && result.size() >= maxPerLang) { + break; + } + } + } + } else { + List<Path> files = listTxtFiles(langPath); + outer: + for (Path file : files) { + try (BufferedReader reader = Files.newBufferedReader( + file, StandardCharsets.UTF_8)) { + String line; + while ((line = reader.readLine()) != null) { + int tab = line.indexOf('\t'); + if 
(tab < 0) { + continue; + } + String text = line.substring(tab + 1).trim(); + if (text.isEmpty()) { + continue; + } + float z = model.zScore(text, lang); + if (!Float.isNaN(z)) { + result.add(new ScoredLine(text, z)); + } + if (maxPerLang > 0 && result.size() >= maxPerLang) { + break outer; + } + } + } + } + } + return result; + } + + private static boolean isFlatLayout(Path dir) throws Exception { + try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir)) { + for (Path p : stream) { + return Files.isRegularFile(p); + } + } + return true; + } + + private static List<Path> listLangPaths(Path dir, boolean flat) throws Exception { + List<Path> paths = new ArrayList<>(); + try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir, + p -> flat ? Files.isRegularFile(p) : Files.isDirectory(p))) { + for (Path p : stream) { + paths.add(p); + } + } + Collections.sort(paths); + return paths; + } + + private static List<Path> listTxtFiles(Path dir) throws Exception { + List<Path> files = new ArrayList<>(); + try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir, "*.txt")) { + for (Path p : stream) { + files.add(p); + } + } + Collections.sort(files); + return files; + } + + private static class ScoredLine { + final String text; + final float z; + + ScoredLine(String text, float z) { + this.text = text; + this.z = z; + } + } +} diff --git a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/EvalGenerativeModel.java b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/EvalGenerativeModel.java new file mode 100644 index 0000000000..4a4652dc57 --- /dev/null +++ b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/EvalGenerativeModel.java @@ -0,0 +1,365 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.langdetect.charsoup.tools;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.tika.langdetect.charsoup.GenerativeLanguageModel;

/**
 * Self-consistency evaluation for {@link GenerativeLanguageModel}.
 *
 * <p>For each sentence in the test file, computes {@code score(text, L)}
 * for every language in the model and checks whether the argmax equals
 * the true label. Reports overall accuracy and per-language accuracy
 * sorted from worst to best.
 *
 * <p>Accepts either:
 * <ul>
 *   <li>Flores-200 TSV: {@code lang_Script TAB text} — script suffixes are
 *       stripped and FLORES-specific codes are remapped to model codes.</li>
 *   <li>Standard corpus format: {@code lang TAB text}</li>
 * </ul>
 *
 * <h3>Usage</h3>
 * <pre>
 * java EvalGenerativeModel \
 *     --model generative.bin \
 *     --test /path/to/flores200_dev.tsv \
 *     [--max-per-lang 997]
 * </pre>
 */
public class EvalGenerativeModel {

    private static final int DEFAULT_MAX_PER_LANG = 0; // 0 = unlimited
    private static final int DEFAULT_MAX_CHARS = 0;    // 0 = full sentence

    // ---- Flores-200 normalisation (mirrors CompareDetectors) ----

    /** FLORES codes whose script suffix is significant and must be kept. */
    private static final Set<String> FLORES_KEEP_SCRIPT_SUFFIX = Set.of(
            "ace_Arab", "arb_Latn", "bjn_Arab",
            "kas_Deva", "knc_Latn", "min_Arab", "taq_Tfng"
    );

    /** FLORES macro/variant codes remapped to the codes the model uses. */
    private static final Map<String, String> FLORES_CODE_REMAP;
    static {
        Map<String, String> m = new HashMap<>();
        m.put("arb", "ara");
        m.put("pes", "fas");
        m.put("zsm", "msa");
        m.put("lvs", "lav");
        m.put("azj", "aze");
        m.put("ekk", "est");
        m.put("npi", "nep");
        m.put("als", "sqi");
        m.put("ory", "ori");
        m.put("nor", "nob");
        m.put("cmn", "zho");
        m.put("swa", "swh");
        m.put("yid", "ydd");
        m.put("gug", "grn");
        m.put("quz", "que");
        m.put("plt", "mlg");
        m.put("pbt", "pus");
        m.put("uzn", "uzb");
        m.put("kmr", "kur");
        m.put("khk", "mon");
        FLORES_CODE_REMAP = m;
    }

    /**
     * Normalizes a FLORES-200 {@code lang_Script} label to a model language
     * code: keeps the full label when the script suffix is significant,
     * otherwise strips the suffix and applies {@link #FLORES_CODE_REMAP}.
     */
    static String normalizeLang(String raw) {
        if (FLORES_KEEP_SCRIPT_SUFFIX.contains(raw)) {
            return raw;
        }
        int underscore = raw.indexOf('_');
        String base = underscore >= 0 ? raw.substring(0, underscore) : raw;
        return FLORES_CODE_REMAP.getOrDefault(base, base);
    }

    // ---- Entry point ----

    public static void main(String[] args) throws Exception {
        Path modelPath = null;
        Path testPath = null;
        int maxPerLang = DEFAULT_MAX_PER_LANG;
        int[] maxCharsSet = null; // null = full sentence only
        boolean showConfusions = false;

        for (int i = 0; i < args.length; i++) {
            switch (args[i]) {
                case "--model":
                    modelPath = Paths.get(args[++i]);
                    break;
                case "--test":
                    testPath = Paths.get(args[++i]);
                    break;
                case "--max-per-lang":
                    maxPerLang = Integer.parseInt(args[++i]);
                    break;
                case "--show-confusions":
                    showConfusions = true;
                    break;
                case "--lengths": {
                    String[] parts = args[++i].split(",");
                    maxCharsSet = new int[parts.length];
                    for (int j = 0; j < parts.length; j++) {
                        maxCharsSet[j] = Integer.parseInt(parts[j].trim());
                    }
                    break;
                }
                default:
                    System.err.println("Unknown option: " + args[i]);
                    printUsage();
                    System.exit(1);
            }
        }

        if (modelPath == null || testPath == null) {
            printUsage();
            System.exit(1);
        }

        System.out.println("Loading model: " + modelPath);
        GenerativeLanguageModel model;
        try (InputStream is = new FileInputStream(modelPath.toFile())) {
            model = GenerativeLanguageModel.load(is);
        }
        System.out.printf(Locale.US, "  %d languages (%d CJK, %d non-CJK)%n",
                model.getLanguages().size(),
                model.getLanguages().stream().filter(model::isCjk).count(),
                model.getLanguages().stream().filter(l -> !model.isCjk(l)).count());

        System.out.println("Loading test data: " + testPath);
        List<LabeledSentence> data = loadTestFile(testPath);
        boolean floresMode = data.stream().anyMatch(s -> s.getLanguage().contains("_"));
        if (floresMode) {
            System.out.println("  Flores-200 mode: normalizing lang codes");
            List<LabeledSentence> normalized = new ArrayList<>(data.size());
            for (LabeledSentence s : data) {
                normalized.add(new LabeledSentence(
                        normalizeLang(s.getLanguage()), s.getText()));
            }
            data = normalized;
        }

        // Cap per language if requested
        if (maxPerLang > 0) {
            data = samplePerLang(data, maxPerLang);
        }

        Set<String> modelLangs = new HashSet<>(model.getLanguages());

        // Split into scorable (true lang is in model) and unscorable
        List<LabeledSentence> scorable = new ArrayList<>();
        Map<String, Integer> skipped = new HashMap<>();
        for (LabeledSentence s : data) {
            if (modelLangs.contains(s.getLanguage())) {
                scorable.add(s);
            } else {
                skipped.merge(s.getLanguage(), 1, Integer::sum);
            }
        }
        System.out.printf(Locale.US, "  %,d sentences; %,d scorable, %,d skipped (%d langs not in model)%n",
                data.size(), scorable.size(),
                data.size() - scorable.size(), skipped.size());
        if (!skipped.isEmpty()) {
            List<String> sk = new ArrayList<>(skipped.keySet());
            Collections.sort(sk);
            System.out.println("  Skipped langs: " + sk);
        }

        // Nothing to score -> every accuracy below would be 0/0; bail early.
        if (scorable.isEmpty()) {
            System.err.println("No scorable sentences; nothing to evaluate.");
            System.exit(1);
        }

        // Build the set of lengths to evaluate
        int[] lengths = maxCharsSet != null ? maxCharsSet : new int[]{0};

        for (int maxChars : lengths) {
            String label = maxChars > 0 ? "@" + maxChars + " chars" : "full";
            List<LabeledSentence> run = maxChars > 0
                    ? truncate(scorable, maxChars) : scorable;

            System.out.printf(Locale.US, "%nScoring [%s]…%n", label);
            long wallStart = System.nanoTime();
            // confusions: trueLang -> (predictedLang -> count)
            Map<String, Map<String, Integer>> confusions =
                    showConfusions ? new TreeMap<>() : null;
            Map<String, int[]> perLang = evalAll(model, run, confusions);
            // Clamp to >= 1ms so the sent/s rate never divides by zero on
            // very small test sets.
            long elapsedMs = Math.max(1,
                    (System.nanoTime() - wallStart) / 1_000_000);

            int totalCorrect = 0;
            int totalCount = 0;
            for (int[] v : perLang.values()) {
                totalCorrect += v[0];
                totalCount += v[1];
            }

            System.out.printf(Locale.US,
                    "Overall [%s]: %.2f%% (%,d / %,d) in %,dms (%.0f sent/s)%n",
                    label, 100.0 * totalCorrect / totalCount,
                    totalCorrect, totalCount,
                    elapsedMs, totalCount * 1000.0 / elapsedMs);

            List<Map.Entry<String, int[]>> rows = new ArrayList<>(perLang.entrySet());
            rows.sort(Comparator.comparingDouble(
                    e -> (double) e.getValue()[0] / e.getValue()[1]));

            System.out.printf(Locale.US, "%n%-16s %8s %8s %8s%n",
                    "Language", "Correct", "Total", "Acc%");
            System.out.println("-".repeat(46));
            for (Map.Entry<String, int[]> e : rows) {
                int[] v = e.getValue();
                System.out.printf(Locale.US, "%-16s %8d %8d %7.2f%%%n",
                        e.getKey(), v[0], v[1], 100.0 * v[0] / v[1]);
            }

            System.out.println();
            int[] thresholds = {100, 95, 90, 80, 50};
            for (int t : thresholds) {
                long above = rows.stream()
                        .filter(e -> 100.0 * e.getValue()[0] / e.getValue()[1] >= t)
                        .count();
                System.out.printf(Locale.US, "  >= %3d%% accuracy: %3d / %d languages%n",
                        t, above, rows.size());
            }

            if (confusions != null) {
                System.out.println("\n=== Confusion distributions (wrong predictions only) ===");
                for (Map.Entry<String, Map<String, Integer>> langEntry
                        : confusions.entrySet()) {
                    String trueLang = langEntry.getKey();
                    Map<String, Integer> preds = langEntry.getValue();
                    int total = perLang.get(trueLang)[1];
                    int correct = perLang.get(trueLang)[0];
                    int wrong = total - correct;
                    if (wrong == 0) {
                        continue;
                    }
                    System.out.printf(Locale.US,
                            "%n  %s (%d wrong / %d total): ",
                            trueLang, wrong, total);
                    preds.entrySet().stream()
                            .sorted(Map.Entry.<String, Integer>comparingByValue()
                                    .reversed())
                            .limit(10)
                            .forEach(e -> System.out.printf(Locale.US,
                                    "%s=%d ", e.getKey(), e.getValue()));
                    System.out.println();
                }
            }
        }
    }

    // ---- Scoring ----

    /**
     * Scores every sentence against every model language and tallies
     * per-language {@code [correct, total]} counts. When {@code confusions}
     * is non-null, records wrong predictions as trueLang -> predicted -> count.
     */
    private static Map<String, int[]> evalAll(
            GenerativeLanguageModel model,
            List<LabeledSentence> data,
            Map<String, Map<String, Integer>> confusions) {
        Map<String, int[]> perLang = new HashMap<>();
        List<String> allLangs = model.getLanguages();

        for (LabeledSentence s : data) {
            String trueLang = s.getLanguage();
            String predicted = argmax(model, allLangs, s.getText());
            int[] counts = perLang.computeIfAbsent(trueLang, k -> new int[2]);
            counts[1]++;
            if (trueLang.equals(predicted)) {
                counts[0]++;
            } else if (confusions != null && predicted != null) {
                confusions.computeIfAbsent(trueLang, k -> new HashMap<>())
                        .merge(predicted, 1, Integer::sum);
            }
        }
        return perLang;
    }

    /**
     * Returns the language with the highest (non-NaN) score for {@code text},
     * or null when every language scores NaN.
     */
    private static String argmax(GenerativeLanguageModel model,
                                 List<String> langs, String text) {
        String best = null;
        float bestS = Float.NEGATIVE_INFINITY;
        for (String lang : langs) {
            float s = model.score(text, lang);
            if (!Float.isNaN(s) && s > bestS) {
                bestS = s;
                best = lang;
            }
        }
        return best;
    }

    // ---- I/O helpers ----

    /** Reads a {@code lang TAB text} file, skipping malformed or empty lines. */
    static List<LabeledSentence> loadTestFile(Path path) throws Exception {
        List<LabeledSentence> sentences = new ArrayList<>();
        try (BufferedReader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
            String line;
            while ((line = reader.readLine()) != null) {
                int tab = line.indexOf('\t');
                if (tab < 0) {
                    continue;
                }
                String lang = line.substring(0, tab).trim();
                String text = line.substring(tab + 1).trim();
                if (!lang.isEmpty() && !text.isEmpty()) {
                    sentences.add(new LabeledSentence(lang, text));
                }
            }
        }
        return sentences;
    }

    /** Returns a copy of {@code data} with each text capped at {@code maxChars}. */
    private static List<LabeledSentence> truncate(
            List<LabeledSentence> data, int maxChars) {
        List<LabeledSentence> result = new ArrayList<>(data.size());
        for (LabeledSentence s : data) {
            String t = s.getText();
            result.add(new LabeledSentence(s.getLanguage(),
                    t.length() > maxChars ? t.substring(0, maxChars) : t));
        }
        return result;
    }

    /** Keeps at most {@code max} sentences per language, in input order. */
    private static List<LabeledSentence> samplePerLang(
            List<LabeledSentence> data, int max) {
        Map<String, Integer> counts = new HashMap<>();
        List<LabeledSentence> result = new ArrayList<>();
        for (LabeledSentence s : data) {
            int n = counts.merge(s.getLanguage(), 1, Integer::sum);
            if (n <= max) {
                result.add(s);
            }
        }
        return result;
    }

    private static void printUsage() {
        System.err.println("Usage: EvalGenerativeModel");
        System.err.println("  --model <generative.bin>");
        System.err.println("  --test <testFile.tsv>");
        System.err.println("  [--max-per-lang <N>]");
        System.err.println("  [--lengths 50,100,200]  (truncate sentences to N chars)");
        System.err.println("  [--show-confusions]     (print confusion distributions)");
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.langdetect.charsoup.tools;

import java.io.FileInputStream;
import java.io.InputStream;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.apache.tika.langdetect.charsoup.GenerativeLanguageModel;

/**
 * Evaluates the generative model as a training-data filter for a single
 * target language.
 *
 * <p>Constructs a synthetic contaminated corpus from a FLORES-200 TSV:
 * all sentences labelled as {@code --lang} are <em>signal</em> (should be
 * kept); sentences from all other languages present in the model are
 * <em>noise</em> (should be dropped).
 *
 * <p>The filter z-scores each sentence against the target language's model:
 * <pre>
 *   z = (score(sentence, targetLang) - μ) / σ
 *   keep = z >= threshold
 * </pre>
 * where μ and σ are the mean/stddev of scores on the language's training
 * corpus (baked into the model file).
 *
 * <p>Sweeping {@code threshold} traces a precision/recall curve: lower
 * threshold = permissive filter (keeps more, misses more noise); higher
 * threshold = strict filter (drops more noise but may drop real signal).
 *
 * <h3>Usage</h3>
 * <pre>
 * mvn -pl tika-langdetect/tika-langdetect-charsoup exec:java \
 *   -Dexec.mainClass=...tools.FilterBenchmark \
 *   -Dexec.args="--model /path/generative.bin \
 *                --test /path/flores200_dev.tsv \
 *                --lang zho \
 *                [--noise-ratio 1.0] \
 *                [--steps 20]"
 * </pre>
 */
public class FilterBenchmark {

    private static final double DEFAULT_NOISE_RATIO = 1.0;
    private static final int DEFAULT_STEPS = 20;

    public static void main(String[] args) throws Exception {
        Path modelPath = null;
        Path testPath = null;
        String targetLang = null;
        double noiseRatio = DEFAULT_NOISE_RATIO;
        int steps = DEFAULT_STEPS;

        for (int i = 0; i < args.length; i++) {
            switch (args[i]) {
                case "--model":
                    modelPath = Paths.get(args[++i]);
                    break;
                case "--test":
                    testPath = Paths.get(args[++i]);
                    break;
                case "--lang":
                    targetLang = args[++i];
                    break;
                case "--noise-ratio":
                    noiseRatio = Double.parseDouble(args[++i]);
                    break;
                case "--steps":
                    steps = Integer.parseInt(args[++i]);
                    break;
                default:
                    System.err.println("Unknown option: " + args[i]);
                    System.exit(1);
            }
        }

        if (modelPath == null || testPath == null || targetLang == null) {
            System.err.println(
                    "Usage: FilterBenchmark --model <bin> --test <flores.tsv> --lang <code> "
                            + "[--noise-ratio 1.0] [--steps 20]");
            System.exit(1);
        }

        System.out.println("Loading model: " + modelPath);
        GenerativeLanguageModel model;
        try (InputStream is = new FileInputStream(modelPath.toFile())) {
            model = GenerativeLanguageModel.load(is);
        }

        if (!model.getLanguages().contains(targetLang)) {
            System.err.printf(Locale.US,
                    "Language '%s' not found in model. Available: %s%n",
                    targetLang, model.getLanguages());
            System.exit(1);
        }

        System.out.println("Loading test data: " + testPath);
        List<LabeledSentence> all = EvalGenerativeModel.loadTestFile(testPath);

        List<String> signal = new ArrayList<>();
        List<String> noise = new ArrayList<>();
        for (LabeledSentence s : all) {
            String lang = EvalGenerativeModel.normalizeLang(s.getLanguage());
            if (targetLang.equals(lang)) {
                signal.add(s.getText());
            } else {
                // include all other languages as potential noise
                noise.add(s.getText());
            }
        }

        // Without signal there is nothing to keep and every ratio below
        // divides by zero — fail loudly instead of throwing.
        if (signal.isEmpty()) {
            System.err.printf(Locale.US,
                    "No sentences labelled '%s' in %s%n", targetLang, testPath);
            System.exit(1);
        }

        // NOTE: noise is truncated in file order (grouped by language), so
        // low noise ratios sample only the first languages in the file.
        int noiseCount = (int) Math.min(noise.size(),
                Math.round(signal.size() * noiseRatio));
        noise = noise.subList(0, noiseCount);

        System.out.printf(Locale.US,
                "Target: %s | signal: %,d | noise: %,d (%.1fx ratio)%n%n",
                targetLang, signal.size(), noiseCount,
                (double) noiseCount / signal.size());

        // Z-score every sentence against the target language model
        float[] sigZ = zScores(model, targetLang, signal);
        float[] noiseZ = zScores(model, targetLang, noise);

        // Sweep range: span the full observed z-score distribution
        float minZ = Float.MAX_VALUE;
        float maxZ = -Float.MAX_VALUE;
        for (float z : sigZ) {
            if (!Float.isNaN(z)) {
                minZ = Math.min(minZ, z);
                maxZ = Math.max(maxZ, z);
            }
        }
        for (float z : noiseZ) {
            if (!Float.isNaN(z)) {
                minZ = Math.min(minZ, z);
                maxZ = Math.max(maxZ, z);
            }
        }

        // Every z-score NaN -> no observable range; a sweep would be garbage.
        if (maxZ < minZ) {
            System.err.println("No scorable sentences (all z-scores NaN); aborting.");
            System.exit(1);
        }

        System.out.printf(Locale.US,
                "Z-score range: [%.2f, %.2f] "
                        + "(std devs from '%s' training mean)%n%n",
                minZ, maxZ, targetLang);

        System.out.printf(Locale.US,
                "%-12s %8s %8s %8s %9s %10s %10s%n",
                "Z-thresh", "Prec", "Recall", "F1",
                "SigKept%", "NoiseDrop%", "FalseDrop%");
        System.out.println("-".repeat(76));

        float stepSize = (maxZ - minZ) / steps;
        for (int i = 0; i <= steps; i++) {
            float threshold = minZ + i * stepSize;
            printRow(threshold, sigZ, noiseZ, signal.size());
        }
    }

    /**
     * Prints one precision/recall row for a single z-score threshold.
     * Treats "drop" as the positive class: tp = noise dropped, fp = signal
     * dropped, fn = noise kept, tn = signal kept. NaN scores count as
     * dropped (unscorable sentences never pass the filter).
     */
    private static void printRow(float threshold,
                                 float[] sigScores,
                                 float[] noiseScores,
                                 int signalSize) {
        int tp = 0; // noise correctly dropped (score < threshold)
        int fn = 0; // noise incorrectly kept (score >= threshold)
        int fp = 0; // signal incorrectly dropped
        int tn = 0; // signal correctly kept

        for (float s : noiseScores) {
            if (Float.isNaN(s) || s < threshold) {
                tp++;
            } else {
                fn++;
            }
        }
        for (float s : sigScores) {
            if (Float.isNaN(s) || s < threshold) {
                fp++;
            } else {
                tn++;
            }
        }

        double precision = (tp + fp) > 0
                ? (double) tp / (tp + fp) : 1.0;
        double recall = (tp + fn) > 0
                ? (double) tp / (tp + fn) : 0.0;
        double f1 = (precision + recall) > 0
                ? 2 * precision * recall / (precision + recall) : 0.0;
        double keptPct = 100.0 * tn / signalSize;
        // Guard: with --noise-ratio 0 there is no noise to drop.
        double noisePct = noiseScores.length > 0
                ? 100.0 * tp / noiseScores.length : 0.0;
        double falsePct = 100.0 * fp / signalSize;

        System.out.printf(Locale.US,
                "%12.4f %8.3f %8.3f %8.3f %8.1f%% %9.1f%% %9.1f%%%n",
                threshold, precision, recall, f1,
                keptPct, noisePct, falsePct);
    }

    /** Z-scores each sentence under the target language model. */
    private static float[] zScores(GenerativeLanguageModel model,
                                   String targetLang,
                                   List<String> sentences) {
        float[] result = new float[sentences.size()];
        for (int i = 0; i < sentences.size(); i++) {
            result[i] = model.zScore(sentences.get(i), targetLang);
        }
        return result;
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.langdetect.charsoup.tools;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.apache.tika.langdetect.charsoup.GenerativeLanguageModel;

/**
 * Measures how score mean and stddev vary with text length for selected
 * languages. Used to decide whether z-scores need length normalization
 * at runtime.
 *
 * <p>For each language, truncates training sentences to various character
 * lengths, scores them, and reports per-bucket (μ, σ, n). If σ follows
 * 1/√(charLen), a simple correction factor suffices at runtime.
 *
 * <h3>Usage</h3>
 * <pre>
 * java LengthCalibrationReport \
 *     --model generative.bin \
 *     --corpus /path/to/pool_filtered \
 *     --langs eng,fra,zho,jpn,ara,kor \
 *     [--max-per-lang 50000]
 * </pre>
 */
public class LengthCalibrationReport {

    private static final int DEFAULT_MAX = 50_000;
    /** Truncation buckets; the sentinel 99999 means "full sentence". */
    private static final int[] CHAR_LENGTHS = {10, 20, 30, 50, 75, 100, 150, 200, 500, 99999};

    public static void main(String[] args) throws Exception {
        Path modelPath = null;
        Path corpusPath = null;
        String langsArg = "eng,fra,zho,jpn,ara";
        int max = DEFAULT_MAX;

        for (int i = 0; i < args.length; i++) {
            switch (args[i]) {
                case "--model":
                    modelPath = Paths.get(args[++i]);
                    break;
                case "--corpus":
                    corpusPath = Paths.get(args[++i]);
                    break;
                case "--langs":
                    langsArg = args[++i];
                    break;
                case "--max-per-lang":
                    max = Integer.parseInt(args[++i]);
                    break;
                default:
                    System.err.println("Unknown option: " + args[i]);
                    System.exit(1);
            }
        }

        if (modelPath == null || corpusPath == null) {
            System.err.println(
                    "Usage: LengthCalibrationReport --model <bin> --corpus <dir> "
                            + "[--langs eng,fra,zho] [--max-per-lang 50000]");
            System.exit(1);
        }

        GenerativeLanguageModel model;
        try (InputStream is = new FileInputStream(modelPath.toFile())) {
            model = GenerativeLanguageModel.load(is);
        }

        String[] langs = langsArg.split(",");

        for (String lang : langs) {
            lang = lang.trim();
            if (!model.getLanguages().contains(lang)) {
                System.err.println("Skipping unknown language: " + lang);
                continue;
            }

            Path langFile = corpusPath.resolve(lang);
            if (!Files.exists(langFile)) {
                System.err.println("No corpus file for: " + lang);
                continue;
            }

            System.out.printf(Locale.US, "%n=== %s ===%n", lang);
            System.out.printf(Locale.US,
                    "%-10s %8s %10s %10s %12s %12s%n",
                    "MaxChars", "N", "μ(score)", "σ(score)",
                    "σ*√(len/50)", "μ(z-full)");
            System.out.println("-".repeat(70));

            // Read sentences once, reuse for every truncation bucket
            List<String> sentences = readSentences(langFile, max);

            for (int maxLen : CHAR_LENGTHS) {
                // Welford's online algorithm for numerically stable μ/σ
                long n = 0;
                double mean = 0.0;
                double m2 = 0.0;
                double zSum = 0.0;

                for (String sentence : sentences) {
                    String text = sentence.length() > maxLen
                            ? sentence.substring(0, maxLen) : sentence;
                    float score = model.score(text, lang);
                    if (Float.isNaN(score)) {
                        continue;
                    }
                    n++;
                    double delta = score - mean;
                    mean += delta / n;
                    m2 += delta * (score - mean);

                    float z = model.zScore(text, lang);
                    if (!Float.isNaN(z)) {
                        zSum += z;
                    }
                }

                double stdDev = n > 1 ? Math.sqrt(m2 / (n - 1)) : 0.0;
                // If σ ~ 1/√len, then σ*√(len/50) should be roughly constant
                double normalized = stdDev * Math.sqrt((double) Math.min(maxLen, 200) / 50.0);
                double meanZ = n > 0 ? zSum / n : 0.0;

                String label = maxLen >= 99999 ? "full" : String.valueOf(maxLen);
                System.out.printf(Locale.US,
                        "%-10s %,8d %10.4f %10.4f %12.4f %12.4f%n",
                        label, n, mean, stdDev, normalized, meanZ);
            }
        }
    }

    /**
     * Reads up to {@code max} non-blank, trimmed lines from {@code file}.
     * (Replaces an earlier LinkedHashMap-backed implementation that boxed an
     * index key per line for no benefit.)
     */
    private static List<String> readSentences(Path file, int max) throws Exception {
        List<String> lines = new ArrayList<>();
        try (BufferedReader reader = Files.newBufferedReader(
                file, StandardCharsets.UTF_8)) {
            String line;
            while ((line = reader.readLine()) != null && lines.size() < max) {
                String text = line.trim();
                if (!text.isEmpty()) {
                    lines.add(text);
                }
            }
        }
        return lines;
    }
}
the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.langdetect.charsoup.tools; + +import java.io.BufferedReader; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.DirectoryStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Locale; + +import org.apache.tika.langdetect.charsoup.GenerativeLanguageModel; +import org.apache.tika.langdetect.charsoup.ScriptAwareFeatureExtractor; + +/** + * Trains a {@link GenerativeLanguageModel} from a Leipzig-format corpus. + * + * <h3>Corpus format</h3> + * <pre> + * corpusDir/ + * eng/ + * sentences.txt (lineNum TAB sentence) + * zho/ + * sentences.txt + * jpn/ + * sentences.txt + * ... + * </pre> + * Each directory name is used as the language code. Any {@code .txt} file + * directly under a language directory is read; each line must contain at + * least one tab, and the text after the first tab is the sentence. 
+ * + * <h3>CJK detection</h3> + * A language is treated as CJK if at least 60% of the letter codepoints + * in a random sample of sentences are CJK/kana characters. You can + * override this with an explicit {@code --cjk} list on the command line. + * + * <h3>Usage</h3> + * <pre> + * java TrainGenerativeLanguageModel \ + * --corpus /path/to/Leipzig-corpus \ + * --output generative.bin \ + * [--max-per-lang 500000] \ + * [--add-k 0.01] \ + * [--cjk zho,jpn,cmn] + * </pre> + */ +public class TrainGenerativeLanguageModel { + + private static final int DEFAULT_MAX_PER_LANG = 500_000; + private static final float DEFAULT_ADD_K = 0.01f; + /** Fraction of letter codepoints that must be CJK to classify a language as CJK. */ + private static final float CJK_LETTER_THRESHOLD = 0.60f; + /** Number of sentences used to probe the script of an unknown language. */ + private static final int CJK_PROBE_SENTENCES = 500; + + public static void main(String[] args) throws Exception { + Path corpus = null; + Path output = null; + int maxPerLang = DEFAULT_MAX_PER_LANG; + float addK = DEFAULT_ADD_K; + List<String> forceCjk = new ArrayList<>(); + + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "--corpus": + corpus = Paths.get(args[++i]); + break; + case "--output": + output = Paths.get(args[++i]); + break; + case "--max-per-lang": + maxPerLang = Integer.parseInt(args[++i]); + break; + case "--add-k": + addK = Float.parseFloat(args[++i]); + break; + case "--cjk": { + for (String code : args[++i].split(",")) { + forceCjk.add(code.trim()); + } + break; + } + default: + System.err.println("Unknown option: " + args[i]); + printUsage(); + System.exit(1); + } + } + + if (corpus == null || output == null) { + printUsage(); + System.exit(1); + } + + new TrainGenerativeLanguageModel().run(corpus, output, maxPerLang, addK, forceCjk); + } + + private void run(Path corpusDir, Path outputPath, + int maxPerLang, float addK, + List<String> forceCjkList) throws IOException { + + // 
Support two corpus layouts: + // flat: corpusDir/{langCode} (one sentence per line, no tab prefix) + // Leipzig: corpusDir/{langCode}/*.txt (lineNum TAB sentence) + boolean flatLayout = isFlatLayout(corpusDir); + System.out.printf(Locale.US, "Corpus layout: %s%n", flatLayout ? "flat" : "Leipzig"); + + List<Path> langPaths = listLangPaths(corpusDir, flatLayout); + System.out.printf(Locale.US, "Found %d languages in %s%n", langPaths.size(), corpusDir); + + GenerativeLanguageModel.Builder builder = GenerativeLanguageModel.builder(); + + for (Path langPath : langPaths) { + String lang = langPath.getFileName().toString(); + boolean cjk = forceCjkList.contains(lang) + || probeCjk(langPath, flatLayout, CJK_PROBE_SENTENCES); + + System.out.printf(Locale.US, " %-12s %s%n", lang, cjk ? "CJK" : "non-CJK"); + builder.registerLanguage(lang, cjk); + } + + System.out.println("Accumulating n-gram counts …"); + long totalSentences = 0; + + for (Path langPath : langPaths) { + String lang = langPath.getFileName().toString(); + long counted = feedLanguage(builder, lang, langPath, flatLayout, maxPerLang); + totalSentences += counted; + System.out.printf(Locale.US, " %-12s %,d sentences%n", lang, counted); + } + + System.out.printf(Locale.US, "Total sentences: %,d%n", totalSentences); + System.out.printf(Locale.US, "Building model (add-k=%.4f) …%n", addK); + + GenerativeLanguageModel model = builder.build(addK); + + // Second pass: score training data to compute per-language μ and σ + System.out.println("Calibrating z-scores (second pass) …"); + for (Path langPath : langPaths) { + String lang = langPath.getFileName().toString(); + double[] stats = calibrateLanguage(model, lang, langPath, flatLayout, maxPerLang); + model.setStats(lang, (float) stats[0], (float) stats[1]); + System.out.printf(Locale.US, + " %-12s μ=%8.4f σ=%6.4f (n=%d)%n", + lang, stats[0], stats[1], (long) stats[2]); + } + + System.out.printf(Locale.US, "Writing model to %s …%n", outputPath); + try (OutputStream os = new 
FileOutputStream(outputPath.toFile())) { + model.save(os); + } + + long bytes = Files.size(outputPath); + System.out.printf(Locale.US, "Done. Model size: %,.0f KB%n", bytes / 1024.0); + } + + // ---- Corpus helpers ---- + + /** + * Returns true if the corpus uses the flat layout (files named by language + * code, one sentence per line) rather than the Leipzig layout (subdirectories + * containing {@code *.txt} files with {@code lineNum TAB sentence} lines). + */ + private static boolean isFlatLayout(Path corpusDir) throws IOException { + try (DirectoryStream<Path> stream = Files.newDirectoryStream(corpusDir)) { + for (Path p : stream) { + return Files.isRegularFile(p); + } + } + return true; + } + + /** + * List all language paths in the corpus directory, sorted. + * For flat layout: regular files. For Leipzig layout: subdirectories. + */ + private static List<Path> listLangPaths(Path corpusDir, + boolean flat) throws IOException { + List<Path> paths = new ArrayList<>(); + try (DirectoryStream<Path> stream = Files.newDirectoryStream(corpusDir, + p -> flat ? Files.isRegularFile(p) : Files.isDirectory(p))) { + for (Path p : stream) { + paths.add(p); + } + } + Collections.sort(paths); + return paths; + } + + /** + * Feed up to {@code maxPerLang} sentences from {@code langPath} into the builder. 
+ * + * @return number of sentences consumed + */ + private static long feedLanguage(GenerativeLanguageModel.Builder builder, + String lang, Path langPath, + boolean flat, + int maxPerLang) throws IOException { + long count = 0; + if (flat) { + try (BufferedReader reader = Files.newBufferedReader(langPath, + StandardCharsets.UTF_8)) { + String line; + while ((line = reader.readLine()) != null) { + String text = line.trim(); + if (text.isEmpty()) { + continue; + } + builder.addSample(lang, text); + count++; + if (maxPerLang > 0 && count >= maxPerLang) { + break; + } + } + } + } else { + List<Path> files = listTxtFiles(langPath); + outer: + for (Path file : files) { + try (BufferedReader reader = Files.newBufferedReader(file, + StandardCharsets.UTF_8)) { + String line; + while ((line = reader.readLine()) != null) { + int tab = line.indexOf('\t'); + if (tab < 0) { + continue; + } + String text = line.substring(tab + 1).trim(); + if (text.isEmpty()) { + continue; + } + builder.addSample(lang, text); + count++; + if (maxPerLang > 0 && count >= maxPerLang) { + break outer; + } + } + } + } + } + return count; + } + + /** + * Score every training sentence for {@code lang} against the built model + * and return {@code [mean, stdDev, count]} using Welford's online algorithm. 
+ */ + private static double[] calibrateLanguage( + GenerativeLanguageModel model, String lang, + Path langPath, boolean flat, int maxPerLang) throws IOException { + long n = 0; + double mean = 0.0; + double m2 = 0.0; + + if (flat) { + try (BufferedReader reader = Files.newBufferedReader( + langPath, StandardCharsets.UTF_8)) { + String line; + while ((line = reader.readLine()) != null) { + String text = line.trim(); + if (text.isEmpty()) { + continue; + } + float s = model.score(text, lang); + if (Float.isNaN(s)) { + continue; + } + n++; + double delta = s - mean; + mean += delta / n; + m2 += delta * (s - mean); + if (maxPerLang > 0 && n >= maxPerLang) { + break; + } + } + } + } else { + List<Path> files = listTxtFiles(langPath); + outer: + for (Path file : files) { + try (BufferedReader reader = Files.newBufferedReader( + file, StandardCharsets.UTF_8)) { + String line; + while ((line = reader.readLine()) != null) { + int tab = line.indexOf('\t'); + if (tab < 0) { + continue; + } + String text = line.substring(tab + 1).trim(); + if (text.isEmpty()) { + continue; + } + float s = model.score(text, lang); + if (Float.isNaN(s)) { + continue; + } + n++; + double delta = s - mean; + mean += delta / n; + m2 += delta * (s - mean); + if (maxPerLang > 0 && n >= maxPerLang) { + break outer; + } + } + } + } + } + + double stdDev = n > 1 ? Math.sqrt(m2 / (n - 1)) : 0.0; + return new double[]{mean, stdDev, n}; + } + + /** + * Probe a language path to decide whether it is CJK. + */ + private static boolean probeCjk(Path langPath, boolean flat, + int maxSentences) throws IOException { + long cjkLetters = 0; + long totalLetters = 0; + int sentences = 0; + + List<Path> files = flat + ? 
Collections.singletonList(langPath) : listTxtFiles(langPath); + + outer: + for (Path file : files) { + try (BufferedReader reader = Files.newBufferedReader(file, + StandardCharsets.UTF_8)) { + String line; + while ((line = reader.readLine()) != null) { + String text; + if (flat) { + text = line.trim(); + } else { + int tab = line.indexOf('\t'); + if (tab < 0) continue; + text = line.substring(tab + 1); + } + if (text.isEmpty()) continue; + int i = 0; + while (i < text.length()) { + int cp = text.codePointAt(i); + i += Character.charCount(cp); + if (Character.isLetter(cp)) { + totalLetters++; + if (ScriptAwareFeatureExtractor.isCjkOrKana( + Character.toLowerCase(cp))) { + cjkLetters++; + } + } + } + sentences++; + if (sentences >= maxSentences) { + break outer; + } + } + } + } + + if (totalLetters == 0) { + return false; + } + return (double) cjkLetters / totalLetters >= CJK_LETTER_THRESHOLD; + } + + private static List<Path> listTxtFiles(Path dir) throws IOException { + List<Path> files = new ArrayList<>(); + try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir, "*.txt")) { + for (Path p : stream) { + files.add(p); + } + } + Collections.sort(files); + return files; + } + + private static void printUsage() { + System.err.println("Usage: TrainGenerativeLanguageModel"); + System.err.println(" --corpus <corpusDir>"); + System.err.println(" --output <outputFile>"); + System.err.println(" [--max-per-lang <N>] (default 500000)"); + System.err.println(" [--add-k <k>] (default 0.01)"); + System.err.println(" [--cjk lang1,lang2,...] 
(override auto-detection)"); + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java index 31421a12c9..065dec2cad 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java @@ -17,6 +17,7 @@ package org.apache.tika.config; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -35,6 +36,7 @@ import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.detect.BOMDetector; import org.apache.tika.detect.CompositeEncodingDetector; import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.detect.EncodingResult; import org.apache.tika.detect.MetaEncodingDetector; import org.apache.tika.detect.MetadataCharsetDetector; import org.apache.tika.detect.OverrideEncodingDetector; @@ -46,6 +48,7 @@ import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.ml.chardetect.MojibusterEncodingDetector; import org.apache.tika.parser.AbstractEncodingDetectorParser; import org.apache.tika.parser.CompositeParser; +import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; import org.apache.tika.parser.html.HtmlEncodingDetector; @@ -294,6 +297,33 @@ public class TikaEncodingDetectorTest extends TikaTest { } } + // ----------------------------------------------------------------------- + 
// Solr integration-test regression (TIKA-4662) + // ----------------------------------------------------------------------- + + /** + * ASCII HTML with an explicit {@code <meta charset="UTF-8">} must be + * detected as UTF-8. The full detection chain is required: the HTML + * detector produces a DECLARATIVE UTF-8 result; CharSoupEncodingDetector + * sees that both UTF-8 and the statistical winner (windows-1252) decode + * the pure-ASCII bytes identically and therefore defers to the declaration. + */ + @Test + public void testAsciiHtmlWithMetaIsDetectedAsUtf8() throws Exception { + byte[] bytes = + "<html><head><meta charset=\"UTF-8\"></head><body>initial</body></html>" + .getBytes(StandardCharsets.UTF_8); + EncodingDetector detector = TikaLoader.loadDefault().loadEncodingDetectors(); + try (TikaInputStream tis = TikaInputStream.get(bytes)) { + List<EncodingResult> results = + detector.detect(tis, new Metadata(), new ParseContext()); + assertFalse(results.isEmpty(), "detector returned no result for ASCII HTML with meta"); + assertEquals(StandardCharsets.UTF_8, results.get(0).getCharset(), + "ASCII HTML with <meta charset=UTF-8> should be detected as UTF-8, got: " + + results.get(0).getCharset().name()); + } + } + @Test public void testArabicMisleadingCharsetHtml() throws Exception { // This HTML file is encoded in windows-1256 but declares charset=UTF-8
