This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4690-add-generative-models in repository https://gitbox.apache.org/repos/asf/tika.git
commit 975db37821b4701669a747ef6caf5a453f4dd5e8 Author: tballison <[email protected]> AuthorDate: Tue Mar 10 14:38:29 2026 -0400 wip initial commit --- .../ROOT/pages/advanced/charset-detection-eval.txt | 260 ++++++++ .../chardetect/CharsetDetectionRegressionTest.java | 162 +++++ .../charsoup/GenerativeLanguageModel.java | 708 +++++++++++++++++++++ .../charsoup/ScriptAwareFeatureExtractor.java | 2 +- .../charsoup/tools/CorpusDiversityAnalyzer.java | 274 ++++++++ .../charsoup/tools/CorpusFilterReport.java | 253 ++++++++ .../charsoup/tools/EvalGenerativeModel.java | 365 +++++++++++ .../langdetect/charsoup/tools/FilterBenchmark.java | 228 +++++++ .../charsoup/tools/LengthCalibrationReport.java | 171 +++++ .../tools/TrainGenerativeLanguageModel.java | 407 ++++++++++++ .../tika/config/TikaEncodingDetectorTest.java | 30 + 11 files changed, 2859 insertions(+), 1 deletion(-) diff --git a/docs/modules/ROOT/pages/advanced/charset-detection-eval.txt b/docs/modules/ROOT/pages/advanced/charset-detection-eval.txt new file mode 100644 index 0000000000..2a71a78a28 --- /dev/null +++ b/docs/modules/ROOT/pages/advanced/charset-detection-eval.txt @@ -0,0 +1,260 @@ + +=== Probe length: 20B === + N | --- ML ablation --------------------------------- | --- Baselines ----------------------- | +Charset | Stat R% S% | +ISO R% S% | +CJK R% S% | All R% S% | ICU4J R% S% | juniv R% S% | +---------------------------------------------------------------------------------------------------------------------------------- +Big5-HKSCS 5000 | 98.0 98.0 | 97.9 97.9 | 97.9 97.9 | 97.9 97.9 | 0.0 9.8 | 0.0 37.9 | +EUC-JP 5000 | 54.5 54.5 | 54.6 54.6 | 54.7 54.7 | 54.7 54.7 | 0.0 0.0 | 69.7 69.7 | +EUC-KR 5000 | 66.4 66.4 | 67.6 67.6 | 67.6 67.6 | 67.6 67.6 | 0.0 0.0 | 78.3 78.3 | +GB18030 5000 | 29.8 29.8 | 30.7 30.7 | 30.8 30.8 | 30.8 30.8 | 0.1 0.1 | 40.6 40.6 | +IBM1047 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 86.8 | 0.0 0.0 | +IBM420-ltr 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 
80.9 | 0.0 0.0 | +IBM420-rtl 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 84.2 | 0.0 0.0 | +IBM424-ltr 6195 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 69.1 | 0.0 0.0 | +IBM424-rtl 4717 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 69.6 | 0.0 0.0 | +IBM437 7516 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +IBM500 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 87.7 87.7 | 0.0 0.0 | +IBM850 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +IBM852 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +IBM855 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 90.7 90.7 | +IBM866 8442 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 43.0 43.0 | 93.2 93.2 | +ISO-2022-CN 5000 | 0.0 0.0 | 94.6 94.6 | 94.6 94.6 | 94.6 94.6 | 87.7 87.7 | 0.0 0.0 | +ISO-2022-JP 5000 | 0.0 0.0 | 94.0 94.0 | 94.0 94.0 | 94.0 94.0 | 77.9 77.9 | 93.9 93.9 | +ISO-2022-KR 5000 | 0.0 0.0 | 94.4 94.4 | 94.4 94.4 | 94.4 94.4 | 92.6 92.6 | 94.4 94.4 | +ISO-8859-16 10000 | 38.3 38.3 | 31.6 31.6 | 31.7 31.7 | 31.7 31.7 | 0.0 0.0 | 0.0 0.0 | +ISO-8859-3 5195 | 10.0 10.0 | 10.3 10.3 | 10.5 10.5 | 10.5 10.5 | 0.0 0.0 | 0.0 0.0 | +KOI8-R 8411 | 57.0 66.6 | 57.0 66.6 | 57.1 66.7 | 57.1 66.7 | 59.7 59.7 | 94.6 94.6 | +KOI8-U 5921 | 71.2 79.3 | 71.3 79.5 | 71.3 79.6 | 71.3 79.6 | 0.0 56.6 | 0.0 95.4 | +Shift_JIS 5000 | 84.3 84.3 | 86.6 86.6 | 86.6 86.6 | 86.6 86.6 | 0.0 0.0 | 71.6 71.6 | +US-ASCII 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 2.3 | 0.0 0.0 | +UTF-16-BE 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 68.7 68.7 | 0.0 0.0 | +UTF-16-LE 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 69.4 69.4 | 0.0 0.0 | +UTF-32-BE 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 100.0 100.0 | 0.0 0.0 | +UTF-32-LE 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 100.0 100.0 | 0.0 0.0 | +UTF-8 5000 | 61.4 61.4 | 61.1 61.1 | 61.5 61.5 | 61.5 61.5 | 77.6 77.6 | 78.2 78.2 | +windows-1250 10000 | 4.7 4.7 | 4.8 4.8 | 4.9 4.9 | 4.9 4.9 | 9.3 48.3 | 0.0 0.0 | 
+windows-1251 10000 | 82.4 82.4 | 82.5 82.5 | 82.6 82.6 | 82.6 82.6 | 56.1 56.4 | 71.4 71.5 | +windows-1252 10000 | 2.6 2.6 | 50.0 50.0 | 50.1 50.1 | 50.1 50.1 | 4.5 70.0 | 0.0 98.7 | +windows-1253 10000 | 55.2 55.2 | 55.4 55.4 | 55.5 55.5 | 55.5 55.5 | 1.9 63.3 | 0.1 80.8 | +windows-1254 10000 | 35.8 35.8 | 35.8 35.8 | 35.9 35.9 | 35.9 35.9 | 4.9 52.1 | 0.0 0.0 | +windows-1255 10000 | 86.7 86.7 | 86.7 86.7 | 86.7 86.7 | 86.7 86.7 | 3.5 27.2 | 93.9 96.2 | +windows-1256 10000 | 94.1 94.1 | 94.2 94.2 | 94.2 94.2 | 94.2 94.2 | 70.2 78.6 | 0.0 0.0 | +windows-1257 10000 | 18.8 18.8 | 17.0 17.0 | 17.1 17.1 | 17.1 17.1 | 0.0 0.0 | 0.0 0.0 | +windows-1258 10000 | 63.0 63.0 | 63.2 63.2 | 63.3 63.3 | 63.3 63.3 | 0.0 0.0 | 0.0 0.0 | +windows-874 10000 | 79.3 79.3 | 80.6 80.6 | 81.4 81.4 | 81.4 81.4 | 0.0 0.0 | 0.0 0.0 | +x-EUC-TW 5000 | 99.4 99.4 | 98.3 98.3 | 98.3 98.3 | 98.3 98.3 | 0.0 0.0 | 38.7 38.7 | +x-ISO-2022-CN-CNS 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +x-MacRoman 10000 | 6.1 6.1 | 6.2 6.2 | 6.3 6.3 | 6.3 6.3 | 0.0 0.0 | 0.0 0.0 | +x-mac-cyrillic 10000 | 68.8 68.8 | 69.2 69.2 | 69.3 69.3 | 69.3 69.3 | 0.0 0.0 | 49.3 49.3 | +------------------------------------------------------------------------------------------------------------------------ +OVERALL 326397 | 30.0 30.4 | 35.6 36.0 | 35.7 36.1 | 35.7 36.1 | 20.3 39.0 | 22.9 30.7 | + Stat=model only | +ISO=+C1-correction | +CJK=+grammar | All=ML+rules | R%=strict | S%=soft + µs/sample | 10.0 | 7.3 | 7.4 | 7.0 | 11.9 | 3.3 | + +=== Probe length: 50B === + N | --- ML ablation --------------------------------- | --- Baselines ----------------------- | +Charset | Stat R% S% | +ISO R% S% | +CJK R% S% | All R% S% | ICU4J R% S% | juniv R% S% | +---------------------------------------------------------------------------------------------------------------------------------- +Big5-HKSCS 5000 | 99.4 99.4 | 99.3 99.3 | 99.3 99.3 | 99.3 99.3 | 0.0 95.9 | 0.0 69.1 | +EUC-JP 5000 | 88.9 88.9 | 89.0 89.0 
| 89.0 89.0 | 89.0 89.0 | 86.7 86.7 | 90.2 90.2 | +EUC-KR 5000 | 92.7 92.7 | 93.3 93.3 | 93.3 93.3 | 93.3 93.3 | 89.4 89.4 | 96.0 96.0 | +GB18030 5000 | 65.0 65.0 | 66.4 66.4 | 66.5 66.5 | 66.5 66.5 | 71.8 71.8 | 83.2 83.2 | +IBM1047 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 80.2 | 0.0 0.0 | +IBM420-ltr 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 93.1 | 0.0 0.0 | +IBM420-rtl 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 94.7 | 0.0 0.0 | +IBM424-ltr 6195 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 86.1 | 0.0 0.0 | +IBM424-rtl 4717 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 87.6 | 0.0 0.0 | +IBM437 7516 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +IBM500 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 80.2 80.2 | 0.0 0.0 | +IBM850 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +IBM852 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +IBM855 10000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 97.0 97.0 | +IBM866 8442 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 62.0 62.0 | 98.1 98.1 | +ISO-2022-CN 5000 | 0.0 0.0 | 98.6 98.6 | 98.6 98.6 | 98.6 98.6 | 98.5 98.5 | 0.0 0.0 | +ISO-2022-JP 5000 | 0.0 0.0 | 97.4 97.4 | 97.4 97.4 | 97.4 97.4 | 93.9 93.9 | 97.3 97.3 | +ISO-2022-KR 5000 | 0.0 0.0 | 98.9 98.9 | 98.9 98.9 | 98.9 98.9 | 98.8 98.8 | 98.9 98.9 | +ISO-8859-16 10000 | 89.3 89.3 | 80.5 80.5 | 80.7 80.7 | 80.7 80.7 | 0.0 0.0 | 0.0 0.0 | +ISO-8859-3 5195 | 50.8 50.8 | 49.8 49.8 | 49.9 49.9 | 49.9 49.9 | 0.0 0.0 | 0.0 0.0 | +KOI8-R 8411 | 86.2 93.6 | 86.2 93.7 | 86.2 93.7 | 86.2 93.7 | 77.6 77.6 | 98.6 98.6 | +KOI8-U 5921 | 92.7 97.4 | 92.7 97.4 | 92.7 97.5 | 92.7 97.5 | 0.0 74.8 | 0.0 98.2 | +Shift_JIS 5000 | 95.3 95.3 | 96.1 96.1 | 96.1 96.1 | 96.1 96.1 | 88.5 88.5 | 94.5 94.5 | +US-ASCII 5000 | 0.0 0.5 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.3 | 0.0 0.0 | +UTF-16-BE 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 85.7 85.7 | 0.0 0.0 | +UTF-16-LE 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 
0.0 0.0 | 87.6 87.6 | 0.0 0.0 | +UTF-32-BE 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 100.0 100.0 | 0.0 0.0 | +UTF-32-LE 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 100.0 100.0 | 0.0 0.0 | +UTF-8 5000 | 84.4 84.4 | 88.3 88.3 | 88.5 88.5 | 88.5 88.5 | 92.4 92.4 | 93.3 93.3 | +windows-1250 10000 | 31.3 31.3 | 32.3 32.3 | 32.9 32.9 | 32.9 32.9 | 22.3 56.9 | 0.0 0.0 | +windows-1251 10000 | 95.8 95.8 | 95.8 95.8 | 95.9 95.9 | 95.9 95.9 | 76.0 76.1 | 80.9 80.9 | +windows-1252 10000 | 20.8 20.8 | 39.1 39.1 | 39.4 39.4 | 39.4 39.4 | 12.8 86.6 | 0.0 99.1 | +windows-1253 10000 | 90.2 90.2 | 90.3 90.3 | 90.3 90.3 | 90.3 90.3 | 8.2 86.8 | 0.1 91.5 | +windows-1254 10000 | 80.1 80.1 | 80.0 80.0 | 80.0 80.0 | 80.0 80.0 | 12.3 73.1 | 0.0 0.0 | +windows-1255 10000 | 97.8 97.8 | 97.8 97.8 | 97.8 97.8 | 97.8 97.8 | 9.4 40.2 | 98.1 99.4 | +windows-1256 10000 | 99.1 99.1 | 99.2 99.2 | 99.2 99.2 | 99.2 99.2 | 88.1 91.5 | 0.0 0.0 | +windows-1257 10000 | 72.0 72.0 | 67.2 67.2 | 67.3 67.3 | 67.3 67.3 | 0.0 0.0 | 0.0 0.0 | +windows-1258 10000 | 96.1 96.1 | 96.2 96.2 | 96.3 96.3 | 96.3 96.3 | 0.0 0.0 | 0.0 0.0 | +windows-874 10000 | 95.3 95.3 | 95.8 95.8 | 96.0 96.0 | 96.0 96.0 | 0.0 0.0 | 0.0 0.0 | +x-EUC-TW 5000 | 99.5 99.5 | 99.3 99.3 | 99.3 99.3 | 99.3 99.3 | 0.0 0.0 | 67.1 67.1 | +x-ISO-2022-CN-CNS 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +x-MacRoman 10000 | 58.5 58.5 | 56.4 56.4 | 56.6 56.6 | 56.6 56.6 | 0.0 0.0 | 0.0 0.0 | +x-mac-cyrillic 10000 | 81.0 81.0 | 81.2 81.2 | 81.2 81.2 | 81.2 81.2 | 0.0 0.0 | 62.3 62.3 | +------------------------------------------------------------------------------------------------------------------------ +OVERALL 326397 | 45.2 45.4 | 49.9 50.2 | 49.9 50.2 | 49.9 50.2 | 29.8 52.4 | 26.5 35.2 | + Stat=model only | +ISO=+C1-correction | +CJK=+grammar | All=ML+rules | R%=strict | S%=soft + µs/sample | 10.7 | 8.2 | 8.1 | 8.1 | 21.6 | 4.1 | + +=== Probe length: 100B === + N | --- ML ablation --------------------------------- | 
--- Baselines ----------------------- | +Charset | Stat R% S% | +ISO R% S% | +CJK R% S% | All R% S% | ICU4J R% S% | juniv R% S% | +---------------------------------------------------------------------------------------------------------------------------------- +Big5-HKSCS 5000 | 99.7 99.7 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 0.0 97.6 | 0.0 69.7 | +EUC-JP 5000 | 95.8 95.8 | 95.8 95.8 | 95.8 95.8 | 95.8 95.8 | 97.0 97.0 | 95.7 95.7 | +EUC-KR 5000 | 95.3 95.3 | 95.4 95.4 | 95.4 95.4 | 95.4 95.4 | 98.4 98.4 | 99.1 99.1 | +GB18030 5000 | 85.1 85.1 | 85.9 85.9 | 85.9 85.9 | 85.9 85.9 | 95.9 95.9 | 97.9 97.9 | +IBM1047 10000 | 49.5 94.7 | 49.6 94.9 | 49.6 94.9 | 49.6 94.9 | 0.0 83.2 | 0.0 0.0 | +IBM420-ltr 10000 | 92.6 95.0 | 92.6 95.0 | 92.6 95.0 | 92.6 95.0 | 0.0 95.0 | 0.0 0.0 | +IBM420-rtl 10000 | 94.1 94.6 | 94.1 94.6 | 94.1 94.6 | 94.1 94.6 | 0.0 96.3 | 0.0 0.0 | +IBM424-ltr 6195 | 91.6 91.8 | 74.8 75.0 | 74.8 75.0 | 74.8 75.0 | 0.0 92.4 | 0.0 0.0 | +IBM424-rtl 4717 | 93.0 95.1 | 74.2 75.5 | 74.2 75.5 | 74.2 75.5 | 0.0 89.4 | 0.0 0.0 | +IBM437 7516 | 0.0 79.0 | 0.0 77.8 | 0.0 77.8 | 0.0 77.8 | 0.0 0.0 | 0.0 0.0 | +IBM500 10000 | 56.0 94.9 | 56.1 95.1 | 56.1 95.1 | 56.1 95.1 | 83.5 83.5 | 0.0 0.0 | +IBM850 10000 | 79.8 79.8 | 79.5 79.5 | 79.5 79.5 | 79.5 79.5 | 0.0 0.0 | 0.0 0.0 | +IBM852 10000 | 77.4 77.4 | 78.1 78.1 | 78.1 78.1 | 78.1 78.1 | 0.0 0.0 | 0.0 0.0 | +IBM855 10000 | 94.6 94.6 | 94.7 94.7 | 94.7 94.7 | 94.7 94.7 | 0.0 0.0 | 99.0 99.0 | +IBM866 8442 | 95.9 95.9 | 96.0 96.0 | 96.0 96.0 | 96.0 96.0 | 78.8 78.8 | 99.3 99.3 | +ISO-2022-CN 5000 | 0.0 0.0 | 99.1 99.1 | 99.1 99.1 | 99.1 99.1 | 99.0 99.0 | 0.0 0.0 | +ISO-2022-JP 5000 | 0.0 0.0 | 99.1 99.1 | 99.1 99.1 | 99.1 99.1 | 98.8 98.8 | 99.1 99.1 | +ISO-2022-KR 5000 | 0.0 0.0 | 99.6 99.6 | 99.6 99.6 | 99.6 99.6 | 99.6 99.6 | 99.6 99.6 | +ISO-8859-16 10000 | 92.0 92.0 | 89.1 89.1 | 89.1 89.1 | 89.1 89.1 | 0.0 0.0 | 0.0 0.0 | +ISO-8859-3 5195 | 78.4 78.4 | 78.5 78.5 | 78.5 78.5 | 78.5 78.5 | 0.0 0.0 | 0.0 
0.0 | +KOI8-R 8411 | 95.7 99.0 | 95.7 99.0 | 95.7 99.0 | 95.7 99.0 | 90.5 90.5 | 99.4 99.4 | +KOI8-U 5921 | 96.9 99.5 | 96.9 99.5 | 96.9 99.5 | 96.9 99.5 | 0.0 85.4 | 0.0 98.9 | +Shift_JIS 5000 | 97.7 97.7 | 98.0 98.0 | 98.0 98.0 | 98.0 98.0 | 97.8 97.8 | 98.7 98.7 | +US-ASCII 5000 | 0.0 0.8 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +UTF-16-BE 5000 | 94.0 94.1 | 93.9 93.9 | 93.9 93.9 | 93.9 93.9 | 85.5 85.5 | 0.0 0.0 | +UTF-16-LE 5000 | 94.4 94.4 | 94.4 94.4 | 94.4 94.4 | 94.4 94.4 | 87.0 87.0 | 0.0 0.0 | +UTF-32-BE 5000 | 94.9 94.9 | 94.7 94.7 | 94.7 94.7 | 94.7 94.7 | 100.0 100.0 | 0.0 0.0 | +UTF-32-LE 5000 | 95.5 95.5 | 95.5 95.5 | 95.5 95.5 | 95.5 95.5 | 100.0 100.0 | 0.0 0.0 | +UTF-8 5000 | 90.6 90.6 | 93.2 93.2 | 93.2 93.2 | 93.2 93.2 | 97.3 97.3 | 97.2 97.2 | +windows-1250 10000 | 55.3 55.3 | 56.4 56.4 | 56.5 56.5 | 56.5 56.5 | 34.4 62.5 | 0.0 0.0 | +windows-1251 10000 | 98.8 98.8 | 98.8 98.8 | 98.8 98.8 | 98.8 98.8 | 85.7 85.7 | 86.2 86.2 | +windows-1252 10000 | 34.4 34.4 | 40.0 40.0 | 40.4 40.4 | 40.4 40.4 | 21.3 91.4 | 0.0 99.4 | +windows-1253 10000 | 97.1 97.1 | 97.1 97.1 | 97.1 97.1 | 97.1 97.1 | 15.4 93.7 | 0.2 95.3 | +windows-1254 10000 | 93.1 93.1 | 93.0 93.0 | 93.0 93.0 | 93.0 93.0 | 21.2 83.8 | 0.0 0.0 | +windows-1255 10000 | 99.0 99.0 | 99.0 99.0 | 99.0 99.0 | 99.0 99.0 | 14.6 46.8 | 99.0 99.9 | +windows-1256 10000 | 99.6 99.6 | 99.6 99.6 | 99.6 99.6 | 99.6 99.6 | 94.6 96.1 | 0.0 0.0 | +windows-1257 10000 | 87.1 87.1 | 85.7 85.7 | 85.7 85.7 | 85.7 85.7 | 0.0 0.0 | 0.0 0.0 | +windows-1258 10000 | 97.0 97.0 | 97.0 97.0 | 97.0 97.0 | 97.0 97.0 | 0.0 0.0 | 0.0 0.0 | +windows-874 10000 | 97.7 97.7 | 97.7 97.7 | 97.7 97.7 | 97.7 97.7 | 0.0 0.0 | 0.0 0.0 | +x-EUC-TW 5000 | 99.7 99.7 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 0.0 0.0 | 67.7 67.7 | +x-ISO-2022-CN-CNS 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +x-MacRoman 10000 | 80.0 80.0 | 79.2 79.2 | 79.2 79.2 | 79.2 79.2 | 0.0 0.0 | 0.0 0.0 | +x-mac-cyrillic 10000 | 88.6 
88.6 | 88.7 88.7 | 88.7 88.7 | 88.7 88.7 | 0.0 0.0 | 70.1 70.1 | +------------------------------------------------------------------------------------------------------------------------ +OVERALL 326397 | 78.0 82.6 | 82.1 86.7 | 82.1 86.7 | 82.1 86.7 | 33.4 56.3 | 27.6 36.4 | + Stat=model only | +ISO=+C1-correction | +CJK=+grammar | All=ML+rules | R%=strict | S%=soft + µs/sample | 10.8 | 8.2 | 8.0 | 7.9 | 36.3 | 5.7 | + +=== Probe length: 200B === + N | --- ML ablation --------------------------------- | --- Baselines ----------------------- | +Charset | Stat R% S% | +ISO R% S% | +CJK R% S% | All R% S% | ICU4J R% S% | juniv R% S% | +---------------------------------------------------------------------------------------------------------------------------------- +Big5-HKSCS 5000 | 99.8 99.8 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 0.0 97.8 | 0.0 69.7 | +EUC-JP 5000 | 98.4 98.4 | 98.4 98.4 | 98.4 98.4 | 98.4 98.4 | 99.2 99.2 | 97.9 97.9 | +EUC-KR 5000 | 99.1 99.1 | 99.1 99.1 | 99.1 99.1 | 99.1 99.1 | 99.8 99.8 | 99.8 99.8 | +GB18030 5000 | 94.0 94.0 | 94.5 94.5 | 94.5 94.5 | 94.5 94.5 | 98.4 98.4 | 99.4 99.4 | +IBM1047 10000 | 52.7 95.7 | 52.8 95.8 | 52.8 95.8 | 52.8 95.8 | 0.0 86.8 | 0.0 0.0 | +IBM420-ltr 10000 | 95.3 96.1 | 95.3 96.1 | 95.3 96.1 | 95.3 96.1 | 0.0 96.8 | 0.0 0.0 | +IBM420-rtl 10000 | 95.3 95.4 | 95.3 95.4 | 95.3 95.4 | 95.3 95.4 | 0.0 97.3 | 0.0 0.0 | +IBM424-ltr 6195 | 94.9 94.9 | 87.8 87.8 | 87.8 87.8 | 87.8 87.8 | 0.0 93.4 | 0.0 0.0 | +IBM424-rtl 4717 | 95.6 96.0 | 86.7 87.2 | 86.7 87.2 | 86.7 87.2 | 0.0 87.3 | 0.0 0.0 | +IBM437 7516 | 0.0 92.1 | 0.0 92.0 | 0.0 92.0 | 0.0 92.0 | 0.0 0.0 | 0.0 0.0 | +IBM500 10000 | 56.0 96.0 | 56.0 96.0 | 56.0 96.0 | 56.0 96.0 | 86.8 86.8 | 0.0 0.0 | +IBM850 10000 | 92.2 92.2 | 92.3 92.3 | 92.3 92.3 | 92.3 92.3 | 0.0 0.0 | 0.0 0.0 | +IBM852 10000 | 90.0 90.0 | 90.7 90.7 | 90.7 90.7 | 90.7 90.7 | 0.0 0.0 | 0.0 0.0 | +IBM855 10000 | 95.7 95.7 | 95.7 95.7 | 95.7 95.7 | 95.7 95.7 | 0.0 0.0 | 99.8 99.8 | +IBM866 8442 | 
96.2 96.2 | 96.2 96.2 | 96.2 96.2 | 96.2 96.2 | 89.5 89.5 | 99.7 99.7 | +ISO-2022-CN 5000 | 0.0 0.0 | 99.5 99.5 | 99.5 99.5 | 99.5 99.5 | 99.4 99.4 | 0.0 0.0 | +ISO-2022-JP 5000 | 0.0 0.0 | 99.6 99.6 | 99.6 99.6 | 99.6 99.6 | 99.4 99.4 | 99.6 99.6 | +ISO-2022-KR 5000 | 0.0 0.0 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | +ISO-8859-16 10000 | 95.3 95.3 | 94.9 94.9 | 94.9 94.9 | 94.9 94.9 | 0.0 0.0 | 0.0 0.0 | +ISO-8859-3 5195 | 93.9 93.9 | 94.3 94.3 | 94.3 94.3 | 94.3 94.3 | 0.0 0.0 | 0.0 0.0 | +KOI8-R 8411 | 98.6 99.8 | 98.6 99.8 | 98.6 99.8 | 98.6 99.8 | 95.8 95.8 | 99.7 99.7 | +KOI8-U 5921 | 98.9 99.9 | 98.9 99.9 | 98.9 99.9 | 98.9 99.9 | 0.0 92.5 | 0.0 99.3 | +Shift_JIS 5000 | 99.3 99.3 | 99.4 99.4 | 99.4 99.4 | 99.4 99.4 | 99.7 99.7 | 99.7 99.7 | +US-ASCII 5000 | 0.0 0.5 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +UTF-16-BE 5000 | 94.5 94.5 | 94.4 94.4 | 94.4 94.4 | 94.4 94.4 | 84.9 84.9 | 0.0 0.0 | +UTF-16-LE 5000 | 94.4 94.4 | 94.4 94.4 | 94.4 94.4 | 94.4 94.4 | 86.8 86.8 | 0.0 0.0 | +UTF-32-BE 5000 | 95.2 95.2 | 95.0 95.0 | 95.0 95.0 | 95.0 95.0 | 100.0 100.0 | 0.0 0.0 | +UTF-32-LE 5000 | 95.5 95.5 | 95.5 95.5 | 95.5 95.5 | 95.5 95.5 | 100.0 100.0 | 0.0 0.0 | +UTF-8 5000 | 96.1 96.1 | 96.7 96.7 | 96.7 96.7 | 96.7 96.7 | 99.5 99.5 | 98.1 98.1 | +windows-1250 10000 | 73.6 73.6 | 74.3 74.3 | 74.4 74.4 | 74.4 74.4 | 49.3 70.3 | 0.0 0.0 | +windows-1251 10000 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 92.6 92.6 | 89.1 89.1 | +windows-1252 10000 | 47.7 47.7 | 48.5 48.5 | 48.7 48.7 | 48.7 48.7 | 28.5 94.0 | 0.0 99.5 | +windows-1253 10000 | 99.4 99.4 | 99.4 99.4 | 99.4 99.4 | 99.4 99.4 | 22.0 97.2 | 0.3 97.6 | +windows-1254 10000 | 98.4 98.4 | 98.4 98.4 | 98.4 98.4 | 98.4 98.4 | 33.4 93.5 | 0.0 0.0 | +windows-1255 10000 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 20.2 57.9 | 99.7 99.9 | +windows-1256 10000 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 97.1 98.2 | 0.0 0.0 | +windows-1257 10000 | 95.1 95.1 | 95.1 95.1 | 95.1 
95.1 | 95.1 95.1 | 0.0 0.0 | 0.0 0.0 | +windows-1258 10000 | 99.0 99.0 | 99.0 99.0 | 99.0 99.0 | 99.0 99.0 | 0.0 0.0 | 0.0 0.0 | +windows-874 10000 | 99.4 99.4 | 99.4 99.4 | 99.4 99.4 | 99.4 99.4 | 0.0 0.0 | 0.0 0.0 | +x-EUC-TW 5000 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 0.0 0.0 | 67.7 67.7 | +x-ISO-2022-CN-CNS 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +x-MacRoman 10000 | 93.5 93.5 | 93.5 93.5 | 93.5 93.5 | 93.5 93.5 | 0.0 0.0 | 0.0 0.0 | +x-mac-cyrillic 10000 | 93.5 93.5 | 93.5 93.5 | 93.5 93.5 | 93.5 93.5 | 0.0 0.0 | 74.9 74.9 | +------------------------------------------------------------------------------------------------------------------------ +OVERALL 326397 | 82.1 86.9 | 86.5 91.2 | 86.5 91.3 | 86.5 91.3 | 35.8 58.6 | 28.0 36.9 | + Stat=model only | +ISO=+C1-correction | +CJK=+grammar | All=ML+rules | R%=strict | S%=soft + µs/sample | 12.4 | 9.2 | 8.7 | 8.9 | 60.7 | 8.2 | + +=== Probe length: full === + N | --- ML ablation --------------------------------- | --- Baselines ----------------------- | +Charset | Stat R% S% | +ISO R% S% | +CJK R% S% | All R% S% | ICU4J R% S% | juniv R% S% | +---------------------------------------------------------------------------------------------------------------------------------- +Big5-HKSCS 5000 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 0.0 97.8 | 0.0 69.7 | +EUC-JP 5000 | 99.6 99.6 | 99.6 99.6 | 99.6 99.6 | 99.6 99.6 | 99.9 99.9 | 99.0 99.0 | +EUC-KR 5000 | 99.7 99.7 | 99.7 99.7 | 99.7 99.7 | 99.7 99.7 | 100.0 100.0 | 99.9 99.9 | +GB18030 5000 | 97.1 97.1 | 97.5 97.5 | 97.5 97.5 | 97.5 97.5 | 99.4 99.4 | 99.7 99.7 | +IBM1047 10000 | 59.1 96.0 | 59.1 96.0 | 59.1 96.0 | 59.1 96.0 | 0.0 90.6 | 0.0 0.0 | +IBM420-ltr 10000 | 96.0 96.2 | 96.0 96.2 | 96.0 96.2 | 96.0 96.2 | 0.0 97.8 | 0.0 0.0 | +IBM420-rtl 10000 | 95.6 95.7 | 95.6 95.7 | 95.6 95.7 | 95.6 95.7 | 0.0 98.1 | 0.0 0.0 | +IBM424-ltr 6195 | 95.6 95.6 | 95.5 95.6 | 95.5 95.6 | 95.5 95.6 | 0.0 93.1 | 0.0 0.0 | +IBM424-rtl 
4717 | 96.2 96.3 | 96.2 96.3 | 96.2 96.3 | 96.2 96.3 | 0.0 87.3 | 0.0 0.0 | +IBM437 7516 | 0.0 95.9 | 0.0 96.0 | 0.0 96.0 | 0.0 96.0 | 0.0 0.0 | 0.0 0.0 | +IBM500 10000 | 52.6 96.3 | 52.6 96.3 | 52.6 96.3 | 52.6 96.3 | 90.6 90.6 | 0.0 0.0 | +IBM850 10000 | 95.8 95.8 | 95.9 95.9 | 95.9 95.9 | 95.9 95.9 | 0.0 0.0 | 0.0 0.0 | +IBM852 10000 | 95.2 95.2 | 95.5 95.5 | 95.5 95.5 | 95.5 95.5 | 0.0 0.0 | 0.0 0.0 | +IBM855 10000 | 95.9 95.9 | 95.9 95.9 | 95.9 95.9 | 95.9 95.9 | 0.0 0.0 | 99.9 99.9 | +IBM866 8442 | 96.2 96.2 | 96.2 96.2 | 96.2 96.2 | 96.2 96.2 | 95.8 95.8 | 99.8 99.8 | +ISO-2022-CN 5000 | 0.0 0.0 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 99.7 99.7 | 0.0 0.0 | +ISO-2022-JP 5000 | 0.0 0.0 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 99.7 99.7 | 99.8 99.8 | +ISO-2022-KR 5000 | 0.0 0.0 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | +ISO-8859-16 10000 | 98.5 98.5 | 98.5 98.5 | 98.5 98.5 | 98.5 98.5 | 0.0 0.0 | 0.0 0.0 | +ISO-8859-3 5195 | 98.4 98.4 | 98.7 98.7 | 98.7 98.7 | 98.7 98.7 | 0.0 0.0 | 0.0 0.0 | +KOI8-R 8411 | 99.4 99.8 | 99.4 99.8 | 99.4 99.8 | 99.4 99.8 | 98.0 98.0 | 99.8 99.8 | +KOI8-U 5921 | 99.5 99.9 | 99.5 99.9 | 99.5 99.9 | 99.5 99.9 | 0.0 96.5 | 0.0 99.8 | +Shift_JIS 5000 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 100.0 100.0 | 99.9 99.9 | +US-ASCII 5000 | 0.0 0.2 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +UTF-16-BE 5000 | 94.7 94.7 | 94.7 94.7 | 94.7 94.7 | 94.7 94.7 | 84.6 84.6 | 0.0 0.0 | +UTF-16-LE 5000 | 94.5 94.5 | 94.5 94.5 | 94.5 94.5 | 94.5 94.5 | 86.6 86.6 | 0.0 0.0 | +UTF-32-BE 5000 | 95.2 95.2 | 95.2 95.2 | 95.2 95.2 | 95.2 95.2 | 100.0 100.0 | 0.0 0.0 | +UTF-32-LE 5000 | 95.5 95.5 | 95.5 95.5 | 95.5 95.5 | 95.5 95.5 | 100.0 100.0 | 0.0 0.0 | +UTF-8 5000 | 98.8 98.8 | 98.0 98.0 | 98.1 98.1 | 98.1 98.1 | 99.9 99.9 | 97.7 97.7 | +windows-1250 10000 | 83.5 83.5 | 83.7 83.7 | 83.8 83.8 | 83.8 83.8 | 65.4 78.8 | 0.0 0.0 | +windows-1251 10000 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 96.4 96.4 | 90.7 90.7 | 
+windows-1252 10000 | 63.8 63.8 | 64.0 64.0 | 64.0 64.0 | 64.0 64.0 | 41.4 96.2 | 0.0 99.6 | +windows-1253 10000 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 33.1 98.9 | 0.4 99.0 | +windows-1254 10000 | 99.3 99.3 | 99.3 99.3 | 99.3 99.3 | 99.3 99.3 | 48.7 97.6 | 0.0 0.0 | +windows-1255 10000 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 30.4 71.6 | 99.9 100.0 | +windows-1256 10000 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 99.9 99.9 | 98.1 99.0 | 0.0 0.0 | +windows-1257 10000 | 98.5 98.5 | 98.6 98.6 | 98.6 98.6 | 98.6 98.6 | 0.0 0.0 | 0.0 0.0 | +windows-1258 10000 | 99.6 99.6 | 99.6 99.6 | 99.6 99.6 | 99.6 99.6 | 0.0 0.0 | 0.0 0.0 | +windows-874 10000 | 99.7 99.7 | 99.8 99.8 | 99.8 99.8 | 99.8 99.8 | 0.0 0.0 | 0.0 0.0 | +x-EUC-TW 5000 | 99.9 99.9 | 100.0 100.0 | 100.0 100.0 | 100.0 100.0 | 0.0 0.0 | 67.7 67.7 | +x-ISO-2022-CN-CNS 5000 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | 0.0 0.0 | +x-MacRoman 10000 | 98.5 98.5 | 98.6 98.6 | 98.7 98.7 | 98.7 98.7 | 0.0 0.0 | 0.0 0.0 | +x-mac-cyrillic 10000 | 96.9 96.9 | 96.9 96.9 | 96.9 96.9 | 96.9 96.9 | 0.0 0.0 | 75.0 75.0 | +------------------------------------------------------------------------------------------------------------------------ +OVERALL 326397 | 84.1 88.8 | 88.7 93.4 | 88.7 93.4 | 88.7 93.4 | 38.4 60.3 | 28.1 37.0 | + Stat=model only | +ISO=+C1-correction | +CJK=+grammar | All=ML+rules | R%=strict | S%=soft + µs/sample | 16.4 | 11.5 | 10.8 | 11.4 | 152.6 | 18.0 | diff --git a/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/CharsetDetectionRegressionTest.java b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/CharsetDetectionRegressionTest.java new file mode 100644 index 0000000000..1eb5f2b815 --- /dev/null +++ b/tika-encoding-detectors/tika-encoding-detector-mojibuster/src/test/java/org/apache/tika/ml/chardetect/CharsetDetectionRegressionTest.java @@ -0,0 +1,162 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.ml.chardetect; + +import static java.nio.charset.StandardCharsets.UTF_8; +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; + +import java.nio.charset.Charset; +import java.util.List; + +import org.junit.jupiter.api.Test; + +import org.apache.tika.detect.DefaultEncodingDetector; +import org.apache.tika.detect.EncodingResult; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.ParseContext; + +/** + * Regression tests for charset detection edge-cases that surfaced during + * integration testing with the CharSoup language-aware detector. + * + * <ul> + * <li><b>ASCII-only HTML</b> (Solr integration test regression): simple + * {@code <html><body>…</body></html>} content written as UTF-8 was + * returned as {@code ISO-8859-1} by the old detector chain. + * The correct answer is {@code UTF-8}.</li> + * <li><b>Short plain-text English</b> (TXTParserTest regression): a short + * English paragraph whose bytes are all in the ASCII range was returned + * as {@code ISO-8859-1} and in some cases as {@code UTF-16}. 
+ * The ML-based chain must not return UTF-16 for ASCII-range input.</li> + * <li><b>Shift-JIS ZIP entry name</b>: 9 raw bytes encoding {@code 文章1.txt} + * in Shift-JIS must be detected as {@code Shift_JIS}, not Big5-HKSCS. + * The raw ML logits favour Big5-HKSCS; the CharSoup language signal must + * override the model ranking.</li> + * </ul> + */ +public class CharsetDetectionRegressionTest { + + // 文章1.txt in Shift-JIS (9 raw bytes from a real zip entry) + private static final byte[] SJIS_RAW = hexToBytes("95b68fcd312e747874"); + + // Pure-ASCII HTML without a meta charset declaration — mirrors what the + // Solr integration test wrote before the meta-tag workaround was added. + // The old detector returned ISO-8859-1 for this without any meta tag. + // The new detector required adding <meta charset="UTF-8"> to avoid + // returning an unexpected charset. + private static final byte[] ASCII_HTML_NO_META = + "<html><body>initial</body></html>".getBytes(UTF_8); + + // English plain text from TXTParserTest — all bytes in the ASCII range + private static final byte[] ENGLISH_TEXT = + ("Hello, World! This is simple UTF-8 text content written" + + " in English to test autodetection of both the character" + + " encoding and the language of the input stream.").getBytes(UTF_8); + + // ----------------------------------------------------------------------- + // Solr integration-test regression + // ----------------------------------------------------------------------- + + /** + * ASCII HTML <em>without</em> a meta charset declaration must not be + * returned as UTF-16. + * + * <p>The old detector returned {@code ISO-8859-1} here without requiring + * any meta tag. The new detector regressed: without a meta tag it started + * returning an unexpected charset, which caused the Solr integration test + * to fail. The workaround was to add {@code <meta charset="UTF-8">} to + * the generated HTML — but we should not need to do that. 
UTF-8, + * US-ASCII, and ISO-8859-1 are all acceptable; UTF-16 is not.</p> + */ + @Test + public void asciiHtmlWithoutMetaIsNotDetectedAsUtf16() throws Exception { + DefaultEncodingDetector detector = new DefaultEncodingDetector(); + try (TikaInputStream tis = TikaInputStream.get(ASCII_HTML_NO_META)) { + List<EncodingResult> results = + detector.detect(tis, new Metadata(), new ParseContext()); + assertFalse(results.isEmpty(), "detector returned no result for ASCII HTML"); + Charset top = results.get(0).getCharset(); + assertFalse(top.name().startsWith("UTF-16"), + "ASCII HTML without meta tag must not be detected as UTF-16, got: " + + top.name()); + } + } + + // ----------------------------------------------------------------------- + // TXTParser regression + // ----------------------------------------------------------------------- + + /** + * A plain-English paragraph whose bytes are all in the ASCII range must + * be returned as {@code windows-1252} — the HTML5/WHATWG default for + * unlabeled 8-bit Western content and the statistical fallback for + * pure-ASCII bytes in the ML-based detector chain. + */ + @Test + public void englishPlainTextIsDetectedAsWindows1252() throws Exception { + DefaultEncodingDetector detector = new DefaultEncodingDetector(); + try (TikaInputStream tis = TikaInputStream.get(ENGLISH_TEXT)) { + List<EncodingResult> results = + detector.detect(tis, new Metadata(), new ParseContext()); + assertFalse(results.isEmpty(), "detector returned no result for English text"); + Charset top = results.get(0).getCharset(); + assertEquals("windows-1252", top.name(), + "Pure-ASCII English text should be detected as windows-1252, got: " + + top.name()); + } + } + + // ----------------------------------------------------------------------- + // Shift-JIS ZIP entry name + // ----------------------------------------------------------------------- + + /** + * 9 raw bytes encoding {@code 文章1.txt} in Shift-JIS must be identified + * as {@code Shift_JIS}. 
+ * + * <p>The same bytes are structurally valid Big5-HKSCS and ranked higher by + * the raw ML logits. CharSoup must override the model ranking using the + * Japanese language signal. ZipParser feeds entry names as raw byte arrays + * to the encoding detector, so a wrong answer here means garbled filenames + * in Japanese zip archives.</p> + */ + @Test + public void sjisZipEntryNameIsDetectedAsShiftJis() throws Exception { + DefaultEncodingDetector detector = new DefaultEncodingDetector(); + try (TikaInputStream tis = TikaInputStream.get(SJIS_RAW)) { + List<EncodingResult> results = + detector.detect(tis, new Metadata(), new ParseContext()); + assertFalse(results.isEmpty(), + "detector returned no result for SJIS filename bytes"); + Charset top = results.get(0).getCharset(); + assertEquals("Shift_JIS", top.name(), + "SJIS zip entry bytes should be detected as Shift_JIS, got: " + top.name()); + } + } + + // ----------------------------------------------------------------------- + + private static byte[] hexToBytes(String hex) { + byte[] b = new byte[hex.length() / 2]; + for (int i = 0; i < b.length; i++) { + b[i] = (byte) Integer.parseInt(hex.substring(i * 2, i * 2 + 2), 16); + } + return b; + } +} diff --git a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java new file mode 100644 index 0000000000..33a6bb38fd --- /dev/null +++ b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java @@ -0,0 +1,708 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.langdetect.charsoup; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.LinkedHashMap; +import java.util.List; +import java.util.Map; + +/** + * Dense INT8 generative character n-gram model for languageness scoring. + * + * <p>Computes an approximate per-n-gram average log P(text | language). + * Higher scores indicate the decoded text is more consistent with the named + * language. The score is used to arbitrate between candidate charsets when + * statistical decoders disagree on script or language. + * + * <h3>Feature types</h3> + * <ul> + * <li><b>CJK languages</b> (Han, Hiragana, Katakana): character unigrams + * and bigrams extracted from CJK/kana codepoints.</li> + * <li><b>Non-CJK languages</b>: character unigrams, bigrams (with + * word-boundary sentinels), and trigrams (with sentinels).</li> + * </ul> + * + * <p>Log-probabilities are quantized to unsigned INT8 over the range + * [{@link #LOGP_MIN}, 0] and stored in dense byte arrays. 
+ * + * <h3>Binary format ({@code GLM1} v2)</h3> + * <pre> + * INT magic = 0x474C4D31 + * INT version = 2 + * INT numLangs + * INT cjkUnigramBuckets + * INT cjkBigramBuckets + * INT noncjkUnigramBuckets + * INT noncjkBigramBuckets + * INT noncjkTrigramBuckets + * For each language: + * SHORT codeLen + * BYTES langCode (UTF-8) + * BYTE isCjk (0|1) + * FLOAT scoreMean (μ of score distribution on training data) + * FLOAT scoreStdDev (σ of score distribution on training data) + * BYTES unigramTable [cjkUnigramBuckets | noncjkUnigramBuckets] + * BYTES bigramTable [cjkBigramBuckets | noncjkBigramBuckets] + * BYTES trigramTable [noncjkTrigramBuckets] (absent for CJK) + * </pre> + */ +public class GenerativeLanguageModel { + + // ---- Bucket counts ---- + + public static final int CJK_UNIGRAM_BUCKETS = 8_192; + public static final int CJK_BIGRAM_BUCKETS = 32_768; + public static final int NONCJK_UNIGRAM_BUCKETS = 8_192; + public static final int NONCJK_BIGRAM_BUCKETS = 8_192; + public static final int NONCJK_TRIGRAM_BUCKETS = 16_384; + + /** + * Quantization floor. Log-probabilities below this value are clamped + * before quantizing; values stored in the table never go lower. + */ + public static final float LOGP_MIN = -18.0f; + + private static final int MAGIC = 0x474C4D31; // "GLM1" + private static final int VERSION = 2; + + // ---- FNV-1a basis constants ---- + + /** + * Bigram basis shared with {@link ScriptAwareFeatureExtractor} so that + * identical text produces the same bucket indices for both models. + */ + static final int BIGRAM_BASIS = ScriptAwareFeatureExtractor.BIGRAM_BASIS; + + /** + * CJK unigram basis shared with {@link ScriptAwareFeatureExtractor}. + */ + static final int CJK_UNIGRAM_BASIS = ScriptAwareFeatureExtractor.UNIGRAM_BASIS; + + /** Distinct salt for non-CJK character unigrams (not in discriminative model). */ + static final int NONCJK_UNIGRAM_BASIS = 0x1a3f7c4e; + + /** Distinct salt for character trigrams (not in discriminative model). 
*/ + static final int TRIGRAM_BASIS = 0x7e3d9b21; + + /** Word-boundary sentinel codepoint, matching the discriminative model. */ + static final int SENTINEL = '_'; + + // ---- Model state ---- + + private final List<String> langIds; + private final Map<String, Integer> langIndex; + private final boolean[] isCjk; + private final byte[][] unigramTables; // [langIdx][bucket] + private final byte[][] bigramTables; // [langIdx][bucket] + private final byte[][] trigramTables; // [langIdx][bucket]; null entry for CJK langs + private final float[] scoreMeans; // μ per language (from training data) + private final float[] scoreStdDevs; // σ per language (from training data) + + private GenerativeLanguageModel( + List<String> langIds, + boolean[] isCjk, + byte[][] unigramTables, + byte[][] bigramTables, + byte[][] trigramTables, + float[] scoreMeans, + float[] scoreStdDevs) { + this.langIds = Collections.unmodifiableList(new ArrayList<>(langIds)); + this.isCjk = isCjk; + this.unigramTables = unigramTables; + this.bigramTables = bigramTables; + this.trigramTables = trigramTables; + this.scoreMeans = scoreMeans; + this.scoreStdDevs = scoreStdDevs; + Map<String, Integer> idx = new HashMap<>(langIds.size() * 2); + for (int i = 0; i < langIds.size(); i++) { + idx.put(langIds.get(i), i); + } + this.langIndex = Collections.unmodifiableMap(idx); + } + + // ---- Public API ---- + + public List<String> getLanguages() { + return langIds; + } + + public boolean isCjk(String language) { + Integer i = langIndex.get(language); + return i != null && isCjk[i]; + } + + /** + * Per-n-gram average log-probability of {@code text} under {@code language}. + * + * @return a value in [{@link #LOGP_MIN}, 0], or {@link Float#NaN} if the + * language is unknown or the text yields no scorable n-grams. 
+ */ + public float score(String text, String language) { + if (text == null || text.isEmpty()) { + return Float.NaN; + } + Integer li = langIndex.get(language); + if (li == null) { + return Float.NaN; + } + String preprocessed = CharSoupFeatureExtractor.preprocess(text); + if (preprocessed.isEmpty()) { + return Float.NaN; + } + + double[] sum = {0.0}; + int[] cnt = {0}; + + if (isCjk[li]) { + byte[] uniT = unigramTables[li]; + byte[] biT = bigramTables[li]; + extractCjkNgrams(preprocessed, + h -> { + sum[0] += dequantize(uniT[h % CJK_UNIGRAM_BUCKETS]); + cnt[0]++; + }, + h -> { + sum[0] += dequantize(biT[h % CJK_BIGRAM_BUCKETS]); + cnt[0]++; + }); + } else { + byte[] uniT = unigramTables[li]; + byte[] biT = bigramTables[li]; + byte[] triT = trigramTables[li]; + extractNonCjkNgrams(preprocessed, + h -> { + sum[0] += dequantize(uniT[h % NONCJK_UNIGRAM_BUCKETS]); + cnt[0]++; + }, + h -> { + sum[0] += dequantize(biT[h % NONCJK_BIGRAM_BUCKETS]); + cnt[0]++; + }, + h -> { + sum[0] += dequantize(triT[h % NONCJK_TRIGRAM_BUCKETS]); + cnt[0]++; + }); + } + + return cnt[0] == 0 ? Float.NaN : (float) (sum[0] / cnt[0]); + } + + /** + * Score {@code text} against all languages and return the best match. + * + * @return an entry {@code (languageCode, score)}, or {@code null} if no + * language yields a finite score. + */ + public Map.Entry<String, Float> bestMatch(String text) { + String best = null; + float bestScore = Float.NEGATIVE_INFINITY; + for (String lang : langIds) { + float s = score(text, lang); + if (!Float.isNaN(s) && s > bestScore) { + bestScore = s; + best = lang; + } + } + return best == null ? null : Map.entry(best, bestScore); + } + + /** + * Z-score of {@code text} under {@code language}: + * {@code (score(text, language) - μ) / σ}, where μ and σ were computed + * from the language's training corpus. + * + * <p>Appropriate when the input text is roughly the same length as + * training sentences. 
For short or variable-length text, prefer + * {@link #zScoreLengthAdjusted}. + * + * @return the z-score, or {@link Float#NaN} if the language is unknown, + * the text yields no scorable n-grams, or σ is zero/uncalibrated. + */ + public float zScore(String text, String language) { + Integer li = langIndex.get(language); + if (li == null || scoreStdDevs[li] <= 0.0f) { + return Float.NaN; + } + float s = score(text, language); + if (Float.isNaN(s)) { + return Float.NaN; + } + return (s - scoreMeans[li]) / scoreStdDevs[li]; + } + + /** + * Approximate character length of a typical training sentence. + * Used by {@link #zScoreLengthAdjusted} to inflate σ for short text. + * Empirically derived from the calibration data: score σ scales as + * roughly 1/√(charLen) and stabilises around this length. + */ + static final int CALIBRATION_CHAR_LENGTH = 120; + + /** Floor on text length to avoid extreme σ inflation. */ + static final int MIN_ADJUSTED_CHAR_LENGTH = 10; + + /** + * Length-adjusted z-score of {@code text} under {@code language}. + * + * <p>Score variance scales as approximately 1/√(textLength). The + * stored σ was calibrated on full training sentences (typically + * ~{@value #CALIBRATION_CHAR_LENGTH} characters). For shorter text + * this method inflates σ proportionally, preventing spurious low + * z-scores on short snippets. For text at or above the calibration + * length, the result equals {@link #zScore}. + * + * @return the adjusted z-score, or {@link Float#NaN} if the language + * is unknown, the text yields no scorable n-grams, or σ is + * zero/uncalibrated. 
+ */ + public float zScoreLengthAdjusted(String text, String language) { + Integer li = langIndex.get(language); + if (li == null || scoreStdDevs[li] <= 0.0f) { + return Float.NaN; + } + float s = score(text, language); + if (Float.isNaN(s)) { + return Float.NaN; + } + int textLen = text.length(); + float adjustment = (float) Math.sqrt( + (double) CALIBRATION_CHAR_LENGTH + / Math.max(textLen, MIN_ADJUSTED_CHAR_LENGTH)); + float adjustedSigma = scoreStdDevs[li] * Math.max(1.0f, adjustment); + return (s - scoreMeans[li]) / adjustedSigma; + } + + /** + * Set the calibration statistics for a language. Typically called by + * the training tool after a second pass over the training corpus. + */ + public void setStats(String language, float mean, float stdDev) { + Integer li = langIndex.get(language); + if (li == null) { + throw new IllegalArgumentException("Unknown language: " + language); + } + scoreMeans[li] = mean; + scoreStdDevs[li] = stdDev; + } + + // ---- N-gram extraction (shared by scoring and training) ---- + + /** + * Callback receiving a non-negative raw FNV hash for a single n-gram. + * The caller is responsible for reducing it modulo a table size. + */ + @FunctionalInterface + public interface HashConsumer { + void consume(int hash); + } + + /** + * Extract CJK character unigrams and bigrams from preprocessed text, + * delivering raw (positive) hashes to the supplied sinks. 
+ */ + public static void extractCjkNgrams( + String text, + HashConsumer unigramSink, + HashConsumer bigramSink) { + int prevCp = -1; + int i = 0; + int len = text.length(); + while (i < len) { + int cp = text.codePointAt(i); + i += Character.charCount(cp); + if (!Character.isLetter(cp)) { + prevCp = -1; + continue; + } + int lower = Character.toLowerCase(cp); + if (!ScriptAwareFeatureExtractor.isCjkOrKana(lower)) { + prevCp = -1; + continue; + } + int script = ScriptCategory.of(lower); + unigramSink.consume(cjkUnigramHash(script, lower)); + if (prevCp >= 0) { + bigramSink.consume(bigramHash(script, prevCp, lower)); + } + prevCp = lower; + } + } + + /** + * Extract non-CJK character unigrams, sentinel-padded bigrams, and + * sentinel-padded trigrams from preprocessed text. + * + * <p>A "word" is a maximal run of non-CJK letter codepoints within the + * same script family. Sentinels ({@link #SENTINEL}) pad each word on + * both sides, so a word of length L yields L+1 bigrams and L+2 trigrams. 
+ */ + public static void extractNonCjkNgrams( + String text, + HashConsumer unigramSink, + HashConsumer bigramSink, + HashConsumer trigramSink) { + int prevPrev = SENTINEL; + int prev = SENTINEL; + int prevScript = -1; + boolean inWord = false; + + int i = 0; + int len = text.length(); + while (i < len) { + int cp = text.codePointAt(i); + i += Character.charCount(cp); + + if (cp >= 0x0300 && CharSoupFeatureExtractor.isTransparent(cp)) { + continue; + } + + if (Character.isLetter(cp)) { + int lower = Character.toLowerCase(cp); + if (ScriptAwareFeatureExtractor.isCjkOrKana(lower)) { + if (inWord) { + emitWordEnd(prevScript, prevPrev, prev, bigramSink, trigramSink); + inWord = false; + prevPrev = SENTINEL; + prev = SENTINEL; + prevScript = -1; + } + continue; + } + int script = ScriptCategory.of(lower); + + if (inWord && script != prevScript) { + // Script change is a word boundary + emitWordEnd(prevScript, prevPrev, prev, bigramSink, trigramSink); + inWord = false; + prevPrev = SENTINEL; + prev = SENTINEL; + } + + unigramSink.consume(noncjkUnigramHash(script, lower)); + + if (!inWord) { + // Leading sentinels + bigramSink.consume(bigramHash(script, SENTINEL, lower)); + trigramSink.consume(trigramHash(script, SENTINEL, SENTINEL, lower)); + prevPrev = SENTINEL; + } else { + bigramSink.consume(bigramHash(script, prev, lower)); + trigramSink.consume(trigramHash(script, prevPrev, prev, lower)); + prevPrev = prev; + } + prev = lower; + prevScript = script; + inWord = true; + } else { + if (inWord) { + emitWordEnd(prevScript, prevPrev, prev, bigramSink, trigramSink); + inWord = false; + prevPrev = SENTINEL; + prev = SENTINEL; + prevScript = -1; + } + } + } + + if (inWord) { + emitWordEnd(prevScript, prevPrev, prev, bigramSink, trigramSink); + } + } + + private static void emitWordEnd( + int script, int pp, int p, + HashConsumer bigramSink, HashConsumer trigramSink) { + bigramSink.consume(bigramHash(script, p, SENTINEL)); + trigramSink.consume(trigramHash(script, pp, p, 
SENTINEL)); + trigramSink.consume(trigramHash(script, p, SENTINEL, SENTINEL)); + } + + // ---- Hash functions (FNV-1a) ---- + + static int cjkUnigramHash(int script, int cp) { + int h = CJK_UNIGRAM_BASIS; + h = fnvByte(h, script); + h = fnvInt(h, cp); + return h & 0x7FFFFFFF; + } + + static int noncjkUnigramHash(int script, int cp) { + int h = NONCJK_UNIGRAM_BASIS; + h = fnvByte(h, script); + h = fnvInt(h, cp); + return h & 0x7FFFFFFF; + } + + static int bigramHash(int script, int cp1, int cp2) { + int h = BIGRAM_BASIS; + h = fnvByte(h, script); + h = fnvInt(h, cp1); + h = fnvInt(h, cp2); + return h & 0x7FFFFFFF; + } + + static int trigramHash(int script, int cp1, int cp2, int cp3) { + int h = TRIGRAM_BASIS; + h = fnvByte(h, script); + h = fnvInt(h, cp1); + h = fnvInt(h, cp2); + h = fnvInt(h, cp3); + return h & 0x7FFFFFFF; + } + + private static int fnvByte(int h, int b) { + return (h ^ (b & 0xFF)) * 0x01000193; + } + + private static int fnvInt(int h, int v) { + h = (h ^ (v & 0xFF)) * 0x01000193; + h = (h ^ ((v >>> 8) & 0xFF)) * 0x01000193; + h = (h ^ ((v >>> 16) & 0xFF)) * 0x01000193; + h = (h ^ ((v >>> 24) & 0xFF)) * 0x01000193; + return h; + } + + // ---- Quantization ---- + + /** + * Quantize a log-probability in [{@link #LOGP_MIN}, 0] to an unsigned byte + * value: 0 maps to {@code LOGP_MIN}, 255 maps to 0. + */ + static byte quantize(float logP) { + float clamped = Math.max(LOGP_MIN, Math.min(0.0f, logP)); + return (byte) Math.round((clamped - LOGP_MIN) / (-LOGP_MIN) * 255.0f); + } + + /** Inverse of {@link #quantize}. */ + static float dequantize(byte b) { + return (b & 0xFF) / 255.0f * (-LOGP_MIN) + LOGP_MIN; + } + + // ---- Serialization ---- + + /** + * Deserialize a model from the GLM1 binary format. 
+ */ + public static GenerativeLanguageModel load(InputStream is) throws IOException { + DataInputStream din = new DataInputStream(new BufferedInputStream(is)); + + int magic = din.readInt(); + if (magic != MAGIC) { + throw new IOException("Not a GLM1 file (bad magic)"); + } + int version = din.readInt(); + if (version != 1 && version != VERSION) { + throw new IOException("Unsupported GLM version: " + version); + } + boolean hasStats = version >= 2; + + int numLangs = din.readInt(); + int cjkUni = din.readInt(); + int cjkBi = din.readInt(); + int noncjkUni = din.readInt(); + int noncjkBi = din.readInt(); + int noncjkTri = din.readInt(); + + List<String> langIds = new ArrayList<>(numLangs); + boolean[] isCjk = new boolean[numLangs]; + byte[][] unigramTables = new byte[numLangs][]; + byte[][] bigramTables = new byte[numLangs][]; + byte[][] trigramTables = new byte[numLangs][]; + float[] means = new float[numLangs]; + float[] stdDevs = new float[numLangs]; + + for (int i = 0; i < numLangs; i++) { + int codeLen = din.readUnsignedShort(); + byte[] codeBytes = new byte[codeLen]; + din.readFully(codeBytes); + langIds.add(new String(codeBytes, StandardCharsets.UTF_8)); + + isCjk[i] = din.readByte() != 0; + + if (hasStats) { + means[i] = din.readFloat(); + stdDevs[i] = din.readFloat(); + } + + int uniSize = isCjk[i] ? cjkUni : noncjkUni; + int biSize = isCjk[i] ? cjkBi : noncjkBi; + + unigramTables[i] = new byte[uniSize]; + din.readFully(unigramTables[i]); + + bigramTables[i] = new byte[biSize]; + din.readFully(bigramTables[i]); + + if (!isCjk[i]) { + trigramTables[i] = new byte[noncjkTri]; + din.readFully(trigramTables[i]); + } + } + + return new GenerativeLanguageModel(langIds, isCjk, + unigramTables, bigramTables, trigramTables, + means, stdDevs); + } + + /** + * Serialize this model to the GLM1 binary format. 
+ */ + public void save(OutputStream os) throws IOException { + DataOutputStream dout = new DataOutputStream(new BufferedOutputStream(os)); + + dout.writeInt(MAGIC); + dout.writeInt(VERSION); + dout.writeInt(langIds.size()); + dout.writeInt(CJK_UNIGRAM_BUCKETS); + dout.writeInt(CJK_BIGRAM_BUCKETS); + dout.writeInt(NONCJK_UNIGRAM_BUCKETS); + dout.writeInt(NONCJK_BIGRAM_BUCKETS); + dout.writeInt(NONCJK_TRIGRAM_BUCKETS); + + for (int i = 0; i < langIds.size(); i++) { + byte[] codeBytes = langIds.get(i).getBytes(StandardCharsets.UTF_8); + dout.writeShort(codeBytes.length); + dout.write(codeBytes); + dout.writeByte(isCjk[i] ? 1 : 0); + dout.writeFloat(scoreMeans[i]); + dout.writeFloat(scoreStdDevs[i]); + dout.write(unigramTables[i]); + dout.write(bigramTables[i]); + if (!isCjk[i]) { + dout.write(trigramTables[i]); + } + } + dout.flush(); + } + + // ---- Builder ---- + + public static Builder builder() { + return new Builder(); + } + + /** + * Accumulates training samples per language and produces a + * {@link GenerativeLanguageModel} via add-k smoothing. + */ + public static class Builder { + + private final Map<String, Boolean> cjkFlags = new LinkedHashMap<>(); + private final Map<String, long[]> unigramCounts = new HashMap<>(); + private final Map<String, long[]> bigramCounts = new HashMap<>(); + private final Map<String, long[]> trigramCounts = new HashMap<>(); + + /** + * Register a language before feeding it samples. Must be called + * before {@link #addSample(String, String)}. + */ + public Builder registerLanguage(String langCode, boolean isCjk) { + cjkFlags.put(langCode, isCjk); + unigramCounts.put(langCode, + new long[isCjk ? CJK_UNIGRAM_BUCKETS : NONCJK_UNIGRAM_BUCKETS]); + bigramCounts.put(langCode, + new long[isCjk ? CJK_BIGRAM_BUCKETS : NONCJK_BIGRAM_BUCKETS]); + if (!isCjk) { + trigramCounts.put(langCode, new long[NONCJK_TRIGRAM_BUCKETS]); + } + return this; + } + + /** + * Add a text sample for the named language. 
The language must have + * been registered via {@link #registerLanguage} first. + */ + public Builder addSample(String langCode, String text) { + Boolean cjk = cjkFlags.get(langCode); + if (cjk == null) { + throw new IllegalArgumentException("Unknown language: " + langCode); + } + String pp = CharSoupFeatureExtractor.preprocess(text); + if (pp.isEmpty()) { + return this; + } + + long[] ug = unigramCounts.get(langCode); + long[] bg = bigramCounts.get(langCode); + + if (cjk) { + extractCjkNgrams(pp, + h -> ug[h % CJK_UNIGRAM_BUCKETS]++, + h -> bg[h % CJK_BIGRAM_BUCKETS]++); + } else { + long[] tg = trigramCounts.get(langCode); + extractNonCjkNgrams(pp, + h -> ug[h % NONCJK_UNIGRAM_BUCKETS]++, + h -> bg[h % NONCJK_BIGRAM_BUCKETS]++, + h -> tg[h % NONCJK_TRIGRAM_BUCKETS]++); + } + return this; + } + + /** + * Finalize training with add-{@code k} smoothing and return the model. + * + * @param addK smoothing constant; 0.01 is a reasonable default + */ + public GenerativeLanguageModel build(float addK) { + List<String> ids = new ArrayList<>(cjkFlags.keySet()); + int n = ids.size(); + + boolean[] cjkArr = new boolean[n]; + byte[][] uniTables = new byte[n][]; + byte[][] biTables = new byte[n][]; + byte[][] triTables = new byte[n][]; + + for (int i = 0; i < n; i++) { + String lang = ids.get(i); + cjkArr[i] = cjkFlags.get(lang); + uniTables[i] = toLogProbTable(unigramCounts.get(lang), addK); + biTables[i] = toLogProbTable(bigramCounts.get(lang), addK); + if (!cjkArr[i]) { + triTables[i] = toLogProbTable(trigramCounts.get(lang), addK); + } + } + return new GenerativeLanguageModel(ids, cjkArr, uniTables, biTables, triTables, + new float[n], new float[n]); + } + + private static byte[] toLogProbTable(long[] counts, float addK) { + long total = 0; + for (long c : counts) { + total += c; + } + double denom = total + (double) addK * counts.length; + byte[] table = new byte[counts.length]; + for (int i = 0; i < counts.length; i++) { + double p = (counts[i] + addK) / denom; + 
table[i] = quantize((float) Math.log(p)); + } + return table; + } + } +} diff --git a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java index 78758a62d6..2680c11a70 100644 --- a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java +++ b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java @@ -278,7 +278,7 @@ public class ScriptAwareFeatureExtractor implements FeatureExtractor { == Character.SPACE_SEPARATOR; } - static boolean isCjkOrKana(int cp) { + public static boolean isCjkOrKana(int cp) { if (Character.isIdeographic(cp)) { return true; } diff --git a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CorpusDiversityAnalyzer.java b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CorpusDiversityAnalyzer.java new file mode 100644 index 0000000000..b6e44eacc6 --- /dev/null +++ b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CorpusDiversityAnalyzer.java @@ -0,0 +1,274 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.langdetect.charsoup.tools; + +import java.io.BufferedReader; +import java.io.IOException; +import java.nio.charset.StandardCharsets; +import java.nio.file.DirectoryStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Locale; +import java.util.Set; + +import org.apache.tika.langdetect.charsoup.CharSoupFeatureExtractor; +import org.apache.tika.langdetect.charsoup.GenerativeLanguageModel; +import org.apache.tika.langdetect.charsoup.ScriptAwareFeatureExtractor; + +/** + * Measures corpus diversity for each language in a flat-file corpus directory. + * + * <p>Three complementary metrics are computed entirely from the training + * sentences — no external evaluation set required: + * + * <ol> + * <li><b>Bigram bucket fill %</b>: fraction of the bigram hash table that + * has at least one count after seeing all training sentences. A corpus + * of near-identical stubs reuses the same n-grams over and over and + * fills a small fraction of buckets regardless of corpus size.</li> + * <li><b>Normalised bigram entropy</b>: Shannon entropy of the bigram count + * distribution divided by log2(filled buckets). A perfectly uniform + * distribution scores 1.0; a corpus dominated by a handful of repeated + * patterns scores near 0.</li> + * <li><b>Unique sentence %</b>: fraction of distinct lines. 
Templated + * corpora have many near- or exact-duplicate sentences.</li> + * </ol> + * + * <p>Languages whose fill% and entropy fall far below the median are flagged + * as potentially low-quality. + * + * <h3>Usage</h3> + * <pre> + * java CorpusDiversityAnalyzer \ + * --corpus /path/to/pool_filtered \ + * [--max-per-lang 100000] \ + * [--flag-below 0.5] + * </pre> + */ +public class CorpusDiversityAnalyzer { + + private static final int DEFAULT_MAX_PER_LANG = 100_000; + private static final double DEFAULT_FLAG_BELOW = 0.5; + + public static void main(String[] args) throws Exception { + Path corpus = null; + int maxPerLang = DEFAULT_MAX_PER_LANG; + double flagBelow = DEFAULT_FLAG_BELOW; + + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "--corpus": + corpus = Paths.get(args[++i]); + break; + case "--max-per-lang": + maxPerLang = Integer.parseInt(args[++i]); + break; + case "--flag-below": + flagBelow = Double.parseDouble(args[++i]); + break; + default: + System.err.println("Unknown option: " + args[i]); + System.exit(1); + } + } + if (corpus == null) { + System.err.println("Usage: CorpusDiversityAnalyzer --corpus <dir> " + + "[--max-per-lang N] [--flag-below 0.5]"); + System.exit(1); + } + + List<Path> langPaths = listRegularFiles(corpus); + System.out.printf(Locale.US, "Analysing %d languages in %s " + + "(max %,d sentences each)%n%n", + langPaths.size(), corpus, maxPerLang); + + System.out.printf(Locale.US, + "%-14s %10s %10s %8s %10s %10s %s%n", + "Language", "Sentences", "Unique%", + "Fill%", "Entropy", "NormEntropy", "Flag"); + System.out.println("-".repeat(80)); + + List<LangStats> stats = new ArrayList<>(); + for (Path p : langPaths) { + LangStats s = analyze(p, maxPerLang); + stats.add(s); + } + + // Sort by normalised entropy ascending (worst first) + stats.sort((a, b) -> Double.compare(a.normEntropy, b.normEntropy)); + + for (LangStats s : stats) { + String flag = (s.fillPct < flagBelow * 100 + || s.normEntropy < flagBelow) ? 
" <<< LOW DIVERSITY" : ""; + System.out.printf(Locale.US, + "%-14s %,10d %9.1f%% %7.1f%% %9.3f %11.3f %s%n", + s.lang, s.sentences, s.uniquePct, + s.fillPct, s.entropy, s.normEntropy, flag); + } + } + + // ---- Analysis ---- + + static LangStats analyze(Path langFile, int maxPerLang) throws IOException { + String lang = langFile.getFileName().toString(); + + // Determine CJK by probing first 200 sentences + boolean cjk = probeCjk(langFile, 200); + + int numBuckets = cjk + ? GenerativeLanguageModel.CJK_BIGRAM_BUCKETS + : GenerativeLanguageModel.NONCJK_BIGRAM_BUCKETS; + + long[] bigramCounts = new long[numBuckets]; + Set<String> seen = new HashSet<>(); + long sentences = 0; + long uniqueSentences = 0; + + try (BufferedReader reader = Files.newBufferedReader( + langFile, StandardCharsets.UTF_8)) { + String line; + while ((line = reader.readLine()) != null) { + String text = line.trim(); + if (text.isEmpty()) continue; + + String pp = CharSoupFeatureExtractor.preprocess(text); + if (pp.isEmpty()) continue; + + if (seen.add(text)) { + uniqueSentences++; + } + sentences++; + + if (cjk) { + GenerativeLanguageModel.extractCjkNgrams(pp, + h -> { /* skip unigrams */ }, + h -> bigramCounts[h + % GenerativeLanguageModel.CJK_BIGRAM_BUCKETS]++); + } else { + GenerativeLanguageModel.extractNonCjkNgrams(pp, + h -> { /* skip unigrams */ }, + h -> bigramCounts[h + % GenerativeLanguageModel.NONCJK_BIGRAM_BUCKETS]++, + h -> { /* skip trigrams */ }); + } + + if (maxPerLang > 0 && sentences >= maxPerLang) { + break; + } + } + } + + // Metrics + long filledBuckets = 0; + long total = 0; + for (long c : bigramCounts) { + if (c > 0) { + filledBuckets++; + total += c; + } + } + + double fillPct = 100.0 * filledBuckets / numBuckets; + + // Shannon entropy over filled buckets (bits) + double entropy = 0.0; + if (total > 0) { + for (long c : bigramCounts) { + if (c > 0) { + double p = (double) c / total; + entropy -= p * (Math.log(p) / Math.log(2)); + } + } + } + + // Normalised entropy: H / 
log2(filledBuckets) ∈ [0, 1] + double normEntropy = filledBuckets > 1 + ? entropy / (Math.log(filledBuckets) / Math.log(2)) : 0.0; + + double uniquePct = sentences > 0 + ? 100.0 * uniqueSentences / sentences : 0.0; + + return new LangStats(lang, sentences, uniquePct, + fillPct, entropy, normEntropy); + } + + // ---- Helpers ---- + + private static boolean probeCjk(Path file, int maxLines) throws IOException { + long cjk = 0; + long total = 0; + int lines = 0; + try (BufferedReader reader = Files.newBufferedReader( + file, StandardCharsets.UTF_8)) { + String line; + while ((line = reader.readLine()) != null && lines < maxLines) { + int i = 0; + while (i < line.length()) { + int cp = line.codePointAt(i); + i += Character.charCount(cp); + if (Character.isLetter(cp)) { + total++; + if (ScriptAwareFeatureExtractor.isCjkOrKana( + Character.toLowerCase(cp))) { + cjk++; + } + } + } + lines++; + } + } + return total > 0 && (double) cjk / total >= 0.60; + } + + private static List<Path> listRegularFiles(Path dir) throws IOException { + List<Path> files = new ArrayList<>(); + try (DirectoryStream<Path> stream = Files.newDirectoryStream( + dir, Files::isRegularFile)) { + for (Path p : stream) { + files.add(p); + } + } + Collections.sort(files); + return files; + } + + // ---- Result record ---- + + static class LangStats { + final String lang; + final long sentences; + final double uniquePct; + final double fillPct; + final double entropy; + final double normEntropy; + + LangStats(String lang, long sentences, double uniquePct, + double fillPct, double entropy, double normEntropy) { + this.lang = lang; + this.sentences = sentences; + this.uniquePct = uniquePct; + this.fillPct = fillPct; + this.entropy = entropy; + this.normEntropy = normEntropy; + } + } +} diff --git a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CorpusFilterReport.java 
b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CorpusFilterReport.java new file mode 100644 index 0000000000..6acd5573a9 --- /dev/null +++ b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/CorpusFilterReport.java @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.langdetect.charsoup.tools; + +import java.io.BufferedReader; +import java.io.FileInputStream; +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.DirectoryStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Locale; + +import org.apache.tika.langdetect.charsoup.GenerativeLanguageModel; + +/** + * Scores every sentence in a training corpus against its own language model + * and reports how many would be dropped at various z-score thresholds. 
+ * + * <h3>Usage</h3> + * <pre> + * java CorpusFilterReport \ + * --model generative.bin \ + * --corpus /path/to/pool_filtered \ + * [--max-per-lang 500000] \ + * [--show-drops 10] (print N worst-scoring sentences per language) + * </pre> + */ +public class CorpusFilterReport { + + private static final int DEFAULT_MAX_PER_LANG = 500_000; + private static final int DEFAULT_SHOW_DROPS = 0; + + public static void main(String[] args) throws Exception { + Path modelPath = null; + Path corpusPath = null; + int maxPerLang = DEFAULT_MAX_PER_LANG; + int showDrops = DEFAULT_SHOW_DROPS; + + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "--model": + modelPath = Paths.get(args[++i]); + break; + case "--corpus": + corpusPath = Paths.get(args[++i]); + break; + case "--max-per-lang": + maxPerLang = Integer.parseInt(args[++i]); + break; + case "--show-drops": + showDrops = Integer.parseInt(args[++i]); + break; + default: + System.err.println("Unknown option: " + args[i]); + System.exit(1); + } + } + + if (modelPath == null || corpusPath == null) { + System.err.println( + "Usage: CorpusFilterReport --model <bin> --corpus <dir> " + + "[--max-per-lang N] [--show-drops N]"); + System.exit(1); + } + + GenerativeLanguageModel model; + try (InputStream is = new FileInputStream(modelPath.toFile())) { + model = GenerativeLanguageModel.load(is); + } + + boolean flat = isFlatLayout(corpusPath); + List<Path> langPaths = listLangPaths(corpusPath, flat); + + System.out.printf(Locale.US, + "%-14s %8s %8s %8s %8s %8s %8s%n", + "Language", "Total", "z<-2", "z<-3", "z<-4", "z<-2%", "z<-3%"); + System.out.println("-".repeat(80)); + + long grandTotal = 0; + long grandDrop2 = 0; + long grandDrop3 = 0; + long grandDrop4 = 0; + + for (Path langPath : langPaths) { + String lang = langPath.getFileName().toString(); + if (!model.getLanguages().contains(lang)) { + continue; + } + + List<ScoredLine> scored = scoreLang( + model, lang, langPath, flat, maxPerLang); + + long total = 
scored.size(); + long drop2 = 0; + long drop3 = 0; + long drop4 = 0; + for (ScoredLine sl : scored) { + if (sl.z < -2) { + drop2++; + } + if (sl.z < -3) { + drop3++; + } + if (sl.z < -4) { + drop4++; + } + } + + System.out.printf(Locale.US, + "%-14s %,8d %,8d %,8d %,8d %7.2f%% %7.2f%%%n", + lang, total, drop2, drop3, drop4, + 100.0 * drop2 / total, 100.0 * drop3 / total); + + if (showDrops > 0 && drop3 > 0) { + scored.sort((a, b) -> Float.compare(a.z, b.z)); + int n = (int) Math.min(showDrops, drop3); + for (int i = 0; i < n; i++) { + ScoredLine sl = scored.get(i); + String preview = sl.text.length() > 80 + ? sl.text.substring(0, 80) + "…" : sl.text; + System.out.printf(Locale.US, + " z=%6.2f %s%n", sl.z, preview); + } + } + + grandTotal += total; + grandDrop2 += drop2; + grandDrop3 += drop3; + grandDrop4 += drop4; + } + + System.out.println("-".repeat(80)); + System.out.printf(Locale.US, + "%-14s %,8d %,8d %,8d %,8d %7.2f%% %7.2f%%%n", + "TOTAL", grandTotal, grandDrop2, grandDrop3, grandDrop4, + 100.0 * grandDrop2 / grandTotal, 100.0 * grandDrop3 / grandTotal); + } + + private static List<ScoredLine> scoreLang( + GenerativeLanguageModel model, String lang, + Path langPath, boolean flat, int maxPerLang) throws Exception { + + List<ScoredLine> result = new ArrayList<>(); + + if (flat) { + try (BufferedReader reader = Files.newBufferedReader( + langPath, StandardCharsets.UTF_8)) { + String line; + while ((line = reader.readLine()) != null) { + String text = line.trim(); + if (text.isEmpty()) { + continue; + } + float z = model.zScore(text, lang); + if (!Float.isNaN(z)) { + result.add(new ScoredLine(text, z)); + } + if (maxPerLang > 0 && result.size() >= maxPerLang) { + break; + } + } + } + } else { + List<Path> files = listTxtFiles(langPath); + outer: + for (Path file : files) { + try (BufferedReader reader = Files.newBufferedReader( + file, StandardCharsets.UTF_8)) { + String line; + while ((line = reader.readLine()) != null) { + int tab = line.indexOf('\t'); + if 
(tab < 0) { + continue; + } + String text = line.substring(tab + 1).trim(); + if (text.isEmpty()) { + continue; + } + float z = model.zScore(text, lang); + if (!Float.isNaN(z)) { + result.add(new ScoredLine(text, z)); + } + if (maxPerLang > 0 && result.size() >= maxPerLang) { + break outer; + } + } + } + } + } + return result; + } + + private static boolean isFlatLayout(Path dir) throws Exception { + try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir)) { + for (Path p : stream) { + return Files.isRegularFile(p); + } + } + return true; + } + + private static List<Path> listLangPaths(Path dir, boolean flat) throws Exception { + List<Path> paths = new ArrayList<>(); + try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir, + p -> flat ? Files.isRegularFile(p) : Files.isDirectory(p))) { + for (Path p : stream) { + paths.add(p); + } + } + Collections.sort(paths); + return paths; + } + + private static List<Path> listTxtFiles(Path dir) throws Exception { + List<Path> files = new ArrayList<>(); + try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir, "*.txt")) { + for (Path p : stream) { + files.add(p); + } + } + Collections.sort(files); + return files; + } + + private static class ScoredLine { + final String text; + final float z; + + ScoredLine(String text, float z) { + this.text = text; + this.z = z; + } + } +} diff --git a/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/EvalGenerativeModel.java b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/EvalGenerativeModel.java new file mode 100644 index 0000000000..4a4652dc57 --- /dev/null +++ b/tika-langdetect/tika-langdetect-charsoup/src/test/java/org/apache/tika/langdetect/charsoup/tools/EvalGenerativeModel.java @@ -0,0 +1,365 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.langdetect.charsoup.tools;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;

import org.apache.tika.langdetect.charsoup.GenerativeLanguageModel;

/**
 * Self-consistency evaluation for {@link GenerativeLanguageModel}.
 *
 * <p>For each sentence in the test file, computes {@code score(text, L)}
 * for every language in the model and checks whether the argmax equals
 * the true label. Reports overall accuracy and per-language accuracy
 * sorted from worst to best.
 *
 * <p>Accepts either:
 * <ul>
 *   <li>Flores-200 TSV: {@code lang_Script TAB text} — script suffixes are
 *       stripped and FLORES-specific codes are remapped to model codes.</li>
 *   <li>Standard corpus format: {@code lang TAB text}</li>
 * </ul>
 *
 * <h3>Usage</h3>
 * <pre>
 * java EvalGenerativeModel \
 *     --model generative.bin \
 *     --test /path/to/flores200_dev.tsv \
 *     [--max-per-lang 997]
 * </pre>
 */
public class EvalGenerativeModel {

    private static final int DEFAULT_MAX_PER_LANG = 0; // 0 = unlimited
    private static final int DEFAULT_MAX_CHARS = 0;    // 0 = full sentence

    // ---- Flores-200 normalisation (mirrors CompareDetectors) ----

    /** FLORES codes whose script suffix is significant and must be kept. */
    private static final Set<String> FLORES_KEEP_SCRIPT_SUFFIX = Set.of(
            "ace_Arab", "arb_Latn", "bjn_Arab",
            "kas_Deva", "knc_Latn", "min_Arab", "taq_Tfng"
    );

    /** FLORES macro/variant codes remapped to the codes the model uses. */
    private static final Map<String, String> FLORES_CODE_REMAP;
    static {
        Map<String, String> m = new HashMap<>();
        m.put("arb", "ara");
        m.put("pes", "fas");
        m.put("zsm", "msa");
        m.put("lvs", "lav");
        m.put("azj", "aze");
        m.put("ekk", "est");
        m.put("npi", "nep");
        m.put("als", "sqi");
        m.put("ory", "ori");
        m.put("nor", "nob");
        m.put("cmn", "zho");
        m.put("swa", "swh");
        m.put("yid", "ydd");
        m.put("gug", "grn");
        m.put("quz", "que");
        m.put("plt", "mlg");
        m.put("pbt", "pus");
        m.put("uzn", "uzb");
        m.put("kmr", "kur");
        m.put("khk", "mon");
        FLORES_CODE_REMAP = m;
    }

    /**
     * Normalizes a FLORES-200 {@code lang_Script} label to a model language
     * code: keeps the full label when the script suffix is significant,
     * otherwise strips the suffix and applies {@link #FLORES_CODE_REMAP}.
     */
    static String normalizeLang(String raw) {
        if (FLORES_KEEP_SCRIPT_SUFFIX.contains(raw)) {
            return raw;
        }
        int underscore = raw.indexOf('_');
        String base = underscore >= 0 ? raw.substring(0, underscore) : raw;
        return FLORES_CODE_REMAP.getOrDefault(base, base);
    }

    // ---- Entry point ----

    public static void main(String[] args) throws Exception {
        Path modelPath = null;
        Path testPath = null;
        int maxPerLang = DEFAULT_MAX_PER_LANG;
        int[] maxCharsSet = null; // null = full sentence only
        boolean showConfusions = false;

        for (int i = 0; i < args.length; i++) {
            switch (args[i]) {
                case "--model":
                    modelPath = Paths.get(args[++i]);
                    break;
                case "--test":
                    testPath = Paths.get(args[++i]);
                    break;
                case "--max-per-lang":
                    maxPerLang = Integer.parseInt(args[++i]);
                    break;
                case "--show-confusions":
                    showConfusions = true;
                    break;
                case "--lengths": {
                    String[] parts = args[++i].split(",");
                    maxCharsSet = new int[parts.length];
                    for (int j = 0; j < parts.length; j++) {
                        maxCharsSet[j] = Integer.parseInt(parts[j].trim());
                    }
                    break;
                }
                default:
                    System.err.println("Unknown option: " + args[i]);
                    printUsage();
                    System.exit(1);
            }
        }

        if (modelPath == null || testPath == null) {
            printUsage();
            System.exit(1);
        }

        System.out.println("Loading model: " + modelPath);
        GenerativeLanguageModel model;
        try (InputStream is = new FileInputStream(modelPath.toFile())) {
            model = GenerativeLanguageModel.load(is);
        }
        System.out.printf(Locale.US, "  %d languages (%d CJK, %d non-CJK)%n",
                model.getLanguages().size(),
                model.getLanguages().stream().filter(model::isCjk).count(),
                model.getLanguages().stream().filter(l -> !model.isCjk(l)).count());

        System.out.println("Loading test data: " + testPath);
        List<LabeledSentence> data = loadTestFile(testPath);
        boolean floresMode = data.stream().anyMatch(s -> s.getLanguage().contains("_"));
        if (floresMode) {
            System.out.println("  Flores-200 mode: normalizing lang codes");
            List<LabeledSentence> normalized = new ArrayList<>(data.size());
            for (LabeledSentence s : data) {
                normalized.add(new LabeledSentence(
                        normalizeLang(s.getLanguage()), s.getText()));
            }
            data = normalized;
        }

        // Cap per language if requested
        if (maxPerLang > 0) {
            data = samplePerLang(data, maxPerLang);
        }

        Set<String> modelLangs = new HashSet<>(model.getLanguages());

        // Split into scorable (true lang is in model) and unscorable
        List<LabeledSentence> scorable = new ArrayList<>();
        Map<String, Integer> skipped = new HashMap<>();
        for (LabeledSentence s : data) {
            if (modelLangs.contains(s.getLanguage())) {
                scorable.add(s);
            } else {
                skipped.merge(s.getLanguage(), 1, Integer::sum);
            }
        }
        System.out.printf(Locale.US, "  %,d sentences; %,d scorable, %,d skipped (%d langs not in model)%n",
                data.size(), scorable.size(),
                data.size() - scorable.size(), skipped.size());
        if (!skipped.isEmpty()) {
            List<String> sk = new ArrayList<>(skipped.keySet());
            Collections.sort(sk);
            System.out.println("  Skipped langs: " + sk);
        }

        // Nothing to score -> every accuracy below would be 0/0; bail early.
        if (scorable.isEmpty()) {
            System.err.println("No scorable sentences; nothing to evaluate.");
            System.exit(1);
        }

        // Build the set of lengths to evaluate
        int[] lengths = maxCharsSet != null ? maxCharsSet : new int[]{0};

        for (int maxChars : lengths) {
            String label = maxChars > 0 ? "@" + maxChars + " chars" : "full";
            List<LabeledSentence> run = maxChars > 0
                    ? truncate(scorable, maxChars) : scorable;

            System.out.printf(Locale.US, "%nScoring [%s]…%n", label);
            long wallStart = System.nanoTime();
            // confusions: trueLang -> (predictedLang -> count)
            Map<String, Map<String, Integer>> confusions =
                    showConfusions ? new TreeMap<>() : null;
            Map<String, int[]> perLang = evalAll(model, run, confusions);
            // Clamp to >= 1ms so the sent/s rate never divides by zero on
            // very small test sets.
            long elapsedMs = Math.max(1,
                    (System.nanoTime() - wallStart) / 1_000_000);

            int totalCorrect = 0;
            int totalCount = 0;
            for (int[] v : perLang.values()) {
                totalCorrect += v[0];
                totalCount += v[1];
            }

            System.out.printf(Locale.US,
                    "Overall [%s]: %.2f%% (%,d / %,d) in %,dms (%.0f sent/s)%n",
                    label, 100.0 * totalCorrect / totalCount,
                    totalCorrect, totalCount,
                    elapsedMs, totalCount * 1000.0 / elapsedMs);

            List<Map.Entry<String, int[]>> rows = new ArrayList<>(perLang.entrySet());
            rows.sort(Comparator.comparingDouble(
                    e -> (double) e.getValue()[0] / e.getValue()[1]));

            System.out.printf(Locale.US, "%n%-16s %8s %8s %8s%n",
                    "Language", "Correct", "Total", "Acc%");
            System.out.println("-".repeat(46));
            for (Map.Entry<String, int[]> e : rows) {
                int[] v = e.getValue();
                System.out.printf(Locale.US, "%-16s %8d %8d %7.2f%%%n",
                        e.getKey(), v[0], v[1], 100.0 * v[0] / v[1]);
            }

            System.out.println();
            int[] thresholds = {100, 95, 90, 80, 50};
            for (int t : thresholds) {
                long above = rows.stream()
                        .filter(e -> 100.0 * e.getValue()[0] / e.getValue()[1] >= t)
                        .count();
                System.out.printf(Locale.US, "  >= %3d%% accuracy: %3d / %d languages%n",
                        t, above, rows.size());
            }

            if (confusions != null) {
                System.out.println("\n=== Confusion distributions (wrong predictions only) ===");
                for (Map.Entry<String, Map<String, Integer>> langEntry
                        : confusions.entrySet()) {
                    String trueLang = langEntry.getKey();
                    Map<String, Integer> preds = langEntry.getValue();
                    int total = perLang.get(trueLang)[1];
                    int correct = perLang.get(trueLang)[0];
                    int wrong = total - correct;
                    if (wrong == 0) {
                        continue;
                    }
                    System.out.printf(Locale.US,
                            "%n  %s (%d wrong / %d total): ",
                            trueLang, wrong, total);
                    preds.entrySet().stream()
                            .sorted(Map.Entry.<String, Integer>comparingByValue()
                                    .reversed())
                            .limit(10)
                            .forEach(e -> System.out.printf(Locale.US,
                                    "%s=%d ", e.getKey(), e.getValue()));
                    System.out.println();
                }
            }
        }
    }

    // ---- Scoring ----

    /**
     * Scores every sentence against every model language and tallies
     * per-language {@code [correct, total]} counts. When {@code confusions}
     * is non-null, records wrong predictions as trueLang -> predicted -> count.
     */
    private static Map<String, int[]> evalAll(
            GenerativeLanguageModel model,
            List<LabeledSentence> data,
            Map<String, Map<String, Integer>> confusions) {
        Map<String, int[]> perLang = new HashMap<>();
        List<String> allLangs = model.getLanguages();

        for (LabeledSentence s : data) {
            String trueLang = s.getLanguage();
            String predicted = argmax(model, allLangs, s.getText());
            int[] counts = perLang.computeIfAbsent(trueLang, k -> new int[2]);
            counts[1]++;
            if (trueLang.equals(predicted)) {
                counts[0]++;
            } else if (confusions != null && predicted != null) {
                confusions.computeIfAbsent(trueLang, k -> new HashMap<>())
                        .merge(predicted, 1, Integer::sum);
            }
        }
        return perLang;
    }

    /**
     * Returns the language with the highest (non-NaN) score for {@code text},
     * or null when every language scores NaN.
     */
    private static String argmax(GenerativeLanguageModel model,
                                 List<String> langs, String text) {
        String best = null;
        float bestS = Float.NEGATIVE_INFINITY;
        for (String lang : langs) {
            float s = model.score(text, lang);
            if (!Float.isNaN(s) && s > bestS) {
                bestS = s;
                best = lang;
            }
        }
        return best;
    }

    // ---- I/O helpers ----

    /** Reads a {@code lang TAB text} file, skipping malformed or empty lines. */
    static List<LabeledSentence> loadTestFile(Path path) throws Exception {
        List<LabeledSentence> sentences = new ArrayList<>();
        try (BufferedReader reader = Files.newBufferedReader(path, StandardCharsets.UTF_8)) {
            String line;
            while ((line = reader.readLine()) != null) {
                int tab = line.indexOf('\t');
                if (tab < 0) {
                    continue;
                }
                String lang = line.substring(0, tab).trim();
                String text = line.substring(tab + 1).trim();
                if (!lang.isEmpty() && !text.isEmpty()) {
                    sentences.add(new LabeledSentence(lang, text));
                }
            }
        }
        return sentences;
    }

    /** Returns a copy of {@code data} with each text capped at {@code maxChars}. */
    private static List<LabeledSentence> truncate(
            List<LabeledSentence> data, int maxChars) {
        List<LabeledSentence> result = new ArrayList<>(data.size());
        for (LabeledSentence s : data) {
            String t = s.getText();
            result.add(new LabeledSentence(s.getLanguage(),
                    t.length() > maxChars ? t.substring(0, maxChars) : t));
        }
        return result;
    }

    /** Keeps at most {@code max} sentences per language, in input order. */
    private static List<LabeledSentence> samplePerLang(
            List<LabeledSentence> data, int max) {
        Map<String, Integer> counts = new HashMap<>();
        List<LabeledSentence> result = new ArrayList<>();
        for (LabeledSentence s : data) {
            int n = counts.merge(s.getLanguage(), 1, Integer::sum);
            if (n <= max) {
                result.add(s);
            }
        }
        return result;
    }

    private static void printUsage() {
        System.err.println("Usage: EvalGenerativeModel");
        System.err.println("  --model <generative.bin>");
        System.err.println("  --test <testFile.tsv>");
        System.err.println("  [--max-per-lang <N>]");
        System.err.println("  [--lengths 50,100,200]  (truncate sentences to N chars)");
        System.err.println("  [--show-confusions]     (print confusion distributions)");
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.langdetect.charsoup.tools;

import java.io.FileInputStream;
import java.io.InputStream;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.apache.tika.langdetect.charsoup.GenerativeLanguageModel;

/**
 * Evaluates the generative model as a training-data filter for a single
 * target language.
 *
 * <p>Constructs a synthetic contaminated corpus from a FLORES-200 TSV:
 * all sentences labelled as {@code --lang} are <em>signal</em> (should be
 * kept); sentences from all other languages present in the model are
 * <em>noise</em> (should be dropped).
 *
 * <p>The filter z-scores each sentence against the target language's model:
 * <pre>
 *   z = (score(sentence, targetLang) - μ) / σ
 *   keep = z >= threshold
 * </pre>
 * where μ and σ are the mean/stddev of scores on the language's training
 * corpus (baked into the model file).
 *
 * <p>Sweeping {@code threshold} traces a precision/recall curve: lower
 * threshold = permissive filter (keeps more, misses more noise); higher
 * threshold = strict filter (drops more noise but may drop real signal).
 *
 * <h3>Usage</h3>
 * <pre>
 * mvn -pl tika-langdetect/tika-langdetect-charsoup exec:java \
 *   -Dexec.mainClass=...tools.FilterBenchmark \
 *   -Dexec.args="--model /path/generative.bin \
 *                --test /path/flores200_dev.tsv \
 *                --lang zho \
 *                [--noise-ratio 1.0] \
 *                [--steps 20]"
 * </pre>
 */
public class FilterBenchmark {

    private static final double DEFAULT_NOISE_RATIO = 1.0;
    private static final int DEFAULT_STEPS = 20;

    public static void main(String[] args) throws Exception {
        Path modelPath = null;
        Path testPath = null;
        String targetLang = null;
        double noiseRatio = DEFAULT_NOISE_RATIO;
        int steps = DEFAULT_STEPS;

        for (int i = 0; i < args.length; i++) {
            switch (args[i]) {
                case "--model":
                    modelPath = Paths.get(args[++i]);
                    break;
                case "--test":
                    testPath = Paths.get(args[++i]);
                    break;
                case "--lang":
                    targetLang = args[++i];
                    break;
                case "--noise-ratio":
                    noiseRatio = Double.parseDouble(args[++i]);
                    break;
                case "--steps":
                    steps = Integer.parseInt(args[++i]);
                    break;
                default:
                    System.err.println("Unknown option: " + args[i]);
                    System.exit(1);
            }
        }

        if (modelPath == null || testPath == null || targetLang == null) {
            System.err.println(
                    "Usage: FilterBenchmark --model <bin> --test <flores.tsv> --lang <code> "
                            + "[--noise-ratio 1.0] [--steps 20]");
            System.exit(1);
        }

        System.out.println("Loading model: " + modelPath);
        GenerativeLanguageModel model;
        try (InputStream is = new FileInputStream(modelPath.toFile())) {
            model = GenerativeLanguageModel.load(is);
        }

        if (!model.getLanguages().contains(targetLang)) {
            System.err.printf(Locale.US,
                    "Language '%s' not found in model. Available: %s%n",
                    targetLang, model.getLanguages());
            System.exit(1);
        }

        System.out.println("Loading test data: " + testPath);
        List<LabeledSentence> all = EvalGenerativeModel.loadTestFile(testPath);

        List<String> signal = new ArrayList<>();
        List<String> noise = new ArrayList<>();
        for (LabeledSentence s : all) {
            String lang = EvalGenerativeModel.normalizeLang(s.getLanguage());
            if (targetLang.equals(lang)) {
                signal.add(s.getText());
            } else {
                // include all other languages as potential noise
                noise.add(s.getText());
            }
        }

        // Without signal there is nothing to keep and every ratio below
        // divides by zero — fail loudly instead of throwing.
        if (signal.isEmpty()) {
            System.err.printf(Locale.US,
                    "No sentences labelled '%s' in %s%n", targetLang, testPath);
            System.exit(1);
        }

        // NOTE: noise is truncated in file order (grouped by language), so
        // low noise ratios sample only the first languages in the file.
        int noiseCount = (int) Math.min(noise.size(),
                Math.round(signal.size() * noiseRatio));
        noise = noise.subList(0, noiseCount);

        System.out.printf(Locale.US,
                "Target: %s | signal: %,d | noise: %,d (%.1fx ratio)%n%n",
                targetLang, signal.size(), noiseCount,
                (double) noiseCount / signal.size());

        // Z-score every sentence against the target language model
        float[] sigZ = zScores(model, targetLang, signal);
        float[] noiseZ = zScores(model, targetLang, noise);

        // Sweep range: span the full observed z-score distribution
        float minZ = Float.MAX_VALUE;
        float maxZ = -Float.MAX_VALUE;
        for (float z : sigZ) {
            if (!Float.isNaN(z)) {
                minZ = Math.min(minZ, z);
                maxZ = Math.max(maxZ, z);
            }
        }
        for (float z : noiseZ) {
            if (!Float.isNaN(z)) {
                minZ = Math.min(minZ, z);
                maxZ = Math.max(maxZ, z);
            }
        }

        // Every z-score NaN -> no observable range; a sweep would be garbage.
        if (maxZ < minZ) {
            System.err.println("No scorable sentences (all z-scores NaN); aborting.");
            System.exit(1);
        }

        System.out.printf(Locale.US,
                "Z-score range: [%.2f, %.2f] "
                        + "(std devs from '%s' training mean)%n%n",
                minZ, maxZ, targetLang);

        System.out.printf(Locale.US,
                "%-12s %8s %8s %8s %9s %10s %10s%n",
                "Z-thresh", "Prec", "Recall", "F1",
                "SigKept%", "NoiseDrop%", "FalseDrop%");
        System.out.println("-".repeat(76));

        float stepSize = (maxZ - minZ) / steps;
        for (int i = 0; i <= steps; i++) {
            float threshold = minZ + i * stepSize;
            printRow(threshold, sigZ, noiseZ, signal.size());
        }
    }

    /**
     * Prints one precision/recall row for a single z-score threshold.
     * Treats "drop" as the positive class: tp = noise dropped, fp = signal
     * dropped, fn = noise kept, tn = signal kept. NaN scores count as
     * dropped (unscorable sentences never pass the filter).
     */
    private static void printRow(float threshold,
                                 float[] sigScores,
                                 float[] noiseScores,
                                 int signalSize) {
        int tp = 0; // noise correctly dropped (score < threshold)
        int fn = 0; // noise incorrectly kept (score >= threshold)
        int fp = 0; // signal incorrectly dropped
        int tn = 0; // signal correctly kept

        for (float s : noiseScores) {
            if (Float.isNaN(s) || s < threshold) {
                tp++;
            } else {
                fn++;
            }
        }
        for (float s : sigScores) {
            if (Float.isNaN(s) || s < threshold) {
                fp++;
            } else {
                tn++;
            }
        }

        double precision = (tp + fp) > 0
                ? (double) tp / (tp + fp) : 1.0;
        double recall = (tp + fn) > 0
                ? (double) tp / (tp + fn) : 0.0;
        double f1 = (precision + recall) > 0
                ? 2 * precision * recall / (precision + recall) : 0.0;
        double keptPct = 100.0 * tn / signalSize;
        // Guard: with --noise-ratio 0 there is no noise to drop.
        double noisePct = noiseScores.length > 0
                ? 100.0 * tp / noiseScores.length : 0.0;
        double falsePct = 100.0 * fp / signalSize;

        System.out.printf(Locale.US,
                "%12.4f %8.3f %8.3f %8.3f %8.1f%% %9.1f%% %9.1f%%%n",
                threshold, precision, recall, f1,
                keptPct, noisePct, falsePct);
    }

    /** Z-scores each sentence under the target language model. */
    private static float[] zScores(GenerativeLanguageModel model,
                                   String targetLang,
                                   List<String> sentences) {
        float[] result = new float[sentences.size()];
        for (int i = 0; i < sentences.size(); i++) {
            result[i] = model.zScore(sentences.get(i), targetLang);
        }
        return result;
    }
}
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.tika.langdetect.charsoup.tools;

import java.io.BufferedReader;
import java.io.FileInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;

import org.apache.tika.langdetect.charsoup.GenerativeLanguageModel;

/**
 * Measures how score mean and stddev vary with text length for selected
 * languages. Used to decide whether z-scores need length normalization
 * at runtime.
 *
 * <p>For each language, truncates training sentences to various character
 * lengths, scores them, and reports per-bucket (μ, σ, n). If σ follows
 * 1/√(charLen), a simple correction factor suffices at runtime.
 *
 * <h3>Usage</h3>
 * <pre>
 * java LengthCalibrationReport \
 *     --model generative.bin \
 *     --corpus /path/to/pool_filtered \
 *     --langs eng,fra,zho,jpn,ara,kor \
 *     [--max-per-lang 50000]
 * </pre>
 */
public class LengthCalibrationReport {

    private static final int DEFAULT_MAX = 50_000;
    /** Truncation buckets; the sentinel 99999 means "full sentence". */
    private static final int[] CHAR_LENGTHS = {10, 20, 30, 50, 75, 100, 150, 200, 500, 99999};

    public static void main(String[] args) throws Exception {
        Path modelPath = null;
        Path corpusPath = null;
        String langsArg = "eng,fra,zho,jpn,ara";
        int max = DEFAULT_MAX;

        for (int i = 0; i < args.length; i++) {
            switch (args[i]) {
                case "--model":
                    modelPath = Paths.get(args[++i]);
                    break;
                case "--corpus":
                    corpusPath = Paths.get(args[++i]);
                    break;
                case "--langs":
                    langsArg = args[++i];
                    break;
                case "--max-per-lang":
                    max = Integer.parseInt(args[++i]);
                    break;
                default:
                    System.err.println("Unknown option: " + args[i]);
                    System.exit(1);
            }
        }

        if (modelPath == null || corpusPath == null) {
            System.err.println(
                    "Usage: LengthCalibrationReport --model <bin> --corpus <dir> "
                            + "[--langs eng,fra,zho] [--max-per-lang 50000]");
            System.exit(1);
        }

        GenerativeLanguageModel model;
        try (InputStream is = new FileInputStream(modelPath.toFile())) {
            model = GenerativeLanguageModel.load(is);
        }

        String[] langs = langsArg.split(",");

        for (String lang : langs) {
            lang = lang.trim();
            if (!model.getLanguages().contains(lang)) {
                System.err.println("Skipping unknown language: " + lang);
                continue;
            }

            Path langFile = corpusPath.resolve(lang);
            if (!Files.exists(langFile)) {
                System.err.println("No corpus file for: " + lang);
                continue;
            }

            System.out.printf(Locale.US, "%n=== %s ===%n", lang);
            System.out.printf(Locale.US,
                    "%-10s %8s %10s %10s %12s %12s%n",
                    "MaxChars", "N", "μ(score)", "σ(score)",
                    "σ*√(len/50)", "μ(z-full)");
            System.out.println("-".repeat(70));

            // Read sentences once, reuse for every truncation bucket
            List<String> sentences = readSentences(langFile, max);

            for (int maxLen : CHAR_LENGTHS) {
                // Welford's online algorithm for numerically stable μ/σ
                long n = 0;
                double mean = 0.0;
                double m2 = 0.0;
                double zSum = 0.0;

                for (String sentence : sentences) {
                    String text = sentence.length() > maxLen
                            ? sentence.substring(0, maxLen) : sentence;
                    float score = model.score(text, lang);
                    if (Float.isNaN(score)) {
                        continue;
                    }
                    n++;
                    double delta = score - mean;
                    mean += delta / n;
                    m2 += delta * (score - mean);

                    float z = model.zScore(text, lang);
                    if (!Float.isNaN(z)) {
                        zSum += z;
                    }
                }

                double stdDev = n > 1 ? Math.sqrt(m2 / (n - 1)) : 0.0;
                // If σ ~ 1/√len, then σ*√(len/50) should be roughly constant
                double normalized = stdDev * Math.sqrt((double) Math.min(maxLen, 200) / 50.0);
                double meanZ = n > 0 ? zSum / n : 0.0;

                String label = maxLen >= 99999 ? "full" : String.valueOf(maxLen);
                System.out.printf(Locale.US,
                        "%-10s %,8d %10.4f %10.4f %12.4f %12.4f%n",
                        label, n, mean, stdDev, normalized, meanZ);
            }
        }
    }

    /**
     * Reads up to {@code max} non-blank, trimmed lines from {@code file}.
     * (Replaces an earlier LinkedHashMap-backed implementation that boxed an
     * index key per line for no benefit.)
     */
    private static List<String> readSentences(Path file, int max) throws Exception {
        List<String> lines = new ArrayList<>();
        try (BufferedReader reader = Files.newBufferedReader(
                file, StandardCharsets.UTF_8)) {
            String line;
            while ((line = reader.readLine()) != null && lines.size() < max) {
                String text = line.trim();
                if (!text.isEmpty()) {
                    lines.add(text);
                }
            }
        }
        return lines;
    }
}
the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.langdetect.charsoup.tools; + +import java.io.BufferedReader; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.OutputStream; +import java.nio.charset.StandardCharsets; +import java.nio.file.DirectoryStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; +import java.util.Locale; + +import org.apache.tika.langdetect.charsoup.GenerativeLanguageModel; +import org.apache.tika.langdetect.charsoup.ScriptAwareFeatureExtractor; + +/** + * Trains a {@link GenerativeLanguageModel} from a Leipzig-format corpus. + * + * <h3>Corpus format</h3> + * <pre> + * corpusDir/ + * eng/ + * sentences.txt (lineNum TAB sentence) + * zho/ + * sentences.txt + * jpn/ + * sentences.txt + * ... + * </pre> + * Each directory name is used as the language code. Any {@code .txt} file + * directly under a language directory is read; each line must contain at + * least one tab, and the text after the first tab is the sentence. 
+ * + * <h3>CJK detection</h3> + * A language is treated as CJK if at least 60% of the letter codepoints + * in a random sample of sentences are CJK/kana characters. You can + * override this with an explicit {@code --cjk} list on the command line. + * + * <h3>Usage</h3> + * <pre> + * java TrainGenerativeLanguageModel \ + * --corpus /path/to/Leipzig-corpus \ + * --output generative.bin \ + * [--max-per-lang 500000] \ + * [--add-k 0.01] \ + * [--cjk zho,jpn,cmn] + * </pre> + */ +public class TrainGenerativeLanguageModel { + + private static final int DEFAULT_MAX_PER_LANG = 500_000; + private static final float DEFAULT_ADD_K = 0.01f; + /** Fraction of letter codepoints that must be CJK to classify a language as CJK. */ + private static final float CJK_LETTER_THRESHOLD = 0.60f; + /** Number of sentences used to probe the script of an unknown language. */ + private static final int CJK_PROBE_SENTENCES = 500; + + public static void main(String[] args) throws Exception { + Path corpus = null; + Path output = null; + int maxPerLang = DEFAULT_MAX_PER_LANG; + float addK = DEFAULT_ADD_K; + List<String> forceCjk = new ArrayList<>(); + + for (int i = 0; i < args.length; i++) { + switch (args[i]) { + case "--corpus": + corpus = Paths.get(args[++i]); + break; + case "--output": + output = Paths.get(args[++i]); + break; + case "--max-per-lang": + maxPerLang = Integer.parseInt(args[++i]); + break; + case "--add-k": + addK = Float.parseFloat(args[++i]); + break; + case "--cjk": { + for (String code : args[++i].split(",")) { + forceCjk.add(code.trim()); + } + break; + } + default: + System.err.println("Unknown option: " + args[i]); + printUsage(); + System.exit(1); + } + } + + if (corpus == null || output == null) { + printUsage(); + System.exit(1); + } + + new TrainGenerativeLanguageModel().run(corpus, output, maxPerLang, addK, forceCjk); + } + + private void run(Path corpusDir, Path outputPath, + int maxPerLang, float addK, + List<String> forceCjkList) throws IOException { + + // 
Support two corpus layouts: + // flat: corpusDir/{langCode} (one sentence per line, no tab prefix) + // Leipzig: corpusDir/{langCode}/*.txt (lineNum TAB sentence) + boolean flatLayout = isFlatLayout(corpusDir); + System.out.printf(Locale.US, "Corpus layout: %s%n", flatLayout ? "flat" : "Leipzig"); + + List<Path> langPaths = listLangPaths(corpusDir, flatLayout); + System.out.printf(Locale.US, "Found %d languages in %s%n", langPaths.size(), corpusDir); + + GenerativeLanguageModel.Builder builder = GenerativeLanguageModel.builder(); + + for (Path langPath : langPaths) { + String lang = langPath.getFileName().toString(); + boolean cjk = forceCjkList.contains(lang) + || probeCjk(langPath, flatLayout, CJK_PROBE_SENTENCES); + + System.out.printf(Locale.US, " %-12s %s%n", lang, cjk ? "CJK" : "non-CJK"); + builder.registerLanguage(lang, cjk); + } + + System.out.println("Accumulating n-gram counts …"); + long totalSentences = 0; + + for (Path langPath : langPaths) { + String lang = langPath.getFileName().toString(); + long counted = feedLanguage(builder, lang, langPath, flatLayout, maxPerLang); + totalSentences += counted; + System.out.printf(Locale.US, " %-12s %,d sentences%n", lang, counted); + } + + System.out.printf(Locale.US, "Total sentences: %,d%n", totalSentences); + System.out.printf(Locale.US, "Building model (add-k=%.4f) …%n", addK); + + GenerativeLanguageModel model = builder.build(addK); + + // Second pass: score training data to compute per-language μ and σ + System.out.println("Calibrating z-scores (second pass) …"); + for (Path langPath : langPaths) { + String lang = langPath.getFileName().toString(); + double[] stats = calibrateLanguage(model, lang, langPath, flatLayout, maxPerLang); + model.setStats(lang, (float) stats[0], (float) stats[1]); + System.out.printf(Locale.US, + " %-12s μ=%8.4f σ=%6.4f (n=%d)%n", + lang, stats[0], stats[1], (long) stats[2]); + } + + System.out.printf(Locale.US, "Writing model to %s …%n", outputPath); + try (OutputStream os = new 
FileOutputStream(outputPath.toFile())) { + model.save(os); + } + + long bytes = Files.size(outputPath); + System.out.printf(Locale.US, "Done. Model size: %,.0f KB%n", bytes / 1024.0); + } + + // ---- Corpus helpers ---- + + /** + * Returns true if the corpus uses the flat layout (files named by language + * code, one sentence per line) rather than the Leipzig layout (subdirectories + * containing {@code *.txt} files with {@code lineNum TAB sentence} lines). + */ + private static boolean isFlatLayout(Path corpusDir) throws IOException { + try (DirectoryStream<Path> stream = Files.newDirectoryStream(corpusDir)) { + for (Path p : stream) { + return Files.isRegularFile(p); + } + } + return true; + } + + /** + * List all language paths in the corpus directory, sorted. + * For flat layout: regular files. For Leipzig layout: subdirectories. + */ + private static List<Path> listLangPaths(Path corpusDir, + boolean flat) throws IOException { + List<Path> paths = new ArrayList<>(); + try (DirectoryStream<Path> stream = Files.newDirectoryStream(corpusDir, + p -> flat ? Files.isRegularFile(p) : Files.isDirectory(p))) { + for (Path p : stream) { + paths.add(p); + } + } + Collections.sort(paths); + return paths; + } + + /** + * Feed up to {@code maxPerLang} sentences from {@code langPath} into the builder. 
+ * + * @return number of sentences consumed + */ + private static long feedLanguage(GenerativeLanguageModel.Builder builder, + String lang, Path langPath, + boolean flat, + int maxPerLang) throws IOException { + long count = 0; + if (flat) { + try (BufferedReader reader = Files.newBufferedReader(langPath, + StandardCharsets.UTF_8)) { + String line; + while ((line = reader.readLine()) != null) { + String text = line.trim(); + if (text.isEmpty()) { + continue; + } + builder.addSample(lang, text); + count++; + if (maxPerLang > 0 && count >= maxPerLang) { + break; + } + } + } + } else { + List<Path> files = listTxtFiles(langPath); + outer: + for (Path file : files) { + try (BufferedReader reader = Files.newBufferedReader(file, + StandardCharsets.UTF_8)) { + String line; + while ((line = reader.readLine()) != null) { + int tab = line.indexOf('\t'); + if (tab < 0) { + continue; + } + String text = line.substring(tab + 1).trim(); + if (text.isEmpty()) { + continue; + } + builder.addSample(lang, text); + count++; + if (maxPerLang > 0 && count >= maxPerLang) { + break outer; + } + } + } + } + } + return count; + } + + /** + * Score every training sentence for {@code lang} against the built model + * and return {@code [mean, stdDev, count]} using Welford's online algorithm. 
+ */ + private static double[] calibrateLanguage( + GenerativeLanguageModel model, String lang, + Path langPath, boolean flat, int maxPerLang) throws IOException { + long n = 0; + double mean = 0.0; + double m2 = 0.0; + + if (flat) { + try (BufferedReader reader = Files.newBufferedReader( + langPath, StandardCharsets.UTF_8)) { + String line; + while ((line = reader.readLine()) != null) { + String text = line.trim(); + if (text.isEmpty()) { + continue; + } + float s = model.score(text, lang); + if (Float.isNaN(s)) { + continue; + } + n++; + double delta = s - mean; + mean += delta / n; + m2 += delta * (s - mean); + if (maxPerLang > 0 && n >= maxPerLang) { + break; + } + } + } + } else { + List<Path> files = listTxtFiles(langPath); + outer: + for (Path file : files) { + try (BufferedReader reader = Files.newBufferedReader( + file, StandardCharsets.UTF_8)) { + String line; + while ((line = reader.readLine()) != null) { + int tab = line.indexOf('\t'); + if (tab < 0) { + continue; + } + String text = line.substring(tab + 1).trim(); + if (text.isEmpty()) { + continue; + } + float s = model.score(text, lang); + if (Float.isNaN(s)) { + continue; + } + n++; + double delta = s - mean; + mean += delta / n; + m2 += delta * (s - mean); + if (maxPerLang > 0 && n >= maxPerLang) { + break outer; + } + } + } + } + } + + double stdDev = n > 1 ? Math.sqrt(m2 / (n - 1)) : 0.0; + return new double[]{mean, stdDev, n}; + } + + /** + * Probe a language path to decide whether it is CJK. + */ + private static boolean probeCjk(Path langPath, boolean flat, + int maxSentences) throws IOException { + long cjkLetters = 0; + long totalLetters = 0; + int sentences = 0; + + List<Path> files = flat + ? 
Collections.singletonList(langPath) : listTxtFiles(langPath); + + outer: + for (Path file : files) { + try (BufferedReader reader = Files.newBufferedReader(file, + StandardCharsets.UTF_8)) { + String line; + while ((line = reader.readLine()) != null) { + String text; + if (flat) { + text = line.trim(); + } else { + int tab = line.indexOf('\t'); + if (tab < 0) continue; + text = line.substring(tab + 1); + } + if (text.isEmpty()) continue; + int i = 0; + while (i < text.length()) { + int cp = text.codePointAt(i); + i += Character.charCount(cp); + if (Character.isLetter(cp)) { + totalLetters++; + if (ScriptAwareFeatureExtractor.isCjkOrKana( + Character.toLowerCase(cp))) { + cjkLetters++; + } + } + } + sentences++; + if (sentences >= maxSentences) { + break outer; + } + } + } + } + + if (totalLetters == 0) { + return false; + } + return (double) cjkLetters / totalLetters >= CJK_LETTER_THRESHOLD; + } + + private static List<Path> listTxtFiles(Path dir) throws IOException { + List<Path> files = new ArrayList<>(); + try (DirectoryStream<Path> stream = Files.newDirectoryStream(dir, "*.txt")) { + for (Path p : stream) { + files.add(p); + } + } + Collections.sort(files); + return files; + } + + private static void printUsage() { + System.err.println("Usage: TrainGenerativeLanguageModel"); + System.err.println(" --corpus <corpusDir>"); + System.err.println(" --output <outputFile>"); + System.err.println(" [--max-per-lang <N>] (default 500000)"); + System.err.println(" [--add-k <k>] (default 0.01)"); + System.err.println(" [--cjk lang1,lang2,...] 
(override auto-detection)"); + } +} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java index 31421a12c9..065dec2cad 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaEncodingDetectorTest.java @@ -17,6 +17,7 @@ package org.apache.tika.config; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; @@ -35,6 +36,7 @@ import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.detect.BOMDetector; import org.apache.tika.detect.CompositeEncodingDetector; import org.apache.tika.detect.EncodingDetector; +import org.apache.tika.detect.EncodingResult; import org.apache.tika.detect.MetaEncodingDetector; import org.apache.tika.detect.MetadataCharsetDetector; import org.apache.tika.detect.OverrideEncodingDetector; @@ -46,6 +48,7 @@ import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.ml.chardetect.MojibusterEncodingDetector; import org.apache.tika.parser.AbstractEncodingDetectorParser; import org.apache.tika.parser.CompositeParser; +import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; import org.apache.tika.parser.html.HtmlEncodingDetector; @@ -294,6 +297,33 @@ public class TikaEncodingDetectorTest extends TikaTest { } } + // ----------------------------------------------------------------------- + 
// Solr integration-test regression (TIKA-4662) + // ----------------------------------------------------------------------- + + /** + * ASCII HTML with an explicit {@code <meta charset="UTF-8">} must be + * detected as UTF-8. The full detection chain is required: the HTML + * detector produces a DECLARATIVE UTF-8 result; CharSoupEncodingDetector + * sees that both UTF-8 and the statistical winner (windows-1252) decode + * the pure-ASCII bytes identically and therefore defers to the declaration. + */ + @Test + public void testAsciiHtmlWithMetaIsDetectedAsUtf8() throws Exception { + byte[] bytes = + "<html><head><meta charset=\"UTF-8\"></head><body>initial</body></html>" + .getBytes(StandardCharsets.UTF_8); + EncodingDetector detector = TikaLoader.loadDefault().loadEncodingDetectors(); + try (TikaInputStream tis = TikaInputStream.get(bytes)) { + List<EncodingResult> results = + detector.detect(tis, new Metadata(), new ParseContext()); + assertFalse(results.isEmpty(), "detector returned no result for ASCII HTML with meta"); + assertEquals(StandardCharsets.UTF_8, results.get(0).getCharset(), + "ASCII HTML with <meta charset=UTF-8> should be detected as UTF-8, got: " + + results.get(0).getCharset().name()); + } + } + @Test public void testArabicMisleadingCharsetHtml() throws Exception { // This HTML file is encoded in windows-1256 but declares charset=UTF-8
