This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4690-add-generative-models
in repository https://gitbox.apache.org/repos/asf/tika.git

commit c9d96d81a7dc1d5d5919520e65b79f1075470e8c
Merge: a15df1625d 81c3282368
Author: tballison <[email protected]>
AuthorDate: Tue Mar 10 16:02:48 2026 -0400

    Merge branch 'main' into add-generative-models
    
    Made-with: Cursor
    
    # Conflicts:
    #       tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java

 .../advanced/charsoup-supported-languages.adoc     |   192 +
 .../advanced/lang-detection/flores-AUTOMATIC.log   |   536 +
 .../advanced/lang-detection/flores-SHORT_TEXT.log  |   414 +
 .../advanced/lang-detection/flores-STANDARD.log    |   488 +
 .../advanced/lang-detection/flores200-dev-eval.md  |   204 +
 .../lang-detection/language-drop-decisions.md      |   175 +
 .../short-text-language-decisions.md               |   384 +
 .../advanced/lang-detection/supported-languages.md |   240 +
 .../pages/advanced/language-detection-build.adoc   |   647 +-
 .../ROOT/pages/advanced/language-detection.adoc    |   264 +-
 .../pages/configuration/encoding-detectors.adoc    |   203 +
 docs/modules/ROOT/pages/configuration/index.adoc   |     1 +
 tika-app/pom.xml                                   |     2 +-
 tika-bom/pom.xml                                   |    10 -
 tika-bundles/pom.xml                               |     1 -
 tika-bundles/tika-bundle-standard/pom.xml          |     1 -
 .../tika/language/detect/LanguageDetector.java     |    21 +-
 .../charsoup/CharSoupEncodingDetector.java         |    16 +-
 .../tika-encoding-detector-mojibuster/pom.xml      |     2 +-
 .../ml/chardetect/ZipFilenameDetectionTest.java    |     8 +
 .../apache/tika/eval/app/SimpleComparerTest.java   |    18 +-
 .../resources/test-dirs/extractsB/file1.pdf.json   |     2 +-
 .../src/main/resources/common_tokens/ace           |  1578 +
 .../src/main/resources/common_tokens/afr           | 60006 +++++++++---------
 .../src/main/resources/common_tokens/aka           |  9272 +++
 .../src/main/resources/common_tokens/alt           |  2668 +
 .../src/main/resources/common_tokens/amh           | 19139 +++---
 .../src/main/resources/common_tokens/ami           |  2574 +
 .../src/main/resources/common_tokens/ara           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/arg           | 32637 +++++++---
 .../src/main/resources/common_tokens/arz           | 17123 ------
 .../src/main/resources/common_tokens/asm           | 10793 ++--
 .../src/main/resources/common_tokens/ast           | 30020 ----------
 .../src/main/resources/common_tokens/ava           |  2453 +
 .../src/main/resources/common_tokens/avk           |  6710 +++
 .../src/main/resources/common_tokens/azb           | 26484 ++++++++
 .../src/main/resources/common_tokens/aze           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/bak           | 59988 +++++++++---------
 .../src/main/resources/common_tokens/bam           |  1192 -
 .../src/main/resources/common_tokens/ban           | 12613 ++--
 .../src/main/resources/common_tokens/bar           | 31854 +++++-----
 .../src/main/resources/common_tokens/bcl           | 14470 ++++-
 .../src/main/resources/common_tokens/be-x-old      | 30020 ++++++++++
 .../src/main/resources/common_tokens/bel           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/ben           | 44061 +++++---------
 .../src/main/resources/common_tokens/bjn           |  8705 +--
 .../src/main/resources/common_tokens/bos           | 30020 ----------
 .../src/main/resources/common_tokens/bpy           |  1001 -
 .../src/main/resources/common_tokens/bre           | 47871 +++++++++------
 .../src/main/resources/common_tokens/bua           |  2734 -
 .../src/main/resources/common_tokens/bul           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/bxr           |  3556 ++
 .../src/main/resources/common_tokens/cat           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/cdo-x-rom     |   633 +
 .../src/main/resources/common_tokens/ceb           | 60002 +++++++++---------
 .../src/main/resources/common_tokens/ces           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/che           | 36578 +++++++++--
 .../src/main/resources/common_tokens/chv           | 24222 +++++---
 .../src/main/resources/common_tokens/ckb           | 46397 ++++++++------
 .../src/main/resources/common_tokens/cnh           | 17224 ++++++
 .../src/main/resources/common_tokens/cor           |  3558 ++
 .../src/main/resources/common_tokens/cos           | 10474 +++-
 .../src/main/resources/common_tokens/csb           |  3585 +-
 .../src/main/resources/common_tokens/cym           | 50185 +++++++++-------
 .../src/main/resources/common_tokens/dag           |  4433 ++
 .../src/main/resources/common_tokens/dan           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/deu           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/diq           | 11793 ++--
 .../src/main/resources/common_tokens/div           | 31671 +---------
 .../src/main/resources/common_tokens/dsb           |  3542 +-
 .../src/main/resources/common_tokens/ell           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/eml           |  5000 --
 .../src/main/resources/common_tokens/eng           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/epo           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/est           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/eus           | 59994 +++++++++---------
 .../src/main/resources/common_tokens/ewe           | 12375 +++-
 .../src/main/resources/common_tokens/ext           |  5545 +-
 .../src/main/resources/common_tokens/fao           | 39567 +++---------
 .../src/main/resources/common_tokens/fas           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/fin           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/fra           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/frr           |  6648 +-
 .../src/main/resources/common_tokens/fry           | 59942 +++++++++---------
 .../src/main/resources/common_tokens/gla           |  6667 ++
 .../src/main/resources/common_tokens/gle           | 52183 ++++++++--------
 .../src/main/resources/common_tokens/glg           | 59850 +++++++++---------
 .../src/main/resources/common_tokens/glv           |  6204 +-
 .../src/main/resources/common_tokens/gom           | 13378 ++---
 .../src/main/resources/common_tokens/grn           |  7802 +--
 .../src/main/resources/common_tokens/gsw           | 59486 +++++++++---------
 .../src/main/resources/common_tokens/guj           | 24230 ++------
 .../src/main/resources/common_tokens/hak-x-rom     |   693 +
 .../src/main/resources/common_tokens/hat           |  6627 --
 .../src/main/resources/common_tokens/hau           | 29608 ++++++---
 .../src/main/resources/common_tokens/hbs           | 30020 ----------
 .../src/main/resources/common_tokens/heb           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/hif           |  2137 -
 .../src/main/resources/common_tokens/hil           | 18496 ++++++
 .../src/main/resources/common_tokens/hin           | 36421 ++---------
 .../src/main/resources/common_tokens/hrv           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/hsb           | 13309 ++--
 .../src/main/resources/common_tokens/hun           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/hye           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/hyw           | 19125 ++++++
 .../src/main/resources/common_tokens/ibo           | 19379 +++++-
 .../src/main/resources/common_tokens/ido           | 21997 ++++---
 .../src/main/resources/common_tokens/ile           |  4013 +-
 .../src/main/resources/common_tokens/ilo           | 10135 ++--
 .../src/main/resources/common_tokens/ina           | 15743 ++---
 .../src/main/resources/common_tokens/ind           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/isl           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/ita           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/jav           | 59433 +++++++++---------
 .../src/main/resources/common_tokens/jbo           |  1071 +
 .../src/main/resources/common_tokens/jpn           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/kaa           |  4617 ++
 .../src/main/resources/common_tokens/kab           |  3263 +
 .../src/main/resources/common_tokens/kal           |  6341 --
 .../src/main/resources/common_tokens/kan           | 58858 +++++++++---------
 .../src/main/resources/common_tokens/kat           | 60006 +++++++++---------
 .../src/main/resources/common_tokens/kaz           | 59998 +++++++++---------
 .../src/main/resources/common_tokens/kha           |  9653 +++
 .../src/main/resources/common_tokens/khk           |  4187 --
 .../src/main/resources/common_tokens/khm           |  8745 +++
 .../src/main/resources/common_tokens/kin           | 15225 ++---
 .../src/main/resources/common_tokens/kir           | 60004 +++++++++---------
 .../src/main/resources/common_tokens/koi           |  1373 -
 .../src/main/resources/common_tokens/kom           |  2382 -
 .../src/main/resources/common_tokens/kor           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/kpv           |  3445 ++
 .../src/main/resources/common_tokens/krc           |  1974 -
 .../src/main/resources/common_tokens/ksh           |  5117 +-
 .../src/main/resources/common_tokens/kur           | 32152 ++++++----
 .../src/main/resources/common_tokens/lad           |  1681 -
 .../src/main/resources/common_tokens/lao           |  1479 +-
 .../src/main/resources/common_tokens/lat           | 59988 +++++++++---------
 .../src/main/resources/common_tokens/lav           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/lez           |  3913 ++
 .../src/main/resources/common_tokens/lfn           |  5582 ++
 .../src/main/resources/common_tokens/lim           | 48870 ++++++---------
 .../src/main/resources/common_tokens/lit           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/lmo           |  6924 ---
 .../src/main/resources/common_tokens/ltz           | 60006 +++++++++---------
 .../src/main/resources/common_tokens/lug           | 35119 ++---------
 .../src/main/resources/common_tokens/lup           |   905 -
 .../src/main/resources/common_tokens/lus           | 40004 ++++++++----
 .../src/main/resources/common_tokens/mai           |   755 -
 .../src/main/resources/common_tokens/mal           | 59240 +++++++++---------
 .../src/main/resources/common_tokens/mar           | 39628 +++---------
 .../src/main/resources/common_tokens/mhr           |  9769 +--
 .../src/main/resources/common_tokens/min           | 29209 +++++----
 .../src/main/resources/common_tokens/mkd           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/mlg           | 38614 +++++++++---
 .../src/main/resources/common_tokens/mlt           | 47776 ++++++---------
 .../src/main/resources/common_tokens/mon           | 57279 +++++++++---------
 .../src/main/resources/common_tokens/mri           |  9729 ---
 .../src/main/resources/common_tokens/mrj           |  3902 +-
 .../src/main/resources/common_tokens/msa           | 60006 +++++++++---------
 .../src/main/resources/common_tokens/mwl           | 25861 ++++----
 .../src/main/resources/common_tokens/mya           | 30020 ++++++++++
 .../src/main/resources/common_tokens/myv           |  3154 +-
 .../src/main/resources/common_tokens/mzn           | 10405 ++--
 .../src/main/resources/common_tokens/nan           |  6673 ---
 .../src/main/resources/common_tokens/nap           |  2039 -
 .../src/main/resources/common_tokens/nav           |   533 -
 .../src/main/resources/common_tokens/ndo           |  3142 -
 .../src/main/resources/common_tokens/nds           | 54491 +++++++++--------
 .../src/main/resources/common_tokens/nep           | 35836 ++---------
 .../src/main/resources/common_tokens/new           |  2545 -
 .../src/main/resources/common_tokens/nld           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/nno           | 60006 +++++++++---------
 .../src/main/resources/common_tokens/nob           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/nqo           |  2779 +
 .../src/main/resources/common_tokens/nso           |  5150 +-
 .../src/main/resources/common_tokens/nya           | 30020 ++++++++++
 .../src/main/resources/common_tokens/olo           |  2220 +
 .../src/main/resources/common_tokens/ori           | 12884 ++--
 .../src/main/resources/common_tokens/orm           | 31991 +++++++++-
 .../src/main/resources/common_tokens/oss           |  7575 ++-
 .../src/main/resources/common_tokens/pam           |  6310 +-
 .../src/main/resources/common_tokens/pan           | 11564 +---
 .../src/main/resources/common_tokens/pap           | 12598 ++--
 .../src/main/resources/common_tokens/pfl           |  4325 +-
 .../src/main/resources/common_tokens/pms           |  6552 --
 .../src/main/resources/common_tokens/pnb           | 57576 +++++++++---------
 .../src/main/resources/common_tokens/pol           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/por           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/prs           | 12167 ----
 .../src/main/resources/common_tokens/pus           | 51255 +++++++++-------
 .../src/main/resources/common_tokens/que           |  2170 -
 .../src/main/resources/common_tokens/roh           | 33539 ++++-------
 .../src/main/resources/common_tokens/ron           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/rue           |  6615 +-
 .../src/main/resources/common_tokens/run           |  3534 --
 .../src/main/resources/common_tokens/rus           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/sah           | 31037 +++++-----
 .../src/main/resources/common_tokens/san           | 14998 ++---
 .../src/main/resources/common_tokens/sat           |  6387 ++
 .../src/main/resources/common_tokens/scn           |  7559 ---
 .../src/main/resources/common_tokens/sco           | 12070 ----
 .../src/main/resources/common_tokens/sgs           |  5547 +-
 .../src/main/resources/common_tokens/sin           | 34762 +++++------
 .../src/main/resources/common_tokens/skr           |  6326 ++
 .../src/main/resources/common_tokens/slk           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/slv           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/sme           |  5891 +-
 .../src/main/resources/common_tokens/smi           |  1676 -
 .../src/main/resources/common_tokens/smn           |  2934 +
 .../src/main/resources/common_tokens/smo           | 24490 ++++++++
 .../src/main/resources/common_tokens/sna           | 29768 ++-------
 .../src/main/resources/common_tokens/snd           | 26767 +++++----
 .../src/main/resources/common_tokens/som           | 32074 +++-------
 .../src/main/resources/common_tokens/sot           |  3535 --
 .../src/main/resources/common_tokens/spa           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/sqi           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/srd           |  3796 --
 .../src/main/resources/common_tokens/srp           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/ssw           |  2035 -
 .../src/main/resources/common_tokens/stq           |  3423 ++
 .../src/main/resources/common_tokens/sun           | 45923 ++++++--------
 .../src/main/resources/common_tokens/swe           | 60006 +++++++++---------
 .../src/main/resources/common_tokens/swh           | 28444 +++++++--
 .../src/main/resources/common_tokens/szl           |  8763 ++-
 .../src/main/resources/common_tokens/szy           |  4825 ++
 .../src/main/resources/common_tokens/tam           | 44673 +++++---------
 .../src/main/resources/common_tokens/tat           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/tay           |  1220 +
 .../src/main/resources/common_tokens/tel           | 44436 +++++---------
 .../src/main/resources/common_tokens/tet           | 24501 ++++++++
 .../src/main/resources/common_tokens/tgk           | 57085 +++++++++---------
 .../src/main/resources/common_tokens/tgl           | 59972 +++++++++---------
 .../src/main/resources/common_tokens/tha           | 53250 +++++++---------
 .../src/main/resources/common_tokens/tir           | 30020 ++++++++++
 .../src/main/resources/common_tokens/trv           |  3333 +
 .../src/main/resources/common_tokens/tsn           | 17815 ++++--
 .../src/main/resources/common_tokens/tso           | 15810 +++--
 .../src/main/resources/common_tokens/tuk           | 30868 ++++------
 .../src/main/resources/common_tokens/tum           |  4881 ++
 .../src/main/resources/common_tokens/tur           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/tyv           |  8541 ++-
 .../src/main/resources/common_tokens/udm           | 31389 +++++++++-
 .../src/main/resources/common_tokens/uig           | 31403 +++++-----
 .../src/main/resources/common_tokens/ukr           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/urd           | 60006 +++++++++---------
 .../src/main/resources/common_tokens/uzb           | 59970 +++++++++---------
 .../src/main/resources/common_tokens/uzn           | 30020 ----------
 .../src/main/resources/common_tokens/ven           |  2457 -
 .../src/main/resources/common_tokens/vep           |  7276 +++
 .../src/main/resources/common_tokens/vie           | 51687 +++++++---------
 .../src/main/resources/common_tokens/vls           | 15292 ++---
 .../src/main/resources/common_tokens/vol           |  7216 +--
 .../src/main/resources/common_tokens/vro           |  4730 +-
 .../src/main/resources/common_tokens/war           | 55938 ++++++++---------
 .../src/main/resources/common_tokens/wln           | 12282 ++--
 .../src/main/resources/common_tokens/wuu           | 30020 ----------
 .../src/main/resources/common_tokens/xho           | 51611 +++++++++-------
 .../src/main/resources/common_tokens/xmf           | 15787 +++--
 .../src/main/resources/common_tokens/ydd           | 21613 ++++---
 .../src/main/resources/common_tokens/yor           |  7833 ++-
 .../src/main/resources/common_tokens/yue           |  9176 +++
 .../src/main/resources/common_tokens/zea           |  2318 -
 .../src/main/resources/common_tokens/zho           | 60008 +++++++++----------
 .../src/main/resources/common_tokens/zul           | 35021 ++---------
 .../core/metadata/TikaEvalMetadataFilterTest.java  |    13 +-
 .../tika/eval/core/textstats/TextStatsTest.java    |    28 +-
 .../core/tokens/tools/CommonTokenGenerator.java    |    92 +-
 tika-example/pom.xml                               |     2 +-
 .../java/org/apache/tika/example/Language.java     |     5 +-
 .../tika/example/LanguageDetectorExample.java      |     3 +-
 .../java/org/apache/tika/example/MyFirstTika.java  |     3 +-
 .../tika/example/LanguageDetectorExampleTest.java  |     2 +-
 .../tika/pipes/kafka/tests/TikaPipesKafkaTest.java |     5 +-
 tika-langdetect/pom.xml                            |     3 +-
 .../charsoup/CharSoupFeatureExtractor.java         |    12 +-
 .../tika/langdetect/charsoup/CharSoupModel.java    |   126 +-
 .../tika/langdetect/charsoup/FeatureExtractor.java |    39 +
 .../charsoup/ScriptAwareFeatureExtractor.java      |   248 +-
 .../tika/langdetect/charsoup/ScriptCategory.java   |    13 +-
 .../charsoup/ShortTextFeatureExtractor.java        |   348 +
 .../charsoup/langdetect-short-v1-20260310.bin      |   Bin 0 -> 3999308 bytes
 .../langdetect/charsoup/langdetect-v7-20260306.bin |   Bin 0 -> 3328628 bytes
 .../apache/tika/langdetect/charsoup/langdetect.bin |   Bin 1641016 -> 0 bytes
 tika-langdetect/tika-langdetect-charsoup/pom.xml   |    12 +
 .../charsoup/CharSoupDetectorConfig.java           |   120 +
 .../charsoup/CharSoupLanguageDetector.java         |   680 +-
 .../charsoup/CharSoupMetadataFilter.java           |    62 +
 .../tika/langdetect/charsoup/ConfusableGroups.java |    72 +
 .../src/main/python/extract_madlad_to_wiki.py      |   182 +
 .../tika/langdetect/charsoup/confusables.txt       |    52 +
 .../charsoup/CharSoupDetectorConfigTest.java       |   125 +
 .../charsoup/CharSoupFeatureExtractorTest.java     |     2 +-
 .../charsoup/CharSoupModelRoutingTest.java         |   281 +
 .../langdetect/charsoup/LangIdRegressionTest.java  |    25 +-
 .../charsoup/ScriptAwareFeatureExtractorTest.java  |    33 +-
 .../langdetect/charsoup/SjisLangSignalTest.java    |    59 +-
 .../langdetect/charsoup/tools/AblationRunner.java  |   652 +-
 .../charsoup/tools/BucketSaturationAnalyzer.java   |    76 +-
 .../charsoup/tools/CalibrateConfidence.java        |   217 +
 .../charsoup/tools/CompareDetectors.java           |  1470 +-
 .../langdetect/charsoup/tools/ConfusionDump.java   |   184 +
 .../langdetect/charsoup/tools/CorpusReader.java    |    73 +-
 .../langdetect/charsoup/tools/CrossDomainEval.java |   309 +-
 .../charsoup/tools/DiagnoseUnknownScript.java      |   145 +
 .../charsoup/tools/KoreanFalsePositives.java       |   146 +
 .../langdetect/charsoup/tools/ModelQuantizer.java  |    28 +-
 .../langdetect/charsoup/tools/Phase2SmokeTest.java |     2 +-
 .../langdetect/charsoup/tools/Phase2Trainer.java   |   102 +
 .../langdetect/charsoup/tools/PrepareCorpus.java   |   992 +
 .../langdetect/charsoup/tools/QuickF1Eval.java     |   134 +-
 .../charsoup/tools/ResearchFeatureExtractor.java   |   457 +
 .../charsoup/tools/TrainLanguageModel.java         |   982 +-
 .../langdetect/charsoup/tools/TrainShortModel.java |   179 +
 .../src/test/python/check_script_consistency.py    |   253 +
 .../src/test/python/clean_madlad.py                |   308 +
 .../src/test/python/collect_wikipedia.py           |   556 +
 .../src/test/python/diagnose_kor_eng.py            |   256 +
 .../src/test/python/eval_fasttext.py               |   290 +
 .../src/test/python/filter_contamination.py        |   240 +
 .../src/test/python/filter_pashto.py               |    66 +-
 .../src/test/python/filter_uppercase.py            |   145 +
 .../src/test/python/summarize_wikipedia.py         |   170 +
 tika-langdetect/tika-langdetect-tika/pom.xml       |    75 -
 .../tika/langdetect/tika/LanguageIdentifier.java   |   260 -
 .../tika/langdetect/tika/LanguageProfile.java      |   317 -
 .../langdetect/tika/LanguageProfilerBuilder.java   |   767 -
 .../tika/langdetect/tika/ProfilingWriter.java      |   103 -
 .../tika/langdetect/tika/TikaLanguageDetector.java |    92 -
 ...rg.apache.tika.language.detect.LanguageDetector |    16 -
 .../org/apache/tika/langdetect/tika/be.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/ca.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/da.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/de.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/el.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/en.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/eo.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/es.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/et.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/fa.ngp         |  1015 -
 .../org/apache/tika/langdetect/tika/fi.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/fr.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/gl.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/hu.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/is.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/it.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/lt.ngp         |  1209 -
 .../org/apache/tika/langdetect/tika/nl.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/no.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/pl.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/pt.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/ro.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/ru.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/sk.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/sl.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/sv.ngp         |  1014 -
 .../org/apache/tika/langdetect/tika/th.ngp         |  1014 -
 .../tika/langdetect/tika/tika.language.properties  |    56 -
 .../org/apache/tika/langdetect/tika/uk.ngp         |  1014 -
 .../langdetect/tika/LanguageIdentifierTest.java    |   185 -
 .../tika/langdetect/tika/LanguageProfileTest.java  |    58 -
 .../tika/LanguageProfilerBuilderTest.java          |    96 -
 .../tika/langdetect/tika/ProfilingHandler.java     |    67 -
 .../tika/langdetect/tika/ProfilingWriterTest.java  |    45 -
 .../org/apache/tika/langdetect/tika/da.test        |   108 -
 .../org/apache/tika/langdetect/tika/de.test        |   104 -
 .../org/apache/tika/langdetect/tika/el.test        |   109 -
 .../org/apache/tika/langdetect/tika/en.test        |   105 -
 .../org/apache/tika/langdetect/tika/es.test        |   107 -
 .../org/apache/tika/langdetect/tika/et.test        |    17 -
 .../org/apache/tika/langdetect/tika/fi.test        |   106 -
 .../org/apache/tika/langdetect/tika/fr.test        |   105 -
 .../org/apache/tika/langdetect/tika/it.test        |   109 -
 .../langdetect/tika/langbuilder/welsh_corpus.txt   |  2602 -
 .../org/apache/tika/langdetect/tika/lt.test        |    32 -
 .../org/apache/tika/langdetect/tika/nl.test        |   105 -
 .../org/apache/tika/langdetect/tika/pt.test        |   105 -
 .../org/apache/tika/langdetect/tika/sv.test        |   108 -
 .../charsoup/CharSoupFeatureExtractor.java         |   456 -
 .../tika/langdetect/charsoup/ScriptCategory.java   |   117 -
 .../langdetect/charsoup/TextFeatureExtractor.java  |    59 -
 .../tika/langdetect/charsoup/WordTokenizer.java    |   225 -
 .../apache/tika/langdetect/charsoup/langdetect.bin |   Bin 1641016 -> 0 bytes
 .../apache/tika/parser/pkg/PackageParserTest.java  |     3 +
 tika-server/tika-server-core/pom.xml               |     2 +-
 .../server/core/resource/LanguageResource.java     |    24 +-
 .../server/core/resource/TranslateResource.java    |     4 +-
 .../tika/server/core/LanguageResourceTest.java     |     2 +-
 tika-server/tika-server-standard/pom.xml           |     6 +
 tika-translate/pom.xml                             |     2 +-
 .../translate/impl/AbstractTranslator.java         |    24 +-
 .../translate/impl/JoshuaNetworkTranslator.java    |     2 +-
 391 files changed, 3482069 insertions(+), 3445298 deletions(-)

diff --cc tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java
index 2680c11a70,81871ae319..c97a6bdf6a
--- a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java
+++ b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/ScriptAwareFeatureExtractor.java
@@@ -274,16 -325,12 +325,14 @@@ public class ScriptAwareFeatureExtracto
  
      private static boolean isSpace(int cp) {
          return cp == ' ' || cp == '\t'
-                 || Character.getType(cp)
-                 == Character.SPACE_SEPARATOR;
+                 || Character.getType(cp) == Character.SPACE_SEPARATOR;
      }
  
 -    static boolean isCjkOrKana(int cp) {
 -        if (Character.isIdeographic(cp)) return true;
 +    public static boolean isCjkOrKana(int cp) {
 +        if (Character.isIdeographic(cp)) {
 +            return true;
 +        }
-         Character.UnicodeScript us =
-                 Character.UnicodeScript.of(cp);
+         Character.UnicodeScript us = Character.UnicodeScript.of(cp);
          return us == Character.UnicodeScript.HIRAGANA
                  || us == Character.UnicodeScript.KATAKANA;
      }

Reply via email to