This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4690-add-generative-models in repository https://gitbox.apache.org/repos/asf/tika.git
commit 65cceb58cd9687bff0674a0d27948ebf8acb4195 Author: tballison <[email protected]> AuthorDate: Tue Mar 10 16:39:19 2026 -0400 Update documentation, further improvements... --- docs/modules/ROOT/nav.adoc | 1 + .../ROOT/pages/advanced/language-detection.adoc | 19 +++++ .../charsoup/CharSoupEncodingDetector.java | 97 +++++++++++++++++++--- .../org/apache/tika/eval/app/ExtractProfiler.java | 1 + .../org/apache/tika/eval/app/ProfilerBase.java | 21 ++++- .../java/org/apache/tika/eval/app/db/Cols.java | 3 +- .../eval/app/reports/MarkdownSummaryWriter.java | 42 ++++++++-- .../eval/core/metadata/TikaEvalMetadataFilter.java | 34 +++++++- .../core/metadata/TikaEvalMetadataFilterTest.java | 6 ++ .../charsoup/GenerativeLanguageModel.java | 22 +++++ 10 files changed, 226 insertions(+), 20 deletions(-) diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index 5bd5b504e7..1702591425 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -40,6 +40,7 @@ * xref:advanced/index.adoc[Advanced] ** xref:advanced/charset-detection-design.adoc[Charset Detection Pipeline] ** xref:advanced/language-detection.adoc[Language Detection] +** xref:advanced/generative-language-model.adoc[Generative Language Model] ** xref:advanced/language-detection-build.adoc[Building the Language Detector] ** xref:advanced/robustness.adoc[Robustness] ** xref:advanced/setting-limits.adoc[Setting Limits] diff --git a/docs/modules/ROOT/pages/advanced/language-detection.adoc b/docs/modules/ROOT/pages/advanced/language-detection.adoc index 5c773a60cf..a8158af65b 100644 --- a/docs/modules/ROOT/pages/advanced/language-detection.adoc +++ b/docs/modules/ROOT/pages/advanced/language-detection.adoc @@ -204,6 +204,25 @@ CharSoupDetectorConfig cfg = CharSoupDetectorConfig.fromMap(Map.of( Or via Tika's JSON configuration mechanism if you are using `SelfConfiguring` component loading. +== Generative Language Model + +In addition to the discriminative models above, Tika ships a +**generative character n-gram model** (`langdetect-generative-v1.bin`) that +answers a complementary question: _how language-like is this text?_ + +The generative model is used for: + +* **Charset detection tiebreaking** — when the discriminative model cannot + distinguish candidate charsets, the generative model picks the one that + produces the most language-like decoded text. +* **Text quality scoring** — the `tika-eval:languageness` metadata field + provides a z-score indicating how normal or garbled the extracted text is. +* **Training data filtering** — flagging bot-generated or mixed-language + sentences in training corpora. + +For full details, see +xref:advanced/generative-language-model.adoc[Generative Language Model]. + == Training the Models Training is fully reproducible from source. For step-by-step instructions, diff --git a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java index c318e9a5dc..c02543ffa2 100644 --- a/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java +++ b/tika-encoding-detectors/tika-encoding-detector-charsoup/src/main/java/org/apache/tika/langdetect/charsoup/CharSoupEncodingDetector.java @@ -31,6 +31,9 @@ import java.util.List; import java.util.Map; import java.util.Set; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import org.apache.tika.config.TikaComponent; import org.apache.tika.detect.EncodingDetectorContext; import org.apache.tika.detect.EncodingResult; @@ -66,8 +69,31 @@ public class CharSoupEncodingDetector implements MetaEncodingDetector { private static final long serialVersionUID = 1L; + private static final Logger LOG = LoggerFactory.getLogger(CharSoupEncodingDetector.class); + private static final int DEFAULT_READ_LIMIT = 16384; + private static final String GLM_RESOURCE = GenerativeLanguageModel.DEFAULT_MODEL_RESOURCE; + + /** + * Minimum z-score for the generative-model tiebreaker to consider a + * candidate "language-like enough" to win. Candidates below this are + * treated as mojibake. + */ + private static final float MIN_GENERATIVE_ZSCORE = -4.0f; + + private static final GenerativeLanguageModel GLM; + + static { + GenerativeLanguageModel glm = null; + try { + glm = GenerativeLanguageModel.loadFromClasspath(GLM_RESOURCE); + } catch (IOException ignore) { + // Model not on classpath — generative tiebreaker unavailable + } + GLM = glm; + } + /** * Symmetric confusable peer groups: within each group, encoding variants * (e.g. ISO-8859-6 vs windows-1256) produce different decoded text for the @@ -171,14 +197,20 @@ public class CharSoupEncodingDetector implements MetaEncodingDetector { CharSoupLanguageDetector langDetector = new CharSoupLanguageDetector(); Charset bestCharset = langDetector.compareLanguageSignal(candidates); if (bestCharset == null) { - // Language signal inconclusive. When a DECLARATIVE result (HTML meta charset, - // BOM, HTTP Content-Type) exists and decodes the bytes at least as cleanly as - // the statistical fallback, trust the declaration. This covers: - // • Pure-ASCII probe (both decodings identical) — prefer the declared charset. - // • Probe with high bytes that are valid in BOTH charsets (e.g. Cyrillic in a - // page that starts with ASCII JavaScript) — the bytes look "clean" in both - // windows-1252 (decoded as Latin Extended) and windows-1251 (decoded as - // Cyrillic), so junkRatio cannot distinguish them; trust the declaration. + // Discriminative model inconclusive. Try generative model as tiebreaker. + Charset generativeWinner = generativeTiebreak(candidates); + if (generativeWinner != null) { + context.setArbitrationInfo("scored-inconclusive-generative-tiebreak"); + return generativeWinner; + } + + // Generative model also inconclusive. When a DECLARATIVE result + // (HTML meta charset, BOM, HTTP Content-Type) exists and decodes + // the bytes at least as cleanly as the statistical fallback, + // trust the declaration. This covers: + // • Pure-ASCII probe (both decodings identical) — prefer declared. + // • Probe with high bytes valid in BOTH charsets (e.g. Cyrillic + // in a page starting with ASCII JavaScript). Charset fallback = firstResult.getCharset(); String fallbackDecoded = candidates.get(fallback); float fallbackJunk = fallbackDecoded != null @@ -190,9 +222,6 @@ public class CharSoupEncodingDetector implements MetaEncodingDetector { String declaredDecoded = candidates.get(r.getCharset()); float declaredJunk = declaredDecoded != null ? CharSoupLanguageDetector.junkRatio(declaredDecoded) : 1f; - // Trust the declaration when it decodes at least as cleanly as - // the statistical fallback (≤ junk). A declaration that produces - // MORE junk than the fallback is likely wrong (e.g. a lying BOM). if (declaredJunk <= fallbackJunk) { cleanerDeclared = r.getCharset(); break; @@ -261,6 +290,52 @@ public class CharSoupEncodingDetector implements MetaEncodingDetector { return bestCharset; } + /** + * Generative-model tiebreaker: for each candidate charset's decoded text, + * detect the most likely language then compute its z-score. The charset + * producing the highest z-score (closest to "real language") wins, provided + * it exceeds {@link #MIN_GENERATIVE_ZSCORE}. + * + * @return the winning charset, or {@code null} if the generative model is + * unavailable or no candidate passes the threshold + */ + private static <K> K generativeTiebreak(Map<K, String> candidates) { + if (GLM == null || candidates.isEmpty()) { + return null; + } + + float bestZ = Float.NEGATIVE_INFINITY; + K bestKey = null; + + for (Map.Entry<K, String> entry : candidates.entrySet()) { + String text = entry.getValue(); + if (text == null || text.isEmpty()) { + continue; + } + if (CharSoupLanguageDetector.junkRatio(text) > 0.10f) { + continue; + } + Map.Entry<String, Float> match = GLM.bestMatch(text); + if (match == null) { + continue; + } + float z = GLM.zScoreLengthAdjusted(text, match.getKey()); + LOG.debug("generativeTiebreak: {} -> lang={} z={}", + entry.getKey(), match.getKey(), z); + if (!Float.isNaN(z) && z > bestZ) { + bestZ = z; + bestKey = entry.getKey(); + } + } + + if (bestZ < MIN_GENERATIVE_ZSCORE) { + LOG.debug("generativeTiebreak: inconclusive (bestZ={} < {})", + bestZ, MIN_GENERATIVE_ZSCORE); + return null; + } + return bestKey; + } + /** * Strip any leading byte-order mark from {@code bytes}, returning the * suffix after the BOM, or the original array if no BOM is found. diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java index 72ea50a139..0073f9ddbb 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ExtractProfiler.java @@ -60,6 +60,7 @@ public class ExtractProfiler extends ProfilerBase { new ColInfo(Cols.NUM_UNIQUE_TOKENS, Types.INTEGER), new ColInfo(Cols.NUM_TOKENS, Types.INTEGER), new ColInfo(Cols.COMMON_TOKENS_LANG, Types.VARCHAR, 12), new ColInfo(Cols.NUM_UNIQUE_COMMON_TOKENS, Types.INTEGER), new ColInfo(Cols.NUM_COMMON_TOKENS, Types.INTEGER), new ColInfo(Cols.NUM_UNIQUE_ALPHABETIC_TOKENS, Types.INTEGER), new ColInfo(Cols.NUM_ALPHABETIC_TOKENS, Types.INTEGER), new ColInfo(Cols.OOV, Types.DOUBLE), + new ColInfo(Cols.LANGUAGENESS, Types.DOUBLE), new ColInfo(Cols.TOP_N_TOKENS, Types.VARCHAR, 1024), new ColInfo(Cols.LANG_ID_1, Types.VARCHAR, 12), new ColInfo(Cols.LANG_ID_PROB_1, Types.FLOAT), new ColInfo(Cols.LANG_ID_2, Types.VARCHAR, 12), new ColInfo(Cols.LANG_ID_PROB_2, Types.FLOAT), new ColInfo(Cols.UNICODE_CHAR_BLOCKS, Types.VARCHAR, 1024), new ColInfo(Cols.TOKEN_ENTROPY_RATE, Types.FLOAT), new ColInfo(Cols.TOKEN_LENGTH_SUM, Types.INTEGER), new ColInfo(Cols.TOKEN_LENGTH_MEAN, Types.FLOAT), diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java index 0189d25d62..9e083b4057 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/ProfilerBase.java @@ -46,6 +46,7 @@ import org.apache.tika.eval.app.db.TableInfo; import org.apache.tika.eval.app.io.ExtractReaderException; import org.apache.tika.eval.app.io.IDBWriter; import org.apache.tika.eval.core.langid.LanguageIDWrapper; +import org.apache.tika.eval.core.metadata.TikaEvalMetadataFilter; import org.apache.tika.eval.core.textstats.BasicTokenCountStatsCalculator; import org.apache.tika.eval.core.textstats.CommonTokens; import org.apache.tika.eval.core.textstats.CompositeTextStatsCalculator; @@ -64,6 +65,7 @@ import org.apache.tika.eval.core.util.ContentTagParser; import org.apache.tika.eval.core.util.ContentTags; import org.apache.tika.eval.core.util.EvalExceptionUtils; import org.apache.tika.exception.TikaException; +import org.apache.tika.langdetect.charsoup.GenerativeLanguageModel; import org.apache.tika.language.detect.LanguageResult; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.PDF; @@ -460,9 +462,23 @@ public abstract class ProfilerBase { if (content == null || content.isBlank()) { content = ""; } - return compositeTextStatsCalculator.calculate(content); + Map<Class, Object> results = compositeTextStatsCalculator.calculate(content); + + GenerativeLanguageModel glm = TikaEvalMetadataFilter.getGenerativeModel(); + if (glm != null && !content.isEmpty()) { + List<LanguageResult> langs = + (List<LanguageResult>) results.get(LanguageIDWrapper.class); + if (langs != null && !langs.isEmpty()) { + float z = glm.zScoreLengthAdjusted(content, langs.get(0).getLanguage()); + results.put(LanguagenessMarker.class, z); + } + } + return results; } + /** Map-key sentinel so we can stash the z-score in the textStats map. */ + static final class LanguagenessMarker { } + /** * Checks to see if metadata is null or content is empty (null or only whitespace). * If any of these, then this does no processing, and the fileId is not @@ -495,6 +511,9 @@ public abstract class ProfilerBase { double oov = commonTokenResult.getAlphabeticTokens() > 0 ? commonTokenResult.getOOV() : -1.0; data.put(Cols.OOV, Double.toString(oov)); } + Float zScore = (Float) textStats.get(LanguagenessMarker.class); + double langness = (zScore != null && !Float.isNaN(zScore)) ? zScore : -99.0; + data.put(Cols.LANGUAGENESS, Double.toString(langness)); TokenCounts tokenCounts = (TokenCounts) textStats.get(BasicTokenCountStatsCalculator.class); if (tokenCounts != null) { diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java index 725c22e487..55b78423a8 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/db/Cols.java @@ -30,7 +30,8 @@ public enum Cols { //content CONTENT_LENGTH, NUM_UNIQUE_TOKENS, NUM_TOKENS, NUM_UNIQUE_ALPHABETIC_TOKENS, NUM_ALPHABETIC_TOKENS, //alphabetic or ideographic tokens COMMON_TOKENS_LANG, //which language was used for the common tokens metric? - NUM_UNIQUE_COMMON_TOKENS, NUM_COMMON_TOKENS, TOP_N_TOKENS, LANG_ID_1, LANG_ID_PROB_1, LANG_ID_2, OOV, LANG_ID_PROB_2, TOKEN_ENTROPY_RATE, TOKEN_LENGTH_SUM, TOKEN_LENGTH_MEAN, + NUM_UNIQUE_COMMON_TOKENS, NUM_COMMON_TOKENS, TOP_N_TOKENS, LANG_ID_1, LANG_ID_PROB_1, LANG_ID_2, + OOV, LANGUAGENESS, LANG_ID_PROB_2, TOKEN_ENTROPY_RATE, TOKEN_LENGTH_SUM, TOKEN_LENGTH_MEAN, TOKEN_LENGTH_STD_DEV, UNICODE_CHAR_BLOCKS, NUM_PAGES, //number of pages a document alleges it has NUM_OCR_PAGES, CONTENT_TRUNCATED_AT_MAX_LEN, // was the string truncated at AbstractProfiler.MAX_STRING_LENGTH diff --git a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/MarkdownSummaryWriter.java b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/MarkdownSummaryWriter.java index 0681e57887..dfc2f5f8cb 100644 --- a/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/MarkdownSummaryWriter.java +++ b/tika-eval/tika-eval-app/src/main/java/org/apache/tika/eval/app/reports/MarkdownSummaryWriter.java @@ -465,9 +465,9 @@ public class MarkdownSummaryWriter { private static void writeOovComparison(Connection c, BufferedWriter w) throws IOException, SQLException { - w.write("## Out-of-Vocabulary (OOV) Rate Changes\n\n"); - w.write("Files where OOV rate increased significantly in B " + - "(possible mojibake or encoding regression).\n\n"); + w.write("## Out-of-Vocabulary (OOV) and Languageness Changes\n\n"); + w.write("Files where OOV rate increased or languageness z-score " + + "decreased in B (possible mojibake or encoding regression).\n\n"); w.write("### By Mime Type (aggregate)\n\n"); writeQueryAsTable(c, w, @@ -475,7 +475,10 @@ public class MarkdownSummaryWriter { "count(1) as FILES, " + "round(avg(ca.oov), 4) as MEAN_OOV_A, " + "round(avg(cb.oov), 4) as MEAN_OOV_B, " + - "round(avg(cb.oov) - avg(ca.oov), 4) as OOV_DELTA " + + "round(avg(cb.oov) - avg(ca.oov), 4) as OOV_DELTA, " + + "round(avg(ca.languageness), 2) as MEAN_LANG_A, " + + "round(avg(cb.languageness), 2) as MEAN_LANG_B, " + + "round(avg(cb.languageness) - avg(ca.languageness), 2) as LANG_DELTA " + "from contents_a ca " + "join contents_b cb on ca.id = cb.id " + "join profiles_a pa on ca.id = pa.id " + @@ -493,8 +496,10 @@ public class MarkdownSummaryWriter { "round(ca.oov, 4) as OOV_A, " + "round(cb.oov, 4) as OOV_B, " + "round(cb.oov - ca.oov, 4) as OOV_DELTA, " + - "ca.lang_id_1 as LANG_A, " + - "cb.lang_id_1 as LANG_B " + + "round(ca.languageness, 2) as LANG_A, " + + "round(cb.languageness, 2) as LANG_B, " + + "ca.lang_id_1 as LANG_ID_A, " + + "cb.lang_id_1 as LANG_ID_B " + "from contents_a ca " + "join contents_b cb on ca.id = cb.id " + "join profiles_a pa on ca.id = pa.id " + @@ -506,6 +511,31 @@ public class MarkdownSummaryWriter { "and (cb.oov - ca.oov) > 0.1 " + "order by (cb.oov - ca.oov) desc " + "limit " + TOP_N); + + w.write("\n### Top " + TOP_N + " Languageness Decreases\n\n"); + w.write("Files where the languageness z-score dropped the most " + + "(text became less language-like in B).\n\n"); + writeQueryAsTable(c, w, + "select c.file_path as FILE, " + + "ma.mime_string as MIME_A, " + + "round(ca.languageness, 2) as LANG_A, " + + "round(cb.languageness, 2) as LANG_B, " + + "round(cb.languageness - ca.languageness, 2) as LANG_DELTA, " + + "round(ca.oov, 4) as OOV_A, " + + "round(cb.oov, 4) as OOV_B, " + + "ca.lang_id_1 as LANG_ID_A, " + + "cb.lang_id_1 as LANG_ID_B " + + "from contents_a ca " + + "join contents_b cb on ca.id = cb.id " + + "join profiles_a pa on ca.id = pa.id " + + "join containers c on pa.container_id = c.container_id " + + "join mimes ma on ma.mime_id = pa.mime_id " + + "where pa.is_embedded = false " + + "and ca.languageness > -90 and cb.languageness > -90 " + + "and ca.num_tokens > 10 " + + "and (cb.languageness - ca.languageness) < -1.0 " + + "order by (cb.languageness - ca.languageness) asc " + + "limit " + TOP_N); w.write("\n"); } diff --git a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java index 5e788d9596..5d890b9989 100644 --- a/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java +++ b/tika-eval/tika-eval-core/src/main/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilter.java @@ -16,6 +16,7 @@ */ package org.apache.tika.eval.core.metadata; +import java.io.IOException; import java.util.ArrayList; import java.util.List; import java.util.Map; @@ -29,6 +30,7 @@ import org.apache.tika.eval.core.textstats.CompositeTextStatsCalculator; import org.apache.tika.eval.core.textstats.TextStatsCalculator; import org.apache.tika.eval.core.tokens.CommonTokenResult; import org.apache.tika.eval.core.tokens.TokenCounts; +import org.apache.tika.langdetect.charsoup.GenerativeLanguageModel; import org.apache.tika.language.detect.LanguageResult; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; @@ -37,6 +39,8 @@ import org.apache.tika.metadata.filter.MetadataFilterBase; public class TikaEvalMetadataFilter extends MetadataFilterBase { + private static final String GLM_RESOURCE = GenerativeLanguageModel.DEFAULT_MODEL_RESOURCE; + public static String TIKA_EVAL_NS = "tika-eval" + TikaCoreProperties.NAMESPACE_PREFIX_DELIMITER; public static Property NUM_TOKENS = Property.externalInteger(TIKA_EVAL_NS + "numTokens"); @@ -60,17 +64,35 @@ public class TikaEvalMetadataFilter extends MetadataFilterBase { public static Property OUT_OF_VOCABULARY = Property.externalReal(TIKA_EVAL_NS + "oov"); + public static Property LANGUAGENESS = Property.externalReal(TIKA_EVAL_NS + "languageness"); static CompositeTextStatsCalculator TEXT_STATS_CALCULATOR; + private static GenerativeLanguageModel GLM; static { List<TextStatsCalculator> calcs = new ArrayList<>(); calcs.add(new BasicTokenCountStatsCalculator()); calcs.add(new CommonTokens()); TEXT_STATS_CALCULATOR = new CompositeTextStatsCalculator(calcs); + + GenerativeLanguageModel glm = null; + try { + glm = GenerativeLanguageModel.loadFromClasspath(GLM_RESOURCE); + } catch (IOException e) { + // Model not on classpath — languageness scores will be -99 + } + GLM = glm; } + /** + * Returns the shared generative language model, or {@code null} if + * the model binary is not on the classpath. + */ + public static GenerativeLanguageModel getGenerativeModel() { + return GLM; + } + @Override public void filter(Metadata metadata) { String content = metadata.get(TikaCoreProperties.TIKA_CONTENT); @@ -102,10 +124,20 @@ public class TikaEvalMetadataFilter extends MetadataFilterBase { //languages List<LanguageResult> probabilities = (List<LanguageResult>) results.get(LanguageIDWrapper.class); + String detectedLang = null; if (probabilities.size() > 0) { - metadata.set(LANGUAGE, probabilities.get(0).getLanguage()); + detectedLang = probabilities.get(0).getLanguage(); + metadata.set(LANGUAGE, detectedLang); metadata.set(LANGUAGE_CONFIDENCE, probabilities.get(0).getRawScore()); } + + //languageness z-score from generative model + if (GLM != null && detectedLang != null) { + float z = GLM.zScoreLengthAdjusted(content, detectedLang); + metadata.set(LANGUAGENESS, Float.isNaN(z) ? -99.0f : z); + } else { + metadata.set(LANGUAGENESS, -99.0f); + } } } diff --git a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java index d6c980b61e..34bf98bc62 100644 --- a/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java +++ b/tika-eval/tika-eval-core/src/test/java/org/apache/tika/eval/core/metadata/TikaEvalMetadataFilterTest.java @@ -17,6 +17,7 @@ package org.apache.tika.eval.core.metadata; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertTrue; import java.util.ArrayList; import java.util.List; @@ -62,6 +63,11 @@ public class TikaEvalMetadataFilterTest { double langConf = Double.parseDouble( metadata.get(TikaEvalMetadataFilter.LANGUAGE_CONFIDENCE)); assertEquals(1.0, langConf, 0.1); + + double languageness = Double.parseDouble( + metadata.get(TikaEvalMetadataFilter.LANGUAGENESS)); + assertTrue(languageness > -5.0, + "Expected reasonable languageness z-score for English text, got " + languageness); } } } diff --git a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java index 33a6bb38fd..ff228debaa 100644 --- a/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java +++ b/tika-langdetect/tika-langdetect-charsoup-core/src/main/java/org/apache/tika/langdetect/charsoup/GenerativeLanguageModel.java @@ -81,6 +81,10 @@ public class GenerativeLanguageModel { public static final int NONCJK_BIGRAM_BUCKETS = 8_192; public static final int NONCJK_TRIGRAM_BUCKETS = 16_384; + /** Default classpath resource path for the bundled generative model. */ + public static final String DEFAULT_MODEL_RESOURCE = + "/org/apache/tika/langdetect/charsoup/langdetect-generative-v1-20260310.bin"; + /** * Quantization floor. Log-probabilities below this value are clamped * before quantizing; values stored in the table never go lower. @@ -505,6 +509,24 @@ public class GenerativeLanguageModel { // ---- Serialization ---- + /** + * Load a model from a classpath resource. + * + * @param resourcePath absolute classpath path, e.g. + * {@code "/org/apache/tika/langdetect/charsoup/langdetect-generative-v1-20260310.bin"} + * @return the loaded model + * @throws IOException if the resource is missing or malformed + */ + public static GenerativeLanguageModel loadFromClasspath(String resourcePath) + throws IOException { + try (InputStream is = GenerativeLanguageModel.class.getResourceAsStream(resourcePath)) { + if (is == null) { + throw new IOException("Classpath resource not found: " + resourcePath); + } + return load(is); + } + } + /** * Deserialize a model from the GLM1 binary format. */
