This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4585-simplify-serialization in repository https://gitbox.apache.org/repos/asf/tika.git
commit ff1281119dc3b7d4abb5fed2191074a7f9e1ec35 Author: tallison <[email protected]> AuthorDate: Sat Dec 20 22:45:19 2025 -0500 TIKA-4585 -- further progress - WIP --- .../apache/tika/parser/geo/topic/GeoParser.java | 14 +++ .../apache/tika/config/TikaDetectorConfigTest.java | 42 +++++---- .../DoublingContentHandlerDecoratorFactory.java | 2 + .../configs/TIKA-1702-detector-exclude.json | 2 +- .../configs/TIKA-1708-detector-default.json | 2 +- ...2273-encoding-detector-outside-static-init.json | 2 +- ...IKA-2273-exclude-encoding-detector-default.json | 2 +- .../TIKA-2273-no-icu4j-encoding-detector.json | 2 +- .../configs/test-default-with-exclusions.json | 2 +- .../test/resources/configs/tika-4424-config.json | 2 +- .../configs/tika-config-digests-pdf-only.json | 2 +- ...a-config-doubling-custom-handler-decorator.json | 4 +- .../resources/configs/tika-config-lib-pst.json | 2 +- .../resources/configs/tika-config-no-names.json | 5 +- ...a-config-upcasing-custom-handler-decorator.json | 54 +++++------ .../resources/configs/tika-config-with-names.json | 5 +- .../configs/tika-config-write-filter.json | 11 +-- .../test/resources/configs/tika-unrar-config.json | 2 +- .../org/apache/tika/config/TIKA-1558-exclude.json | 10 +-- .../tika/config/TIKA-1702-translator-default.json | 6 +- .../config/TIKA-1702-translator-empty-default.json | 6 +- .../tika/config/TIKA-1702-translator-empty.json | 6 +- .../resources/configs/tika-config-truncate.json | 7 +- .../org/apache/tika/config/loader/TikaLoader.java | 100 ++++++++++++++++++--- .../org/apache/tika/serialization/TikaModule.java | 6 ++ .../serdes/ParseContextDeserializer.java | 33 +++++-- .../serdes/ParseContextSerializer.java | 28 ++++-- 27 files changed, 244 insertions(+), 115 deletions(-) diff --git a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java index a797601e2..6208ade3b 100644 --- a/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java +++ b/tika-parsers/tika-parsers-ml/tika-parser-nlp-module/src/main/java/org/apache/tika/parser/geo/topic/GeoParser.java @@ -33,6 +33,8 @@ import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; +import org.apache.tika.config.ConfigDeserializer; +import org.apache.tika.config.JsonConfig; import org.apache.tika.config.TikaComponent; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; @@ -58,6 +60,18 @@ public class GeoParser implements Parser { private NameFinderME nameFinder; private boolean available; + public GeoParser() { + // Default constructor - uses default GeoParserConfig + } + + public GeoParser(GeoParserConfig config) { + this.defaultConfig = config; + } + + public GeoParser(JsonConfig jsonConfig) { + this(ConfigDeserializer.buildConfig(jsonConfig, GeoParserConfig.class)); + } + @Override public Set<MediaType> getSupportedTypes(ParseContext parseContext) { return SUPPORTED_TYPES; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java index 60efea055..8123b397f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/config/TikaDetectorConfigTest.java @@ -113,29 +113,33 @@ public class TikaDetectorConfigTest extends TikaTest { private void assertDetectors(CompositeDetector detector, boolean shouldHavePOIFS, boolean shouldHaveZip) { - boolean hasZip = false; - boolean hasPOIFS = false; - for (Detector d : detector.getDetectors()) { - if (d instanceof DefaultZipContainerDetector) { - if (shouldHaveZip) { - hasZip = true; - } else { - fail("Shouldn't have the ZipContainerDetector from config"); - } - } - if (d instanceof POIFSContainerDetector) { - if (shouldHavePOIFS) { - hasPOIFS = true; - } else { - fail("Shouldn't have the POIFSContainerDetector from config"); - } - } + boolean hasZip = hasDetectorRecursively(detector, DefaultZipContainerDetector.class); + boolean hasPOIFS = hasDetectorRecursively(detector, POIFSContainerDetector.class); + + if (shouldHaveZip) { + assertTrue(hasZip, "Should have the ZipContainerDetector"); + } else if (hasZip) { + fail("Shouldn't have the ZipContainerDetector from config"); } + if (shouldHavePOIFS) { assertTrue(hasPOIFS, "Should have the POIFSContainerDetector"); + } else if (hasPOIFS) { + fail("Shouldn't have the POIFSContainerDetector from config"); } - if (shouldHaveZip) { - assertTrue(hasZip, "Should have the ZipContainerDetector"); + } + + private boolean hasDetectorRecursively(Detector detector, Class<? extends Detector> targetClass) { + if (targetClass.isInstance(detector)) { + return true; + } + if (detector instanceof CompositeDetector) { + for (Detector child : ((CompositeDetector) detector).getDetectors()) { + if (hasDetectorRecursively(child, targetClass)) { + return true; + } + } } + return false; } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/DoublingContentHandlerDecoratorFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/DoublingContentHandlerDecoratorFactory.java index 60aa7c7d2..1cd2da3c1 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/DoublingContentHandlerDecoratorFactory.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/DoublingContentHandlerDecoratorFactory.java @@ -21,9 +21,11 @@ import java.util.Locale; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; +import org.apache.tika.config.TikaComponent; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; +@TikaComponent public class DoublingContentHandlerDecoratorFactory implements ContentHandlerDecoratorFactory { private static final char[] NEWLINE = new char[]{'\n'}; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1702-detector-exclude.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1702-detector-exclude.json index 80a611f6b..fe356421d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1702-detector-exclude.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1702-detector-exclude.json @@ -3,7 +3,7 @@ "detectors": [ { "default-detector": { - "_exclude": [ + "exclude": [ "default-zip-container-detector", "poifs-container-detector" ] diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1708-detector-default.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1708-detector-default.json index 4c49c1e46..2c2e0e676 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1708-detector-default.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-1708-detector-default.json @@ -3,7 +3,7 @@ "detectors": [ { "default-detector" : { - "_exclude": [ + "exclude": [ "default-zip-container-detector" ] } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-encoding-detector-outside-static-init.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-encoding-detector-outside-static-init.json index c1818466e..2c05becdc 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-encoding-detector-outside-static-init.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-encoding-detector-outside-static-init.json @@ -10,7 +10,7 @@ "encoding-detectors": [ { "default-encoding-detector" : { - "_exclude":["icu4j-encoding-detector"] + "exclude":["icu4j-encoding-detector"] } } ] diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-exclude-encoding-detector-default.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-exclude-encoding-detector-default.json index 56327103c..240924a28 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-exclude-encoding-detector-default.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-exclude-encoding-detector-default.json @@ -3,7 +3,7 @@ "encoding-detectors": [ { "default-encoding-detector": { - "_exclude": [ + "exclude": [ "html-encoding-detector" ] } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-no-icu4j-encoding-detector.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-no-icu4j-encoding-detector.json index b37a45121..809932615 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-no-icu4j-encoding-detector.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/TIKA-2273-no-icu4j-encoding-detector.json @@ -2,7 +2,7 @@ "encoding-detectors": [ { "default-encoding-detector":{ - "_exclude": [ + "exclude": [ "icu4j-encoding-detector" ] } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/test-default-with-exclusions.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/test-default-with-exclusions.json index 5233e290f..c29e0f420 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/test-default-with-exclusions.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/test-default-with-exclusions.json @@ -2,7 +2,7 @@ "parsers": [ { "default-parser": { - "_exclude": ["pdf-parser", "jsoup-parser"] + "exclude": ["pdf-parser", "jsoup-parser"] } } ] diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4424-config.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4424-config.json index 82a03978b..ceea6d09d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4424-config.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-4424-config.json @@ -2,7 +2,7 @@ "detectors": [ { "default-detector": { - "_exclude": ["default-zip-container-detector"] + "exclude": ["default-zip-container-detector"] } } ] diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json index c23c2eb1f..124b07adc 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests-pdf-only.json @@ -2,7 +2,7 @@ "parsers": [ { "default-parser": { - "_exclude": [ + "exclude": [ "pdf-parser" ] } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-doubling-custom-handler-decorator.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-doubling-custom-handler-decorator.json index dcd44b7f3..012142231 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-doubling-custom-handler-decorator.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-doubling-custom-handler-decorator.json @@ -2,8 +2,6 @@ "auto-detect-parser": { "spoolToDisk": 1000, "outputThreshold": 1000, - "contentHandlerDecoratorFactory": { - "@class": "org.apache.tika.sax.DoublingContentHandlerDecoratorFactory" - } + "contentHandlerDecoratorFactory": "doubling-content-handler-decorator-factory" } } \ No newline at end of file diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.json index da45f42ee..1396afc7a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-lib-pst.json @@ -2,7 +2,7 @@ "parsers": [ { "default-parser": { - "_exclude": [ + "exclude": [ "outlook-pst-parser", "pst-mail-item-parser" ] diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json index 5938163df..2f0ac2a2f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-no-names.json @@ -3,8 +3,9 @@ "spoolToDisk": 123450, "outputThreshold": 678900, "embeddedDocumentExtractorFactory": { - "@class": "org.apache.tika.extractor.RUnpackExtractorFactory", - "writeFileNameToContent": false + "runpack-extractor-factory": { + "writeFileNameToContent": false + } } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json index bc2ce44da..a58fa91fc 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-upcasing-custom-handler-decorator.json @@ -6,36 +6,36 @@ "maximumDepth": 1000, "maximumPackageEntryDepth": 1000, "metadataWriteFilterFactory": { - "@class": "org.apache.tika.metadata.writefilter.StandardWriteFilterFactory", - "includeFields": [ - ], - "excludeFields": [ - ], - "maxKeySize": 1024, - "maxFieldSize": 102400, - "maxTotalEstimatedBytes": 10485760, - "maxValuesPerField": 10, - "includeEmpty": false + "standard-write-filter-factory": { + "includeFields": [ + ], + "excludeFields": [ + ], + "maxKeySize": 1024, + "maxFieldSize": 102400, + "maxTotalEstimatedBytes": 10485760, + "maxValuesPerField": 10, + "includeEmpty": false + } }, "embeddedDocumentExtractorFactory": { - "@class": "org.apache.tika.extractor.RUnpackExtractorFactory", - "writeFileNameToContent": true, - "embeddedBytesIncludeMimeTypes": [ - "text/pdf" - ], - "embeddedBytesExcludeMimeTypes": [ - "rtf/application" - ], - "embeddedBytesIncludeEmbeddedResourceTypes": [ - "appended" - ], - "embeddedBytesExcludeEmbeddedResourceTypes": [ - ], - "maxEmbeddedBytesForExtraction": 10737418240 - }, - "contentHandlerDecoratorFactory": { - "@class": "org.apache.tika.sax.UpcasingContentHandlerDecoratorFactory" + "runpack-extractor-factory": { + "writeFileNameToContent": true, + "embeddedBytesIncludeMimeTypes": [ + "text/pdf" + ], + "embeddedBytesExcludeMimeTypes": [ + "rtf/application" + ], + "embeddedBytesIncludeEmbeddedResourceTypes": [ + "appended" + ], + "embeddedBytesExcludeEmbeddedResourceTypes": [ + ], + "maxEmbeddedBytesForExtraction": 10737418240 + } }, + "contentHandlerDecoratorFactory": "upcasing-content-handler-decorator-factory", "skipContainerDocumentDigest": false, "digesterFactory": { "commons-digester-factory": {} diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json index ea1519ec0..0659adb85 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-with-names.json @@ -3,8 +3,9 @@ "spoolToDisk": 123450, "outputThreshold": 678900, "embeddedDocumentExtractorFactory": { - "@class": "org.apache.tika.extractor.RUnpackExtractorFactory", - "writeFileNameToContent": true + "runpack-extractor-factory": { + "writeFileNameToContent": true + } } } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json index 39141be2c..1b6f13c1c 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-write-filter.json @@ -13,11 +13,12 @@ } }, "metadataWriteFilterFactory": { - "@class": "org.apache.tika.metadata.writefilter.StandardWriteFilterFactory", - "includeFields": [ - "X-TIKA-CONTENT", - "dc:creator" - ] + "standard-write-filter-factory": { + "includeFields": [ + "X-TIKA-CONTENT", + "dc:creator" + ] + } }, "throwOnZeroBytes": false } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-unrar-config.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-unrar-config.json index de3fd5b32..5511b90b7 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-unrar-config.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-unrar-config.json @@ -2,7 +2,7 @@ "parsers": [ { "default-parser": { - "_exclude": ["rar-parser"] + "exclude": ["rar-parser"] } }, { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-exclude.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-exclude.json index d25f49d85..72c9653da 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-exclude.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1558-exclude.json @@ -1,14 +1,14 @@ { "parsers": [ { - "default-parser": { - "_exclude": ["executable-parser"], - "_mime-exclude": ["image/jpeg", "application/pdf"] + "empty-parser": { + "_mime-include": ["application/pdf"] } }, { - "empty-parser": { - "_mime-include": ["application/pdf"] + "default-parser": { + "exclude": ["executable-parser"], + "_mime-exclude": ["image/jpeg", "application/pdf"] } } ] diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-default.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-default.json index aa268b6c4..a6f122528 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-default.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-default.json @@ -1,5 +1,5 @@ { - "translator": { - "default-translator": {} - } + "translator": [ + "default-translator" + ] } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty-default.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty-default.json index 73ad08c22..4439a14ba 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty-default.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty-default.json @@ -1,5 +1,5 @@ { - "translator": { - "empty-translator": {} - } + "translator": [ + "empty-translator" + ] } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty.json b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty.json index 73ad08c22..4439a14ba 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty.json +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/org/apache/tika/config/TIKA-1702-translator-empty.json @@ -1,5 +1,5 @@ { - "translator": { - "empty-translator": {} - } + "translator": [ + "empty-translator" + ] } diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json index fd8592020..873ce685a 100644 --- a/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json +++ b/tika-pipes/tika-pipes-integration-tests/src/test/resources/configs/tika-config-truncate.json @@ -56,9 +56,10 @@ "mock-digester-factory": {} }, "embeddedDocumentExtractorFactory": { - "@class": "org.apache.tika.extractor.RUnpackExtractorFactory", - "writeFileNameToContent": false, - "maxEmbeddedBytesForExtraction": 10 + "runpack-extractor-factory": { + "writeFileNameToContent": false, + "maxEmbeddedBytesForExtraction": 10 + } }, "throwOnZeroBytes": false }, diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java index 42f9b5992..115480d6e 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java @@ -35,15 +35,18 @@ import com.fasterxml.jackson.databind.node.ObjectNode; import org.apache.tika.config.GlobalSettings; import org.apache.tika.config.ServiceLoader; import org.apache.tika.detect.CompositeDetector; +import org.apache.tika.detect.CompositeEncodingDetector; import org.apache.tika.detect.DefaultDetector; import org.apache.tika.detect.DefaultEncodingDetector; import org.apache.tika.detect.Detector; import org.apache.tika.detect.EncodingDetector; import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.language.translate.DefaultTranslator; import org.apache.tika.language.translate.Translator; import org.apache.tika.metadata.filter.CompositeMetadataFilter; import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.metadata.filter.NoOpFilter; +import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.mime.MimeTypes; import org.apache.tika.parser.AbstractEncodingDetectorParser; @@ -114,9 +117,17 @@ public class TikaLoader { ComponentConfig.builder("encoding-detectors", EncodingDetector.class) .loadAsList() - .wrapWith(list -> list.isEmpty() - ? new DefaultEncodingDetector() - : new org.apache.tika.detect.CompositeEncodingDetector((List<EncodingDetector>) list)) + .wrapWith(list -> { + if (list.isEmpty()) { + return new DefaultEncodingDetector(); + } else if (list.size() == 1 && list.get(0) instanceof CompositeEncodingDetector) { + // Don't double-wrap if single item is already a CompositeEncodingDetector + // (e.g., DefaultEncodingDetector with exclusions) + return (EncodingDetector) list.get(0); + } else { + return new org.apache.tika.detect.CompositeEncodingDetector((List<EncodingDetector>) list); + } + }) .defaultProvider(DefaultEncodingDetector::new) .register(); @@ -136,6 +147,7 @@ public class TikaLoader { ComponentConfig.builder("translator", Translator.class) .loadAsList() .wrapWith(list -> list.isEmpty() ? null : (Translator) list.get(0)) + .defaultProvider(DefaultTranslator::new) .register(); } @@ -158,8 +170,11 @@ public class TikaLoader { // Pending configs for deferred creation of DefaultParser/DefaultDetector/DefaultEncodingDetector // These are created in post-processing to avoid double-creation private JsonNode pendingDefaultParserConfig; + private int pendingDefaultParserIndex = -1; private JsonNode pendingDefaultDetectorConfig; + private int pendingDefaultDetectorIndex = -1; private JsonNode pendingDefaultEncodingDetectorConfig; + private int pendingDefaultEncodingDetectorIndex = -1; private TikaLoader(TikaJsonConfig config, ClassLoader classLoader) { this.config = config; @@ -599,7 +614,14 @@ public class TikaLoader { // If empty and has default, use default if (componentList.isEmpty()) { if (componentConfig.hasDefault()) { - return componentConfig.getDefault(); + T defaultComponent = componentConfig.getDefault(); + // For Parser defaults, configure dependencies (encoding detector, renderer) + if (componentClass == Parser.class && defaultComponent instanceof Parser) { + List<Parser> singletonList = new ArrayList<>(); + singletonList.add((Parser) defaultComponent); + configureParserDependencies(singletonList); + } + return defaultComponent; } // For components that wrap empty lists if (componentConfig.hasListWrapper()) { @@ -693,21 +715,29 @@ public class TikaLoader { } List<T> components = new ArrayList<>(); + int index = 0; for (Map.Entry<String, JsonNode> entry : entries) { String typeName = entry.getKey(); JsonNode configNode = entry.getValue(); // Defer DefaultParser/DefaultDetector/DefaultEncodingDetector creation to post-processing + // Track the index where it was defined to preserve ordering if ("default-parser".equals(typeName) && componentClass == Parser.class) { pendingDefaultParserConfig = configNode; + pendingDefaultParserIndex = index; + index++; continue; } if ("default-detector".equals(typeName) && componentClass == Detector.class) { pendingDefaultDetectorConfig = configNode; + pendingDefaultDetectorIndex = index; + index++; continue; } if ("default-encoding-detector".equals(typeName) && componentClass == EncodingDetector.class) { pendingDefaultEncodingDetectorConfig = configNode; + pendingDefaultEncodingDetectorIndex = index; + index++; continue; } @@ -719,6 +749,7 @@ public class TikaLoader { // Deserialize using Jackson (TikaModule handles type resolution) T component = objectMapper.treeToValue(wrapperNode, componentClass); components.add(component); + index++; } catch (Exception e) { throw new TikaConfigException( "Failed to load " + componentClass.getSimpleName() + ": " + typeName, e); @@ -733,6 +764,8 @@ public class TikaLoader { /** * Creates DefaultParser (if configured) with config exclusions + auto-exclusions. * Auto-exclusions are the explicit parser types to prevent duplicates. + * Inserts at the original position to preserve ordering. + * Also applies mime filtering (_mime-include/_mime-exclude) if configured. * <p> * Note: EncodingDetector and Renderer are configured later in configureParserDependencies. */ @@ -752,19 +785,49 @@ public class TikaLoader { } // Create DefaultParser with all exclusions - List<Parser> result = new ArrayList<>(parsers); - result.add(new DefaultParser( + Parser defaultParser = new DefaultParser( getMediaTypeRegistry(), new ServiceLoader(classLoader), - exclusions)); + exclusions); + + // Apply mime filtering if configured + Set<MediaType> includeTypes = extractMimeTypes(pendingDefaultParserConfig, "_mime-include"); + Set<MediaType> excludeTypes = extractMimeTypes(pendingDefaultParserConfig, "_mime-exclude"); + if (!includeTypes.isEmpty() || !excludeTypes.isEmpty()) { + defaultParser = ParserDecorator.withMimeFilters(defaultParser, includeTypes, excludeTypes); + } + + // Insert at original position to preserve ordering + List<Parser> result = new ArrayList<>(parsers); + int insertIndex = Math.min(pendingDefaultParserIndex, result.size()); + result.add(insertIndex, defaultParser); pendingDefaultParserConfig = null; + pendingDefaultParserIndex = -1; return result; } + /** + * Extracts mime types from a config node field. + */ + private Set<MediaType> extractMimeTypes(JsonNode configNode, String fieldName) { + Set<MediaType> types = new HashSet<>(); + if (configNode == null || !configNode.has(fieldName)) { + return types; + } + JsonNode arrayNode = configNode.get(fieldName); + if (arrayNode.isArray()) { + for (JsonNode typeNode : arrayNode) { + types.add(MediaType.parse(typeNode.asText())); + } + } + return types; + } + /** * Creates DefaultDetector (if configured) with config exclusions + auto-exclusions. * Auto-exclusions are the explicit detector types to prevent duplicates. + * Inserts at the original position to preserve ordering. */ @SuppressWarnings("unchecked") private List<Detector> applyDetectorAutoExclusions(List<Detector> detectors) throws IOException { @@ -782,19 +845,25 @@ public class TikaLoader { } // Create DefaultDetector with all exclusions - List<Detector> result = new ArrayList<>(detectors); - result.add(new DefaultDetector( + DefaultDetector defaultDetector = new DefaultDetector( getMimeTypes(), new ServiceLoader(classLoader), - exclusions)); + exclusions); + + // Insert at original position to preserve ordering + List<Detector> result = new ArrayList<>(detectors); + int insertIndex = Math.min(pendingDefaultDetectorIndex, result.size()); + result.add(insertIndex, defaultDetector); pendingDefaultDetectorConfig = null; + pendingDefaultDetectorIndex = -1; return result; } /** * Creates DefaultEncodingDetector (if configured) with config exclusions + auto-exclusions. * Auto-exclusions are the explicit encoding detector types to prevent duplicates. + * Inserts at the original position to preserve ordering. */ @SuppressWarnings("unchecked") private List<EncodingDetector> applyEncodingDetectorAutoExclusions(List<EncodingDetector> encodingDetectors) @@ -814,12 +883,17 @@ public class TikaLoader { } // Create DefaultEncodingDetector with all exclusions - List<EncodingDetector> result = new ArrayList<>(encodingDetectors); - result.add(new DefaultEncodingDetector( + DefaultEncodingDetector defaultEncodingDetector = new DefaultEncodingDetector( new ServiceLoader(classLoader), - exclusions)); + exclusions); + + // Insert at original position to preserve ordering + List<EncodingDetector> result = new ArrayList<>(encodingDetectors); + int insertIndex = Math.min(pendingDefaultEncodingDetectorIndex, result.size()); + result.add(insertIndex, defaultEncodingDetector); pendingDefaultEncodingDetectorConfig = null; + pendingDefaultEncodingDetectorIndex = -1; return result; } diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java index 14f0fda5f..b50709702 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java @@ -56,10 +56,12 @@ import org.apache.tika.language.translate.Translator; import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.metadata.writefilter.MetadataWriteFilterFactory; import org.apache.tika.mime.MediaType; +import org.apache.tika.mime.MimeTypes; import org.apache.tika.parser.DefaultParser; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; import org.apache.tika.renderer.Renderer; +import org.apache.tika.sax.ContentHandlerDecoratorFactory; import org.apache.tika.serialization.serdes.DefaultDetectorSerializer; import org.apache.tika.serialization.serdes.DefaultParserSerializer; @@ -100,6 +102,7 @@ public class TikaModule extends SimpleModule { COMPACT_FORMAT_INTERFACES.add(DigesterFactory.class); COMPACT_FORMAT_INTERFACES.add(EmbeddedDocumentExtractorFactory.class); COMPACT_FORMAT_INTERFACES.add(MetadataWriteFilterFactory.class); + COMPACT_FORMAT_INTERFACES.add(ContentHandlerDecoratorFactory.class); } /** @@ -282,6 +285,9 @@ public class TikaModule extends SimpleModule { } else if (clazz == DefaultDetector.class) { throw new IOException("DefaultDetector must be loaded via TikaLoader, not directly " + "via Jackson deserialization. Use TikaLoader.load() to load configuration."); + } else if (clazz == MimeTypes.class) { + // MimeTypes must use the singleton to have all type definitions loaded + instance = MimeTypes.getDefaultMimeTypes(); } else if (cleanedConfig == null || cleanedConfig.isEmpty()) { // If no config, use default constructor instance = clazz.getDeclaredConstructor().newInstance(); diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java index 3839ce57a..997fe6e4f 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java @@ -121,20 +121,37 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { String componentName = fieldNames.next(); JsonNode configNode = typedNode.get(componentName); + Class<?> configClass = null; + + // First, try component registry lookup (for friendly names like "pdf-parser-config") try { - // Look up the class for this component name - Class<?> configClass = ComponentNameResolver.resolveClass( + configClass = ComponentNameResolver.resolveClass( componentName, ParseContextDeserializer.class.getClassLoader()); + } catch (ClassNotFoundException e) { + // Not in registry, try as FQCN + } + + // If not found in registry, try as fully qualified class name + if (configClass == null) { + try { + configClass = Class.forName(componentName); + } catch (ClassNotFoundException e) { + LOG.warn("Could not find class for typed component '{}', storing as JSON config", + componentName); + // Fall back to storing as JSON config + parseContext.setJsonConfig(componentName, mapper.writeValueAsString(configNode)); + continue; + } + } - // Deserialize and add to context + // Deserialize and add to context + try { Object config = mapper.treeToValue(configNode, configClass); parseContext.set((Class) configClass, config); - LOG.debug("Deserialized typed object '{}' -> {}", componentName, configClass.getName()); - } catch (ClassNotFoundException e) { - LOG.warn("Could not find class for typed component '{}', storing as JSON config", - componentName); - // Fall back to storing as JSON config + } catch (Exception e) { + LOG.warn("Failed to deserialize typed component '{}' as {}, storing as JSON config", + componentName, configClass.getName(), e); parseContext.setJsonConfig(componentName, mapper.writeValueAsString(configNode)); } } diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java index 0e966f046..903c48f3e 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java @@ -62,16 +62,26 @@ public class ParseContextSerializer extends JsonSerializer<ParseContext> { for (Map.Entry<String, Object> entry : contextMap.entrySet()) { String className = entry.getKey(); - String componentName = findComponentName(className); - if (componentName != null) { - if (!hasTypedObjects) { - gen.writeFieldName(TYPED); - gen.writeStartObject(); - hasTypedObjects = true; - } - gen.writeFieldName(componentName); - gen.writeRawValue(mapper.writeValueAsString(entry.getValue())); + Object value = entry.getValue(); + + // Skip null values + if (value == null) { + continue; + } + + // Try to find a friendly component name, otherwise use FQCN + String keyName = findComponentName(className); + if (keyName == null) { + keyName = className; + } + + if (!hasTypedObjects) { + gen.writeFieldName(TYPED); + gen.writeStartObject(); + hasTypedObjects = true; } + gen.writeFieldName(keyName); + gen.writeRawValue(mapper.writeValueAsString(value)); } if (hasTypedObjects) {
