This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4585-simplify-serialization in repository https://gitbox.apache.org/repos/asf/tika.git
commit fb4cb1c5a67bfcacfba73ce1b7c995f9cc61b387 Author: tallison <[email protected]> AuthorDate: Sat Dec 20 21:28:22 2025 -0500 TIKA-4585 -- further progress - WIP --- .../org/apache/tika/parser/image/PSDParser.java | 19 ++ .../apache/tika/parser/csv/TextAndCSVParser.java | 2 +- .../apache/tika/parser/strings/StringsParser.java | 11 +- .../org/apache/tika/config/loader/TikaLoader.java | 249 ++++++++++++++------- .../org/apache/tika/serialization/TikaModule.java | 15 +- .../serdes/DefaultDetectorDeserializer.java | 48 ---- .../serdes/DefaultParserDeserializer.java | 48 ---- .../serdes/SpiCompositeDeserializer.java | 97 -------- 8 files changed, 210 insertions(+), 279 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/PSDParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/PSDParser.java index e77947e87..dc4e6072e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/PSDParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/PSDParser.java @@ -30,6 +30,8 @@ import org.apache.commons.io.input.UnsynchronizedByteArrayInputStream; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; +import org.apache.tika.config.ConfigDeserializer; +import org.apache.tika.config.JsonConfig; import org.apache.tika.config.TikaComponent; import org.apache.tika.exception.TikaException; import org.apache.tika.exception.TikaMemoryLimitException; @@ -70,6 +72,16 @@ public class PSDParser implements Parser { private int maxDataLengthBytes = MAX_DATA_LENGTH_BYTES; + public PSDParser() { + } + + public PSDParser(JsonConfig jsonConfig) { + PSDParserConfig config = ConfigDeserializer.buildConfig(jsonConfig, PSDParserConfig.class); + if (config != null && config.maxDataLengthBytes > 0) { + this.maxDataLengthBytes = config.maxDataLengthBytes; + } + } + public Set<MediaType> getSupportedTypes(ParseContext context) { return SUPPORTED_TYPES; } @@ -271,4 +283,11 @@ public class PSDParser implements Parser { return new String(data, 0, data.length - 1, US_ASCII); } } + + /** + * Configuration class for PSDParser. + */ + public static class PSDParserConfig { + public int maxDataLengthBytes; + } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java index c5d0ffe48..66396adf2 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/csv/TextAndCSVParser.java @@ -70,7 +70,7 @@ import org.apache.tika.sax.XHTMLContentHandler; * {@link org.apache.commons.csv.CSVParser} is lost. * </p> */ -@TikaComponent +@TikaComponent(name = "text-and-csv-parser") public class TextAndCSVParser extends AbstractEncodingDetectorParser { static final MediaType CSV = MediaType.text("csv"); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java index 427eabbeb..aac74f624 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java @@ -34,7 +34,9 @@ import org.apache.commons.io.IOUtils; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; +import org.apache.tika.config.ConfigDeserializer; import org.apache.tika.config.Initializable; +import org.apache.tika.config.JsonConfig; import org.apache.tika.config.TikaComponent; import org.apache.tika.detect.FileCommandDetector; import org.apache.tika.exception.TikaConfigException; @@ -67,7 +69,7 @@ public class StringsParser implements Parser, Initializable { private static final Set<MediaType> SUPPORTED_TYPES = Collections.singleton(MediaType.OCTET_STREAM); - private final StringsConfig defaultStringsConfig = new StringsConfig(); + private StringsConfig defaultStringsConfig = new StringsConfig(); private String filePath = ""; @@ -78,6 +80,13 @@ public class StringsParser implements Parser, Initializable { private String stringsPath = ""; + public StringsParser() { + } + + public StringsParser(JsonConfig jsonConfig) { + defaultStringsConfig = ConfigDeserializer.buildConfig(jsonConfig, StringsConfig.class); + } + public static String getStringsProg() { return SystemUtils.IS_OS_WINDOWS ? "strings.exe" : "strings"; } diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java index 5748d894b..42f9b5992 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java @@ -46,11 +46,14 @@ import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.metadata.filter.NoOpFilter; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.mime.MimeTypes; +import org.apache.tika.parser.AbstractEncodingDetectorParser; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.AutoDetectParserConfig; import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.DefaultParser; import org.apache.tika.parser.Parser; +import org.apache.tika.parser.ParserDecorator; +import org.apache.tika.parser.RenderingParser; import org.apache.tika.renderer.CompositeRenderer; import org.apache.tika.renderer.Renderer; import org.apache.tika.serialization.ComponentConfig; @@ -152,6 +155,12 @@ public class TikaLoader { private ConfigLoader configLoader; private GlobalSettings globalSettings; + // Pending configs for deferred creation of DefaultParser/DefaultDetector/DefaultEncodingDetector + // These are created in post-processing to avoid double-creation + private JsonNode pendingDefaultParserConfig; + private JsonNode pendingDefaultDetectorConfig; + private JsonNode pendingDefaultEncodingDetectorConfig; + private TikaLoader(TikaJsonConfig config, ClassLoader classLoader) { this.config = config; this.classLoader = classLoader; @@ -581,16 +590,10 @@ public class TikaLoader { private <T> T loadComponent(ComponentConfig<T> componentConfig) throws TikaConfigException, IOException { Class<T> componentClass = componentConfig.getComponentClass(); - // Handle dependencies: Parsers need EncodingDetectors and Renderers loaded first - if (componentClass == Parser.class) { - get(EncodingDetector.class); - get(Renderer.class); - } - // Load the component list List<T> componentList = loadComponentList(componentConfig.getJsonField(), componentClass); - // Apply post-processing (auto-exclusions for Parser/Detector) + // Apply post-processing (auto-exclusions for Parser/Detector, configure dependencies) componentList = applyPostProcessing(componentClass, componentList); // If empty and has default, use default @@ -616,22 +619,66 @@ public class TikaLoader { /** * Applies post-processing to component lists. - * Currently handles auto-exclusions for Parser and Detector. + * Handles auto-exclusions and deferred creation for Parser, Detector, and EncodingDetector. + * Also sets EncodingDetector and Renderer on parsers that implement the appropriate interfaces. */ @SuppressWarnings("unchecked") - private <T> List<T> applyPostProcessing(Class<T> componentClass, List<T> list) { + private <T> List<T> applyPostProcessing(Class<T> componentClass, List<T> list) throws IOException { if (componentClass == Parser.class) { - return (List<T>) applyParserAutoExclusions((List<Parser>) list); + List<Parser> parsers = applyParserAutoExclusions((List<Parser>) list); + configureParserDependencies(parsers); + return (List<T>) parsers; } else if (componentClass == Detector.class) { return (List<T>) applyDetectorAutoExclusions((List<Detector>) list); + } else if (componentClass == EncodingDetector.class) { + return (List<T>) applyEncodingDetectorAutoExclusions((List<EncodingDetector>) list); } return list; } + /** + * Configures EncodingDetector and Renderer on all parsers. + * Recursively walks through CompositeParser children (including DefaultParser). + */ + private void configureParserDependencies(List<Parser> parsers) throws IOException { + EncodingDetector encodingDetector = get(EncodingDetector.class); + Renderer renderer = get(Renderer.class); + + for (Parser parser : parsers) { + // Recursively configure all parsers including DefaultParser's children + configureParserRecursively(parser, encodingDetector, renderer); + } + } + + /** + * Recursively configures a parser and its children with EncodingDetector and Renderer. + */ + private void configureParserRecursively(Parser parser, EncodingDetector encodingDetector, + Renderer renderer) { + if (encodingDetector != null && parser instanceof AbstractEncodingDetectorParser) { + ((AbstractEncodingDetectorParser) parser).setEncodingDetector(encodingDetector); + } + if (renderer != null && parser instanceof RenderingParser) { + ((RenderingParser) parser).setRenderer(renderer); + } + if (parser instanceof CompositeParser) { + for (Parser child : ((CompositeParser) parser).getAllComponentParsers()) { + configureParserRecursively(child, encodingDetector, renderer); + } + } else if (parser instanceof ParserDecorator) { + configureParserRecursively(((ParserDecorator) parser).getWrappedParser(), + encodingDetector, renderer); + } + } + // ==================== Component List Loading ==================== /** * Loads a list of components from the JSON configuration. + * <p> + * DefaultParser and DefaultDetector are handled specially - their configs are stored + * for deferred creation in post-processing to avoid double-creation when auto-exclusions + * are needed. * * @param jsonField the JSON field name (e.g., "parsers", "detectors") * @param componentClass the component class @@ -650,6 +697,20 @@ public class TikaLoader { String typeName = entry.getKey(); JsonNode configNode = entry.getValue(); + // Defer DefaultParser/DefaultDetector/DefaultEncodingDetector creation to post-processing + if ("default-parser".equals(typeName) && componentClass == Parser.class) { + pendingDefaultParserConfig = configNode; + continue; + } + if ("default-detector".equals(typeName) && componentClass == Detector.class) { + pendingDefaultDetectorConfig = configNode; + continue; + } + if ("default-encoding-detector".equals(typeName) && componentClass == EncodingDetector.class) { + pendingDefaultEncodingDetectorConfig = configNode; + continue; + } + try { // Create wrapper node: { "type-name": {...config...} } ObjectNode wrapperNode = objectMapper.createObjectNode(); @@ -670,91 +731,127 @@ public class TikaLoader { // ==================== Auto-Exclusion Post-Processing ==================== /** - * Applies auto-exclusion to parsers: when both explicit parsers and DefaultParser - * are configured, the explicit parser classes are automatically excluded from - * DefaultParser's SPI loading to prevent duplicates. + * Creates DefaultParser (if configured) with config exclusions + auto-exclusions. + * Auto-exclusions are the explicit parser types to prevent duplicates. + * <p> + * Note: EncodingDetector and Renderer are configured later in configureParserDependencies. */ @SuppressWarnings("unchecked") - private List<Parser> applyParserAutoExclusions(List<Parser> parsers) { - // Find all explicitly configured parser types (not DefaultParser) - Set<Class<?>> explicitTypes = new HashSet<>(); - for (Parser p : parsers) { - if (!(p instanceof DefaultParser)) { - explicitTypes.add(p.getClass()); - } - } - - // If no explicit types, no auto-exclusions needed - but still preserve the list - if (explicitTypes.isEmpty()) { + private List<Parser> applyParserAutoExclusions(List<Parser> parsers) throws IOException { + // If no DefaultParser was configured, just return the list + if (pendingDefaultParserConfig == null) { return parsers; } - // Recreate any DefaultParser with explicit types as auto-exclusions - List<Parser> adjusted = new ArrayList<>(); + // Parse exclusions from config + Set<Class<? extends Parser>> exclusions = parseExclusions(pendingDefaultParserConfig, Parser.class); + + // Add auto-exclusions (explicit parser types) for (Parser p : parsers) { - if (p instanceof DefaultParser dp) { - // Combine explicit exclusions from config with auto-exclusions - Set<Class<? extends Parser>> combinedExclusions = new HashSet<>(); - - // Add config-specified exclusions - combinedExclusions.addAll(dp.getExcludedClasses()); - - // Add auto-exclusions (explicit parser types) - combinedExclusions.addAll((Set<Class<? extends Parser>>) (Set<?>) explicitTypes); - - adjusted.add(new DefaultParser( - getMediaTypeRegistry(), - new ServiceLoader(classLoader), - combinedExclusions)); - } else { - adjusted.add(p); - } + exclusions.add((Class<? extends Parser>) p.getClass()); } - return adjusted; + + // Create DefaultParser with all exclusions + List<Parser> result = new ArrayList<>(parsers); + result.add(new DefaultParser( + getMediaTypeRegistry(), + new ServiceLoader(classLoader), + exclusions)); + + pendingDefaultParserConfig = null; + return result; } /** - * Applies auto-exclusion to detectors: when both explicit detectors and DefaultDetector - * are configured, the explicit detector classes are automatically excluded from - * DefaultDetector's SPI loading to prevent duplicates. + * Creates DefaultDetector (if configured) with config exclusions + auto-exclusions. + * Auto-exclusions are the explicit detector types to prevent duplicates. */ @SuppressWarnings("unchecked") - private List<Detector> applyDetectorAutoExclusions(List<Detector> detectors) { - // Find all explicitly configured detector types (not DefaultDetector) - Set<Class<?>> explicitTypes = new HashSet<>(); + private List<Detector> applyDetectorAutoExclusions(List<Detector> detectors) throws IOException { + // If no DefaultDetector was configured, just return the list + if (pendingDefaultDetectorConfig == null) { + return detectors; + } + + // Parse exclusions from config + Set<Class<? extends Detector>> exclusions = parseExclusions(pendingDefaultDetectorConfig, Detector.class); + + // Add auto-exclusions (explicit detector types) for (Detector d : detectors) { - if (!(d instanceof DefaultDetector)) { - explicitTypes.add(d.getClass()); - } + exclusions.add((Class<? extends Detector>) d.getClass()); } - // If no explicit types, no auto-exclusions needed - but still preserve the list - if (explicitTypes.isEmpty()) { - return detectors; + // Create DefaultDetector with all exclusions + List<Detector> result = new ArrayList<>(detectors); + result.add(new DefaultDetector( + getMimeTypes(), + new ServiceLoader(classLoader), + exclusions)); + + pendingDefaultDetectorConfig = null; + return result; + } + + /** + * Creates DefaultEncodingDetector (if configured) with config exclusions + auto-exclusions. + * Auto-exclusions are the explicit encoding detector types to prevent duplicates. + */ + @SuppressWarnings("unchecked") + private List<EncodingDetector> applyEncodingDetectorAutoExclusions(List<EncodingDetector> encodingDetectors) + throws IOException { + // If no DefaultEncodingDetector was configured, just return the list + if (pendingDefaultEncodingDetectorConfig == null) { + return encodingDetectors; } - // Recreate any DefaultDetector with explicit types as auto-exclusions - List<Detector> adjusted = new ArrayList<>(); - for (Detector d : detectors) { - if (d instanceof DefaultDetector dd) { - // Combine explicit exclusions from config with auto-exclusions - Set<Class<? extends Detector>> combinedExclusions = new HashSet<>(); - - // Add config-specified exclusions - combinedExclusions.addAll(dd.getExcludedClasses()); - - // Add auto-exclusions (explicit detector types) - combinedExclusions.addAll((Set<Class<? extends Detector>>) (Set<?>) explicitTypes); - - adjusted.add(new DefaultDetector( - getMimeTypes(), - new ServiceLoader(classLoader), - combinedExclusions)); - } else { - adjusted.add(d); + // Parse exclusions from config + Set<Class<? extends EncodingDetector>> exclusions = + parseExclusions(pendingDefaultEncodingDetectorConfig, EncodingDetector.class); + + // Add auto-exclusions (explicit encoding detector types) + for (EncodingDetector ed : encodingDetectors) { + exclusions.add((Class<? extends EncodingDetector>) ed.getClass()); + } + + // Create DefaultEncodingDetector with all exclusions + List<EncodingDetector> result = new ArrayList<>(encodingDetectors); + result.add(new DefaultEncodingDetector( + new ServiceLoader(classLoader), + exclusions)); + + pendingDefaultEncodingDetectorConfig = null; + return result; + } + + /** + * Parses exclusions from a config node. + * Supports both "exclude" and "_exclude" field names. + */ + @SuppressWarnings("unchecked") + private <T> Set<Class<? extends T>> parseExclusions(JsonNode configNode, Class<T> componentClass) + throws IOException { + Set<Class<? extends T>> exclusions = new HashSet<>(); + + if (configNode == null || !configNode.isObject()) { + return exclusions; + } + + JsonNode excludeNode = configNode.has("exclude") ? + configNode.get("exclude") : configNode.get("_exclude"); + + if (excludeNode != null && excludeNode.isArray()) { + for (JsonNode item : excludeNode) { + String typeName = item.asText(); + try { + Class<?> clazz = ComponentNameResolver.resolveClass(typeName, classLoader); + exclusions.add((Class<? extends T>) clazz); + } catch (ClassNotFoundException e) { + throw new IOException("Unknown type in exclude list: " + typeName, e); + } } } - return adjusted; + + return exclusions; } // ==================== Serialization ==================== diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java index 845cdec3c..14f0fda5f 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaModule.java @@ -60,9 +60,7 @@ import org.apache.tika.parser.DefaultParser; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; import org.apache.tika.renderer.Renderer; -import org.apache.tika.serialization.serdes.DefaultDetectorDeserializer; import org.apache.tika.serialization.serdes.DefaultDetectorSerializer; -import org.apache.tika.serialization.serdes.DefaultParserDeserializer; import org.apache.tika.serialization.serdes.DefaultParserSerializer; /** @@ -277,23 +275,24 @@ public class TikaModule extends SimpleModule { try { Object instance; - // Handle DefaultParser and DefaultDetector with dedicated deserializers + // DefaultParser and DefaultDetector must be loaded via TikaLoader for proper dependency injection if (clazz == DefaultParser.class) { - instance = deserializeWithNode(new DefaultParserDeserializer(), cleanedConfig, mapper); + throw new IOException("DefaultParser must be loaded via TikaLoader, not directly " + + "via Jackson deserialization. Use TikaLoader.load() to load configuration."); } else if (clazz == DefaultDetector.class) { - instance = deserializeWithNode(new DefaultDetectorDeserializer(), cleanedConfig, mapper); + throw new IOException("DefaultDetector must be loaded via TikaLoader, not directly " + + "via Jackson deserialization. Use TikaLoader.load() to load configuration."); } else if (cleanedConfig == null || cleanedConfig.isEmpty()) { // If no config, use default constructor instance = clazz.getDeclaredConstructor().newInstance(); } else if (SelfConfiguring.class.isAssignableFrom(clazz)) { - // SelfConfiguring components must use JsonConfig constructor + // SelfConfiguring components: prefer JsonConfig constructor if available Constructor<?> jsonConfigCtor = findJsonConfigConstructor(clazz); if (jsonConfigCtor != null) { String json = mapper.writeValueAsString(cleanedConfig); instance = jsonConfigCtor.newInstance((JsonConfig) () -> json); } else { - throw new IOException("SelfConfiguring component '" + typeName + - "' must have a constructor that accepts JsonConfig, but none was found."); + instance = clazz.getDeclaredConstructor().newInstance(); } } else { // Non-SelfConfiguring: use Jackson bean deserialization diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/DefaultDetectorDeserializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/DefaultDetectorDeserializer.java deleted file mode 100644 index 97ce76029..000000000 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/DefaultDetectorDeserializer.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.serialization.serdes; - -import java.util.Collection; - -import org.apache.tika.config.ServiceLoader; -import org.apache.tika.detect.DefaultDetector; -import org.apache.tika.detect.Detector; -import org.apache.tika.mime.MimeTypes; - -/** - * Deserializer for DefaultDetector that handles exclusions. - * <p> - * Supports JSON formats: - * <pre> - * "default-detector" - * { "default-detector": {} } - * { "default-detector": { "exclude": ["html-detector", "zip-detector"] } } - * </pre> - */ -public class DefaultDetectorDeserializer extends SpiCompositeDeserializer<DefaultDetector> { - - @Override - @SuppressWarnings("unchecked") - protected DefaultDetector createInstance(Collection<Class<?>> excludedClasses) { - if (excludedClasses == null || excludedClasses.isEmpty()) { - return new DefaultDetector(); - } - return new DefaultDetector(MimeTypes.getDefaultMimeTypes(), - new ServiceLoader(), - (Collection<Class<? extends Detector>>) (Collection<?>) excludedClasses); - } -} diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/DefaultParserDeserializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/DefaultParserDeserializer.java deleted file mode 100644 index ffc491e61..000000000 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/DefaultParserDeserializer.java +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.serialization.serdes; - -import java.util.Collection; - -import org.apache.tika.config.ServiceLoader; -import org.apache.tika.mime.MediaTypeRegistry; -import org.apache.tika.parser.DefaultParser; -import org.apache.tika.parser.Parser; - -/** - * Deserializer for DefaultParser that handles exclusions. - * <p> - * Supports JSON formats: - * <pre> - * "default-parser" - * { "default-parser": {} } - * { "default-parser": { "exclude": ["html-parser", "xml-parser"] } } - * </pre> - */ -public class DefaultParserDeserializer extends SpiCompositeDeserializer<DefaultParser> { - - @Override - @SuppressWarnings("unchecked") - protected DefaultParser createInstance(Collection<Class<?>> excludedClasses) { - if (excludedClasses == null || excludedClasses.isEmpty()) { - return new DefaultParser(); - } - return new DefaultParser(MediaTypeRegistry.getDefaultRegistry(), - new ServiceLoader(), - (Collection<Class<? extends Parser>>) (Collection<?>) excludedClasses); - } -} diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/SpiCompositeDeserializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/SpiCompositeDeserializer.java deleted file mode 100644 index 0218ca3d8..000000000 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/SpiCompositeDeserializer.java +++ /dev/null @@ -1,97 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.serialization.serdes; - -import java.io.IOException; -import java.util.Collection; -import java.util.HashSet; -import java.util.Set; - -import com.fasterxml.jackson.core.JsonParser; -import com.fasterxml.jackson.databind.DeserializationContext; -import com.fasterxml.jackson.databind.JsonDeserializer; -import com.fasterxml.jackson.databind.JsonNode; - -import org.apache.tika.serialization.ComponentNameResolver; - -/** - * Abstract base deserializer for SPI-loaded composite types that support exclusions. - * <p> - * Handles JSON like: - * <pre> - * { "exclude": ["html-detector", "zip-detector"] } - * </pre> - * or simply an empty object: - * <pre> - * {} - * </pre> - * <p> - * Note: The outer type wrapper (e.g., "default-detector") is handled by TikaModule. - * This deserializer receives just the inner config object. - * <p> - * Subclasses implement {@link #createInstance(Collection)} to create the appropriate - * composite type with the exclusions applied. - * - * @param <T> the composite type (e.g., DefaultDetector, DefaultParser) - */ -public abstract class SpiCompositeDeserializer<T> extends JsonDeserializer<T> { - - @Override - public T deserialize(JsonParser p, DeserializationContext ctxt) throws IOException { - JsonNode node = p.readValueAsTree(); - Collection<Class<?>> excludedClasses = parseExclusions(node); - return createInstance(excludedClasses); - } - - /** - * Parse exclusions from config node. - * Supports both "exclude" and "_exclude" field names. - */ - protected Collection<Class<?>> parseExclusions(JsonNode node) throws IOException { - Set<Class<?>> excludedClasses = new HashSet<>(); - - if (node == null || !node.isObject()) { - return excludedClasses; - } - - // Support both "exclude" and "_exclude" for compatibility - JsonNode excludeNode = node.has("exclude") ? node.get("exclude") : node.get("_exclude"); - - if (excludeNode != null && excludeNode.isArray()) { - for (JsonNode item : excludeNode) { - String typeName = item.asText(); - try { - Class<?> clazz = ComponentNameResolver.resolveClass(typeName, - Thread.currentThread().getContextClassLoader()); - excludedClasses.add(clazz); - } catch (ClassNotFoundException e) { - throw new IOException("Unknown type in exclude list: " + typeName, e); - } - } - } - - return excludedClasses; - } - - /** - * Create an instance of the composite type with the specified exclusions. - * - * @param excludedClasses classes to exclude from SPI loading - * @return the new instance - */ - protected abstract T createInstance(Collection<Class<?>> excludedClasses); -}
