This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4641 in repository https://gitbox.apache.org/repos/asf/tika.git
commit fe2967598d115f447f7df99fab6ec1c12e698181 Author: tallison <[email protected]> AuthorDate: Fri Jan 30 16:35:24 2026 -0500 TIKA-4641 -- simplify API and add duplicate detection --- .../ROOT/pages/advanced/setting-limits.adoc | 40 +++--- docs/modules/ROOT/pages/advanced/spooling.adoc | 11 +- .../ROOT/pages/configuration/digesters.adoc | 62 ++++----- .../org/apache/tika/config/loader/TikaLoader.java | 139 +++++++++------------ .../serdes/ParseContextDeserializer.java | 64 +++++++++- .../serdes/ParseContextSerializer.java | 14 +++ .../java/org/apache/tika/config/AllLimitsTest.java | 9 +- .../org/apache/tika/config/EmbeddedLimitsTest.java | 3 +- .../org/apache/tika/config/OutputLimitsTest.java | 3 +- .../org/apache/tika/config/TimeoutLimitsTest.java | 3 +- .../tika/config/loader/ConfigLoaderTest.java | 100 ++++++++------- .../apache/tika/config/loader/TikaLoaderTest.java | 47 +++++++ .../TestParseContextSerialization.java | 64 ++++++++++ 13 files changed, 360 insertions(+), 199 deletions(-) diff --git a/docs/modules/ROOT/pages/advanced/setting-limits.adoc b/docs/modules/ROOT/pages/advanced/setting-limits.adoc index 4dd02a4dd0..8064a4d88e 100644 --- a/docs/modules/ROOT/pages/advanced/setting-limits.adoc +++ b/docs/modules/ROOT/pages/advanced/setting-limits.adoc @@ -23,7 +23,7 @@ Tika provides several mechanisms for limiting resource usage during parsing. == Overview -Tika 4.x provides a unified configuration system for all limits through the `other-configs` +Tika 4.x provides a unified configuration system for all limits through the `parse-context` section of the JSON configuration file. All limits are loaded into the `ParseContext` and flow through the parsing pipeline. @@ -36,7 +36,7 @@ This is the same configuration tested in `AllLimitsTest.java`: ---- { "parsers": ["default-parser"], - "other-configs": { + "parse-context": { "embedded-limits": { "maxDepth": 10, "throwOnMaxDepth": false, @@ -54,13 +54,11 @@ This is the same configuration tested in `AllLimitsTest.java`: "timeout-limits": { "taskTimeoutMillis": 60000 }, - "metadata-write-limiter-factory": { - "standard-metadata-limiter-factory": { - "maxTotalBytes": 1048576, - "maxFieldSize": 102400, - "maxKeySize": 1024, - "maxValuesPerField": 100 - } + "standard-metadata-limiter-factory": { + "maxTotalBytes": 1048576, + "maxFieldSize": 102400, + "maxKeySize": 1024, + "maxValuesPerField": 100 } } } @@ -138,7 +136,7 @@ container.zip (depth 0) [source,json] ---- { - "other-configs": { + "parse-context": { "embedded-limits": { "maxDepth": 5, "throwOnMaxDepth": true, @@ -215,7 +213,7 @@ and protection against zip bombs. [source,json] ---- { - "other-configs": { + "parse-context": { "output-limits": { "writeLimit": 50000, "throwOnWriteLimit": true, @@ -263,7 +261,7 @@ The `TimeoutLimits` class controls time-based limits for parsing operations. [source,json] ---- { - "other-configs": { + "parse-context": { "timeout-limits": { "taskTimeoutMillis": 120000 } @@ -358,16 +356,14 @@ Use this to extract only the metadata you need. ---- { "parsers": ["default-parser"], - "other-configs": { - "metadata-write-limiter-factory": { - "standard-metadata-limiter-factory": { - "maxTotalBytes": 1048576, - "maxFieldSize": 102400, - "maxKeySize": 1024, - "maxValuesPerField": 100, - "includeFields": ["dc:title", "dc:creator", "dc:subject"], - "excludeFields": ["pdf:unmappedUnicodeCharsPerPage"] - } + "parse-context": { + "standard-metadata-limiter-factory": { + "maxTotalBytes": 1048576, + "maxFieldSize": 102400, + "maxKeySize": 1024, + "maxValuesPerField": 100, + "includeFields": ["dc:title", "dc:creator", "dc:subject"], + "excludeFields": ["pdf:unmappedUnicodeCharsPerPage"] } } } diff --git a/docs/modules/ROOT/pages/advanced/spooling.adoc b/docs/modules/ROOT/pages/advanced/spooling.adoc index 81d3bb18e4..29b30bf297 100644 --- a/docs/modules/ROOT/pages/advanced/spooling.adoc +++ b/docs/modules/ROOT/pages/advanced/spooling.adoc @@ -159,12 +159,12 @@ The default spool types are: === JSON Configuration SpoolingStrategy can be configured via JSON in your `tika-config.json` file. -Place the configuration in the `other-configs` section: +Place the configuration in the `parse-context` section: [source,json] ---- { - "other-configs": { + "parse-context": { "spooling-strategy": { "spoolTypes": [ "application/zip", @@ -181,11 +181,8 @@ Load the configuration using `TikaLoader`: [source,java] ---- TikaLoader loader = TikaLoader.load(Path.of("tika-config.json")); -SpoolingStrategy strategy = loader.configs().load(SpoolingStrategy.class); - -// Add to parse context -ParseContext context = new ParseContext(); -context.set(SpoolingStrategy.class, strategy); +ParseContext context = loader.loadParseContext(); +// SpoolingStrategy is automatically loaded into the ParseContext ---- === Best Practices diff --git a/docs/modules/ROOT/pages/configuration/digesters.adoc b/docs/modules/ROOT/pages/configuration/digesters.adoc index f09deb8446..b3e66f1dad 100644 --- a/docs/modules/ROOT/pages/configuration/digesters.adoc +++ b/docs/modules/ROOT/pages/configuration/digesters.adoc @@ -33,7 +33,7 @@ Tika provides two digester implementations: == JSON Configuration -Configure digesters in the `other-configs.digester-factory` section of your tika-config.json. +Configure digesters in the `parse-context` section of your tika-config.json. === Basic Example with CommonsDigester @@ -43,15 +43,13 @@ This example configures multiple digest algorithms: [source,json] ---- { - "other-configs": { - "digester-factory": { - "commons-digester-factory": { - "digests": [ - { "algorithm": "MD5" }, - { "algorithm": "SHA256" }, - { "algorithm": "SHA512" } - ] - } + "parse-context": { + "commons-digester-factory": { + "digests": [ + { "algorithm": "MD5" }, + { "algorithm": "SHA256" }, + { "algorithm": "SHA512" } + ] } } } @@ -65,15 +63,13 @@ For SHA3 algorithms, use the BouncyCastle digester: [source,json] ---- { - "other-configs": { - "digester-factory": { - "bouncy-castle-digester-factory": { - "digests": [ - { "algorithm": "MD5" }, - { "algorithm": "SHA256" }, - { "algorithm": "SHA3_512" } - ] - } + "parse-context": { + "bouncy-castle-digester-factory": { + "digests": [ + { "algorithm": "MD5" }, + { "algorithm": "SHA256" }, + { "algorithm": "SHA3_512" } + ] } } } @@ -87,14 +83,12 @@ By default, digest values are encoded as lowercase hexadecimal. You can specify [source,json] ---- { - "other-configs": { - "digester-factory": { - "commons-digester-factory": { - "digests": [ - { "algorithm": "SHA256", "encoding": "BASE32" }, - { "algorithm": "MD5" } - ] - } + "parse-context": { + "commons-digester-factory": { + "digests": [ + { "algorithm": "SHA256", "encoding": "BASE32" }, + { "algorithm": "MD5" } + ] } } } @@ -112,14 +106,12 @@ to `true`: [source,json] ---- { - "other-configs": { - "digester-factory": { - "commons-digester-factory": { - "digests": [ - { "algorithm": "MD5" } - ], - "skipContainerDocumentDigest": true - } + "parse-context": { + "commons-digester-factory": { + "digests": [ + { "algorithm": "MD5" } + ], + "skipContainerDocumentDigest": true } } } diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java index 44c3c2b4a5..36b0f69325 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java @@ -24,7 +24,6 @@ import java.util.ArrayList; import java.util.HashSet; import java.util.List; import java.util.Map; -import java.util.Set; import java.util.concurrent.ConcurrentHashMap; import com.fasterxml.jackson.core.StreamReadConstraints; @@ -58,6 +57,8 @@ import org.apache.tika.serialization.ComponentConfig; import org.apache.tika.serialization.ComponentNameResolver; import org.apache.tika.serialization.JsonMetadata; import org.apache.tika.serialization.JsonMetadataList; +import org.apache.tika.serialization.ParseContextUtils; +import org.apache.tika.serialization.serdes.ParseContextDeserializer; /** * Main entry point for loading Tika components from JSON configuration. @@ -381,8 +382,9 @@ public class TikaLoader { /** * Loads and returns a ParseContext populated with components from the "parse-context" section. * <p> - * This method only loads explicitly configured items from the JSON configuration. - * For loading with defaults for missing items, use {@link #loadParseContextWithDefaults()}. + * This method deserializes the parse-context JSON and resolves all component references + * using the component registry. Components are looked up by their friendly names + * (e.g., "embedded-limits", "pdf-parser-config") and deserialized to their appropriate types. * <p> * Use this method when you need a pre-configured ParseContext for parsing operations. * @@ -399,106 +401,79 @@ public class TikaLoader { * @throws TikaConfigException if loading fails */ public ParseContext loadParseContext() throws TikaConfigException { - return loadParseContextInternal(false); + JsonNode parseContextNode = config.getRootNode().get("parse-context"); + if (parseContextNode == null) { + return new ParseContext(); + } + try { + ParseContext context = + ParseContextDeserializer.readParseContext(parseContextNode, objectMapper); + ParseContextUtils.resolveAll(context, classLoader); + return context; + } catch (IOException e) { + throw new TikaConfigException("Failed to load parse-context", e); + } } /** - * Loads and returns a ParseContext populated with components from the "parse-context" section, - * plus default implementations for any missing items. + * Loads a configuration object from the "parse-context" section, merging with defaults. + * <p> + * This method is useful when you have a base configuration (e.g., from code defaults or + * a previous load) and want to overlay values from the JSON config. Properties not + * specified in the JSON retain their default values. * <p> - * This method loads explicitly configured items from JSON, then instantiates - * default implementations (marked with {@code @TikaComponent(defaultFor=...)}) - * for any interface that wasn't explicitly configured. + * The original defaults object is NOT modified - a new instance is returned. + * + * <p>Example usage for PDFParserConfig: + * <pre> + * // Load base config from tika-config.json at init time + * TikaLoader loader = TikaLoader.load(configPath); + * PDFParserConfig baseConfig = loader.loadConfig(PDFParserConfig.class, new PDFParserConfig()); + * + * // At runtime, create per-request overrides + * PDFParserConfig requestConfig = new PDFParserConfig(); + * requestConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR); + * + * // Merge: base config values + request overrides + * // (Note: for runtime merging, use JsonMergeUtils directly or loadConfig on a runtime loader) + * </pre> * - * @return a ParseContext populated with configured and default components + * @param clazz the class to deserialize into + * @param defaults the default values to use for properties not in the JSON config + * @param <T> the configuration type + * @return a new instance with defaults merged with JSON config, or the original defaults if not configured * @throws TikaConfigException if loading fails */ - public ParseContext loadParseContextWithDefaults() throws TikaConfigException { - return loadParseContextInternal(true); + public <T> T loadConfig(Class<T> clazz, T defaults) throws TikaConfigException { + return configs().loadWithDefaults(clazz, defaults); } /** - * Internal method to load ParseContext with optional defaults. + * Loads a configuration object from the "parse-context" section by explicit key, merging with defaults. + * <p> + * This method is useful when the JSON key doesn't match the class name's kebab-case conversion, + * or when you want to load from a specific key. * - * @param includeDefaults whether to include default implementations for missing items - * @return a ParseContext populated with components + * @param key the JSON key in the "parse-context" section + * @param clazz the class to deserialize into + * @param defaults the default values to use for properties not in the JSON config + * @param <T> the configuration type + * @return a new instance with defaults merged with JSON config, or the original defaults if not configured * @throws TikaConfigException if loading fails */ - private ParseContext loadParseContextInternal(boolean includeDefaults) throws TikaConfigException { - ParseContext context = new ParseContext(); - Set<Class<?>> configuredKeys = new HashSet<>(); - - // Load the component registry for parse-context - ComponentRegistry registry; - try { - registry = new ComponentRegistry("parse-context", classLoader); - } catch (TikaConfigException e) { - // parse-context.idx might not exist yet (e.g., first build) - // In that case, just return an empty context - return context; - } - - // Load explicitly configured items from JSON - JsonNode parseContextNode = config.getRootNode().get("parse-context"); - if (parseContextNode != null && parseContextNode.isObject()) { - java.util.Iterator<String> fieldNames = parseContextNode.fieldNames(); - while (fieldNames.hasNext()) { - String key = fieldNames.next(); - JsonNode valueNode = parseContextNode.get(key); - - try { - ComponentInfo info = registry.getComponentInfo(key); - Class<?> targetClass = info.componentClass(); - Class<?> contextKey = info.contextKey() != null ? info.contextKey() : targetClass; - - Object instance = objectMapper.treeToValue(valueNode, targetClass); - context.set((Class<Object>) contextKey, instance); - configuredKeys.add(contextKey); - } catch (TikaConfigException e) { - throw new TikaConfigException("Failed to load parse-context item: " + key, e); - } catch (Exception e) { - throw new TikaConfigException("Failed to deserialize parse-context item: " + key, e); - } - } - } - - // Add defaults for missing items (if requested) - if (includeDefaults) { - for (Map.Entry<String, ComponentInfo> entry : registry.getDefaultComponents().entrySet()) { - ComponentInfo info = entry.getValue(); - Class<?> contextKey = info.contextKey() != null ? info.contextKey() : info.componentClass(); - - if (!configuredKeys.contains(contextKey)) { - try { - Object instance = info.componentClass().getDeclaredConstructor().newInstance(); - context.set((Class<Object>) contextKey, instance); - } catch (ReflectiveOperationException e) { - throw new TikaConfigException( - "Failed to instantiate default component: " + info.componentClass().getName(), e); - } - } - } - } - - return context; + public <T> T loadConfig(String key, Class<T> clazz, T defaults) throws TikaConfigException { + return configs().loadWithDefaults(key, clazz, defaults); } /** * Returns a ConfigLoader for loading simple configuration objects. * <p> - * Use this for POJOs and simple config classes. For complex components like - * Parsers, Detectors, etc., use the specific load methods on TikaLoader. - * - * <p>Usage: - * <pre> - * MyConfig config = loader.configs().load("my-config", MyConfig.class); - * // Or use kebab-case auto-conversion: - * MyConfig config = loader.configs().load(MyConfig.class); - * </pre> + * This is internal - external code should use {@link #loadParseContext()} or + * {@link #loadConfig(Class, Object)} instead. * * @return the ConfigLoader instance */ - public synchronized ConfigLoader configs() { + private synchronized ConfigLoader configs() { if (configLoader == null) { configLoader = new ConfigLoader(config, objectMapper); } diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java index 2e2b4dec7d..3cc05f3272 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java @@ -20,7 +20,9 @@ import static org.apache.tika.serialization.serdes.ParseContextSerializer.PARSE_ import static org.apache.tika.serialization.serdes.ParseContextSerializer.TYPED; import java.io.IOException; +import java.util.HashMap; import java.util.Iterator; +import java.util.Map; import java.util.Optional; import com.fasterxml.jackson.core.JsonParser; @@ -75,11 +77,15 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { * <p> * The "typed" section is deserialized directly to typed objects in the context map. * All other fields are stored as JSON config strings for lazy resolution. + * <p> + * Duplicate detection is performed within a single document: if multiple entries + * resolve to the same context key (e.g., both "bouncy-castle-digester" and + * "commons-digester" resolve to DigesterFactory), an IOException is thrown. * * @param jsonNode the JSON node containing the ParseContext data * @param mapper the ObjectMapper for deserializing typed objects * @return the deserialized ParseContext - * @throws IOException if deserialization fails + * @throws IOException if deserialization fails or duplicate context keys are detected */ public static ParseContext readParseContext(JsonNode jsonNode, ObjectMapper mapper) throws IOException { @@ -95,6 +101,10 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { return parseContext; } + // Track context keys to detect duplicates within this document + // Maps contextKey -> friendlyName for error messages + Map<Class<?>, String> seenContextKeys = new HashMap<>(); + Iterator<String> fieldNames = contextNode.fieldNames(); while (fieldNames.hasNext()) { String name = fieldNames.next(); @@ -102,8 +112,11 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { if (TYPED.equals(name)) { // Deserialize typed objects directly to context map - deserializeTypedObjects(value, parseContext, mapper); + deserializeTypedObjects(value, parseContext, mapper, seenContextKeys); } else { + // Check for duplicate context key before storing + checkForDuplicateContextKey(name, seenContextKeys); + // Store as JSON config for lazy resolution // Use plain JSON mapper since the main mapper may be binary (Smile) String json = JSON_MAPPER.writeValueAsString(value); @@ -114,12 +127,49 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { return parseContext; } + /** + * Checks if a JSON config entry would create a duplicate context key. + * <p> + * Looks up the friendly name in the component registry to determine its context key, + * then checks if that key has already been seen in this document. + * + * @param friendlyName the friendly name of the config entry + * @param seenContextKeys map of already-seen context keys to their friendly names + * @throws IOException if a duplicate context key is detected + */ + private static void checkForDuplicateContextKey(String friendlyName, + Map<Class<?>, String> seenContextKeys) + throws IOException { + Optional<ComponentInfo> infoOpt = ComponentNameResolver.getComponentInfo(friendlyName); + if (infoOpt.isEmpty()) { + // Not a registered component - can't check for duplicates, that's okay + return; + } + + ComponentInfo info = infoOpt.get(); + Class<?> contextKey = info.contextKey() != null ? info.contextKey() : info.componentClass(); + + String existingName = seenContextKeys.get(contextKey); + if (existingName != null) { + throw new IOException("Duplicate parse-context entries resolve to the same key " + + contextKey.getName() + ": '" + existingName + "' and '" + friendlyName + "'"); + } + seenContextKeys.put(contextKey, friendlyName); + } + /** * Deserializes the "typed" section into typed objects in the context map. + * + * @param typedNode the JSON node containing typed objects + * @param parseContext the ParseContext to add objects to + * @param mapper the ObjectMapper for deserializing + * @param seenContextKeys map tracking context keys to their friendly names (for duplicate detection) + * @throws IOException if deserialization fails or duplicate context keys are detected */ @SuppressWarnings("unchecked") private static void deserializeTypedObjects(JsonNode typedNode, ParseContext parseContext, - ObjectMapper mapper) throws IOException { + ObjectMapper mapper, + Map<Class<?>, String> seenContextKeys) throws IOException { if (!typedNode.isObject()) { return; } @@ -158,6 +208,14 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { // Use contextKey if available, otherwise use the config class itself Class<?> parseContextKey = (contextKeyClass != null) ? contextKeyClass : configClass; + // Check for duplicate context key + String existingName = seenContextKeys.get(parseContextKey); + if (existingName != null) { + throw new IOException("Duplicate parse-context entries resolve to the same key " + + parseContextKey.getName() + ": '" + existingName + "' and '" + componentName + "'"); + } + seenContextKeys.put(parseContextKey, componentName); + // Deserialize and add to context try { Object config = mapper.treeToValue(configNode, configClass); diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java index cf73ad8129..643d76f5c2 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java @@ -17,7 +17,9 @@ package org.apache.tika.serialization.serdes; import java.io.IOException; +import java.util.HashSet; import java.util.Map; +import java.util.Set; import com.fasterxml.jackson.core.JsonGenerator; import com.fasterxml.jackson.databind.JsonSerializer; @@ -62,6 +64,10 @@ public class ParseContextSerializer extends JsonSerializer<ParseContext> { SerializerProvider serializers) throws IOException { gen.writeStartObject(); + // Track which friendly names have been serialized under "typed" + // so we can skip them when serializing jsonConfigs (avoid duplicates) + Set<String> serializedNames = new HashSet<>(); + // First, serialize typed objects from the context map under "typed" key Map<String, Object> contextMap = parseContext.getContextMap(); boolean hasTypedObjects = false; @@ -94,6 +100,9 @@ public class ParseContextSerializer extends JsonSerializer<ParseContext> { // Use writeTree instead of writeRawValue for binary format support (e.g., Smile) // and stricter validation (fails early if value can't be serialized) gen.writeTree(PLAIN_MAPPER.valueToTree(value)); + + // Track this name so we skip it in jsonConfigs + serializedNames.add(keyName); } if (hasTypedObjects) { @@ -101,8 +110,13 @@ public class ParseContextSerializer extends JsonSerializer<ParseContext> { } // Then, serialize JSON configs at the top level + // Skip entries that were already serialized under "typed" (they've been resolved) Map<String, JsonConfig> jsonConfigs = parseContext.getJsonConfigs(); for (Map.Entry<String, JsonConfig> entry : jsonConfigs.entrySet()) { + if (serializedNames.contains(entry.getKey())) { + // Already serialized under "typed", skip to avoid duplicate + continue; + } gen.writeFieldName(entry.getKey()); // Parse the JSON string into a tree for binary format support gen.writeTree(PLAIN_MAPPER.readTree(entry.getValue().json())); diff --git a/tika-serialization/src/test/java/org/apache/tika/config/AllLimitsTest.java b/tika-serialization/src/test/java/org/apache/tika/config/AllLimitsTest.java index 2dd10b532f..b509d03bab 100644 --- a/tika-serialization/src/test/java/org/apache/tika/config/AllLimitsTest.java +++ b/tika-serialization/src/test/java/org/apache/tika/config/AllLimitsTest.java @@ -105,17 +105,18 @@ public class AllLimitsTest extends TikaTest { @Test public void testLoadIndividualLimits() throws Exception { TikaLoader loader = TikaLoader.load(getConfigPath(getClass(), "all-limits-test.json")); + ParseContext context = loader.loadParseContext(); - // Load individual limit configs directly - EmbeddedLimits embeddedLimits = loader.configs().load(EmbeddedLimits.class); + // Load individual limit configs from ParseContext + EmbeddedLimits embeddedLimits = context.get(EmbeddedLimits.class); assertNotNull(embeddedLimits); assertEquals(10, embeddedLimits.getMaxDepth()); - OutputLimits outputLimits = loader.configs().load(OutputLimits.class); + OutputLimits outputLimits = context.get(OutputLimits.class); assertNotNull(outputLimits); assertEquals(100000, outputLimits.getWriteLimit()); - TimeoutLimits timeoutLimits = loader.configs().load(TimeoutLimits.class); + TimeoutLimits timeoutLimits = context.get(TimeoutLimits.class); assertNotNull(timeoutLimits); assertEquals(60000, timeoutLimits.getTaskTimeoutMillis()); } diff --git a/tika-serialization/src/test/java/org/apache/tika/config/EmbeddedLimitsTest.java b/tika-serialization/src/test/java/org/apache/tika/config/EmbeddedLimitsTest.java index f362271fb3..4bb4873701 100644 --- a/tika-serialization/src/test/java/org/apache/tika/config/EmbeddedLimitsTest.java +++ b/tika-serialization/src/test/java/org/apache/tika/config/EmbeddedLimitsTest.java @@ -32,7 +32,8 @@ public class EmbeddedLimitsTest extends TikaTest { @Test public void testLoadFromConfig() throws Exception { TikaLoader loader = TikaLoader.load(getConfigPath(getClass(), "embedded-limits-test.json")); - EmbeddedLimits limits = loader.configs().load(EmbeddedLimits.class); + ParseContext context = loader.loadParseContext(); + EmbeddedLimits limits = context.get(EmbeddedLimits.class); assertNotNull(limits); assertEquals(5, limits.getMaxDepth()); diff --git a/tika-serialization/src/test/java/org/apache/tika/config/OutputLimitsTest.java b/tika-serialization/src/test/java/org/apache/tika/config/OutputLimitsTest.java index 6e543a085d..41e23cd570 100644 --- a/tika-serialization/src/test/java/org/apache/tika/config/OutputLimitsTest.java +++ b/tika-serialization/src/test/java/org/apache/tika/config/OutputLimitsTest.java @@ -32,7 +32,8 @@ public class OutputLimitsTest extends TikaTest { @Test public void testLoadFromConfig() throws Exception { TikaLoader loader = TikaLoader.load(getConfigPath(getClass(), "output-limits-test.json")); - OutputLimits limits = loader.configs().load(OutputLimits.class); + ParseContext context = loader.loadParseContext(); + OutputLimits limits = context.get(OutputLimits.class); assertNotNull(limits); assertEquals(50000, limits.getWriteLimit()); diff --git a/tika-serialization/src/test/java/org/apache/tika/config/TimeoutLimitsTest.java b/tika-serialization/src/test/java/org/apache/tika/config/TimeoutLimitsTest.java index 24c15af7ab..4217349134 100644 --- a/tika-serialization/src/test/java/org/apache/tika/config/TimeoutLimitsTest.java +++ b/tika-serialization/src/test/java/org/apache/tika/config/TimeoutLimitsTest.java @@ -32,7 +32,8 @@ public class TimeoutLimitsTest extends TikaTest { @Test public void testLoadFromConfig() throws Exception { TikaLoader loader = TikaLoader.load(getConfigPath(getClass(), "timeout-limits-test.json")); - TimeoutLimits limits = loader.configs().load(TimeoutLimits.class); + ParseContext context = loader.loadParseContext(); + TimeoutLimits limits = context.get(TimeoutLimits.class); assertNotNull(limits); assertEquals(120000, limits.getTaskTimeoutMillis()); diff --git a/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java b/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java index 12695472c8..5efa4136ac 100644 --- a/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java +++ b/tika-serialization/src/test/java/org/apache/tika/config/loader/ConfigLoaderTest.java @@ -27,6 +27,7 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import java.nio.file.Path; import java.nio.file.Paths; +import com.fasterxml.jackson.databind.ObjectMapper; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -39,15 +40,16 @@ import org.apache.tika.mime.MediaType; */ public class ConfigLoaderTest { - private TikaLoader tikaLoader; + private TikaJsonConfig tikaJsonConfig; private ConfigLoader configLoader; @BeforeEach public void setUp() throws Exception { Path configPath = Paths.get( getClass().getResource("/configs/test-config-loader.json").toURI()); - tikaLoader = TikaLoader.load(configPath); - configLoader = tikaLoader.configs(); + tikaJsonConfig = TikaJsonConfig.load(configPath); + ObjectMapper objectMapper = TikaObjectMapperFactory.getMapper(); + configLoader = new ConfigLoader(tikaJsonConfig, objectMapper); } // ==================== Test POJOs ==================== @@ -286,10 +288,11 @@ public class ConfigLoaderTest { // because Jackson can't instantiate interfaces directly Path configPath = Paths.get( getClass().getResource("/configs/test-interface-no-type.json").toURI()); - TikaLoader loader = TikaLoader.load(configPath); + TikaJsonConfig config = TikaJsonConfig.load(configPath); + ConfigLoader loader = new ConfigLoader(config, TikaObjectMapperFactory.getMapper()); TikaConfigException ex = assertThrows(TikaConfigException.class, () -> - loader.configs().load("handler-no-type", TestHandler.class)); + loader.load("handler-no-type", TestHandler.class)); assertTrue(ex.getMessage().contains("Failed to deserialize")); } @@ -337,10 +340,11 @@ public class ConfigLoaderTest { public void testLoadInvalidClassName() throws Exception { Path configPath = Paths.get( getClass().getResource("/configs/test-invalid-class.json").toURI()); - TikaLoader loader = TikaLoader.load(configPath); + TikaJsonConfig config = TikaJsonConfig.load(configPath); + ConfigLoader loader = new ConfigLoader(config, TikaObjectMapperFactory.getMapper()); TikaConfigException ex = assertThrows(TikaConfigException.class, () -> - loader.configs().load("handler", TestHandler.class)); + loader.load("handler", TestHandler.class)); assertTrue(ex.getMessage().contains("Class not found")); } @@ -350,10 +354,11 @@ public class ConfigLoaderTest { // String class name that doesn't implement the interface Path configPath = Paths.get( getClass().getResource("/configs/test-wrong-type.json").toURI()); - TikaLoader loader = TikaLoader.load(configPath); + TikaJsonConfig config = TikaJsonConfig.load(configPath); + ConfigLoader loader = new ConfigLoader(config, TikaObjectMapperFactory.getMapper()); TikaConfigException ex = assertThrows(TikaConfigException.class, () -> - loader.configs().load("handler", TestHandler.class)); + loader.load("handler", TestHandler.class)); assertTrue(ex.getMessage().contains("not assignable")); } @@ -363,10 +368,11 @@ public class ConfigLoaderTest { // Verify that unexpected/unrecognized fields cause an exception Path configPath = Paths.get( getClass().getResource("/configs/test-unexpected-field.json").toURI()); - TikaLoader loader = TikaLoader.load(configPath); + TikaJsonConfig config = TikaJsonConfig.load(configPath); + ConfigLoader loader = new ConfigLoader(config, TikaObjectMapperFactory.getMapper()); TikaConfigException ex = assertThrows(TikaConfigException.class, () -> - loader.configs().load("retry-config", RetryConfig.class)); + loader.load("retry-config", RetryConfig.class)); // Should contain information about the unrecognized field assertTrue(ex.getMessage().contains("retry-config") || @@ -407,7 +413,8 @@ public class ConfigLoaderTest { // Load config that merges defaults with partial JSON Path configPath = Paths.get( getClass().getResource("/configs/test-partial-config.json").toURI()); - TikaLoader loader = TikaLoader.load(configPath); + TikaJsonConfig config = TikaJsonConfig.load(configPath); + ConfigLoader loader = new ConfigLoader(config, TikaObjectMapperFactory.getMapper()); // Set up defaults RetryConfig defaults = new RetryConfig(); @@ -416,14 +423,14 @@ public class ConfigLoaderTest { defaults.setEnabled(false); // JSON only has: { "enabled": true } - RetryConfig config = loader.configs().loadWithDefaults("retry-config", + RetryConfig result = loader.loadWithDefaults("retry-config", RetryConfig.class, defaults); - assertNotNull(config); - assertEquals(30000, config.getTimeout()); // ✅ From defaults - assertEquals(2, config.getRetries()); // ✅ From defaults - assertTrue(config.isEnabled()); // ✅ From JSON (overridden) + assertNotNull(result); + assertEquals(30000, result.getTimeout()); // ✅ From defaults + assertEquals(2, result.getRetries()); // ✅ From defaults + assertTrue(result.isEnabled()); // ✅ From JSON (overridden) } @Test @@ -431,7 +438,8 @@ public class ConfigLoaderTest { // Test that JSON can override all defaults Path configPath = Paths.get( getClass().getResource("/configs/test-partial-config.json").toURI()); - TikaLoader loader = TikaLoader.load(configPath); + TikaJsonConfig config = TikaJsonConfig.load(configPath); + ConfigLoader loader = new ConfigLoader(config, TikaObjectMapperFactory.getMapper()); RetryConfig defaults = new RetryConfig(); defaults.setTimeout(30000); @@ -439,14 +447,14 @@ public class ConfigLoaderTest { defaults.setEnabled(false); // JSON has: { "timeout": 10000, "retries": 5, "enabled": false } - RetryConfig config = loader.configs().loadWithDefaults("retry-config-full", + RetryConfig result = loader.loadWithDefaults("retry-config-full", RetryConfig.class, defaults); - assertNotNull(config); - assertEquals(10000, config.getTimeout()); // All overridden - assertEquals(5, config.getRetries()); - assertFalse(config.isEnabled()); + assertNotNull(result); + assertEquals(10000, result.getTimeout()); // All overridden + assertEquals(5, result.getRetries()); + assertFalse(result.isEnabled()); } @Test @@ -472,7 +480,8 @@ public class ConfigLoaderTest { // Test the class-name version Path configPath = Paths.get( getClass().getResource("/configs/test-partial-config.json").toURI()); - TikaLoader loader = TikaLoader.load(configPath); + TikaJsonConfig config = TikaJsonConfig.load(configPath); + ConfigLoader loader = new ConfigLoader(config, TikaObjectMapperFactory.getMapper()); RetryConfig defaults = new RetryConfig(); defaults.setTimeout(30000); @@ -480,12 +489,12 @@ public class ConfigLoaderTest { defaults.setEnabled(false); // Uses kebab-case: RetryConfig -> "retry-config" - RetryConfig config = loader.configs().loadWithDefaults(RetryConfig.class, defaults); + RetryConfig result = loader.loadWithDefaults(RetryConfig.class, defaults); - assertNotNull(config); - assertEquals(30000, config.getTimeout()); - assertEquals(2, config.getRetries()); - assertTrue(config.isEnabled()); // Overridden from JSON + assertNotNull(result); + assertEquals(30000, result.getTimeout()); + assertEquals(2, result.getRetries()); + assertTrue(result.isEnabled()); // Overridden from JSON } @Test @@ -493,7 +502,8 @@ public class ConfigLoaderTest { // Demonstrate difference between load() and loadWithDefaults() Path configPath = Paths.get( getClass().getResource("/configs/test-partial-config.json").toURI()); - TikaLoader loader = TikaLoader.load(configPath); + TikaJsonConfig config = TikaJsonConfig.load(configPath); + ConfigLoader loader = new ConfigLoader(config, TikaObjectMapperFactory.getMapper()); RetryConfig defaults = new RetryConfig(); defaults.setTimeout(30000); @@ -501,13 +511,13 @@ public class ConfigLoaderTest { defaults.setEnabled(false); // Using load() - creates new object, loses defaults - RetryConfig config1 = loader.configs().load("retry-config", RetryConfig.class); + RetryConfig config1 = loader.load("retry-config", RetryConfig.class); assertEquals(0, config1.getTimeout()); // ❌ Lost default! assertEquals(0, config1.getRetries()); // ❌ Lost default! assertTrue(config1.isEnabled()); // ✅ From JSON // Using loadWithDefaults() - merges into defaults - RetryConfig config2 = loader.configs().loadWithDefaults("retry-config", + RetryConfig config2 = loader.loadWithDefaults("retry-config", RetryConfig.class, defaults); assertEquals(30000, config2.getTimeout()); // ✅ Kept default! @@ -522,7 +532,8 @@ public class ConfigLoaderTest { // Verify that the original defaults object is NOT modified Path configPath = Paths.get( getClass().getResource("/configs/test-partial-config.json").toURI()); - TikaLoader loader = TikaLoader.load(configPath); + TikaJsonConfig config = TikaJsonConfig.load(configPath); + ConfigLoader loader = new ConfigLoader(config, TikaObjectMapperFactory.getMapper()); RetryConfig defaults = new RetryConfig(); defaults.setTimeout(30000); @@ -530,7 +541,7 @@ public class ConfigLoaderTest { defaults.setEnabled(false); // Load config with partial override (JSON only has "enabled": true) - RetryConfig result = loader.configs().loadWithDefaults("retry-config", + RetryConfig result = loader.loadWithDefaults("retry-config", RetryConfig.class, defaults); @@ -555,7 +566,8 @@ public class ConfigLoaderTest { // Verify defaults can be safely reused for multiple loads Path configPath = Paths.get( getClass().getResource("/configs/test-partial-config.json").toURI()); - TikaLoader loader = TikaLoader.load(configPath); + TikaJsonConfig config = TikaJsonConfig.load(configPath); + ConfigLoader loader = new ConfigLoader(config, TikaObjectMapperFactory.getMapper()); RetryConfig defaults = new RetryConfig(); defaults.setTimeout(30000); @@ -563,10 +575,10 @@ public class ConfigLoaderTest { defaults.setEnabled(false); // Load multiple times with same defaults - RetryConfig config1 = loader.configs().loadWithDefaults("retry-config", + RetryConfig config1 = loader.loadWithDefaults("retry-config", RetryConfig.class, defaults); - RetryConfig config2 = loader.configs().loadWithDefaults("retry-config-full", + RetryConfig config2 = loader.loadWithDefaults("retry-config-full", RetryConfig.class, defaults); @@ -580,7 +592,7 @@ public class ConfigLoaderTest { assertFalse(defaults.isEnabled()); // Use defaults one more time - RetryConfig config3 = loader.configs().loadWithDefaults("non-existent", + RetryConfig config3 = loader.loadWithDefaults("non-existent", RetryConfig.class, defaults); assertEquals(defaults, config3); // Should return original when key missing @@ -591,13 +603,14 @@ public class ConfigLoaderTest { // Test with nested/complex objects to ensure deep copy works Path configPath = Paths.get( getClass().getResource("/configs/test-partial-config.json").toURI()); - TikaLoader loader = TikaLoader.load(configPath); + TikaJsonConfig config = TikaJsonConfig.load(configPath); + ConfigLoader loader = new ConfigLoader(config, TikaObjectMapperFactory.getMapper()); TikaTaskTimeout defaults = new TikaTaskTimeout(); defaults.setMillis(60000); // Note: tika-task-timeout in JSON has millis: 30000 - TikaTaskTimeout result = loader.configs().loadWithDefaults("tika-task-timeout", + TikaTaskTimeout result = loader.loadWithDefaults("tika-task-timeout", TikaTaskTimeout.class, defaults); @@ -632,7 +645,8 @@ public class ConfigLoaderTest { // Demonstrate that defaults can be safely shared across threads Path configPath = Paths.get( getClass().getResource("/configs/test-partial-config.json").toURI()); - TikaLoader loader = TikaLoader.load(configPath); + TikaJsonConfig config = TikaJsonConfig.load(configPath); + ConfigLoader loader = new ConfigLoader(config, TikaObjectMapperFactory.getMapper()); // Shared defaults object RetryConfig sharedDefaults = new RetryConfig(); @@ -641,10 +655,10 @@ public class ConfigLoaderTest { sharedDefaults.setEnabled(false); // Simulate concurrent usage (not a real concurrency test, just demonstrates safety) - RetryConfig result1 = loader.configs().loadWithDefaults("retry-config", + RetryConfig result1 = loader.loadWithDefaults("retry-config", RetryConfig.class, sharedDefaults); - RetryConfig result2 = loader.configs().loadWithDefaults("retry-config-full", + RetryConfig result2 = loader.loadWithDefaults("retry-config-full", RetryConfig.class, sharedDefaults); diff --git a/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java index 5b8e44c788..403464d9c4 100644 --- a/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java +++ b/tika-serialization/src/test/java/org/apache/tika/config/loader/TikaLoaderTest.java @@ -30,6 +30,7 @@ import org.junit.jupiter.api.Disabled; import org.junit.jupiter.api.Test; import org.xml.sax.helpers.DefaultHandler; +import org.apache.tika.config.EmbeddedLimits; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; @@ -322,6 +323,52 @@ public class TikaLoaderTest { "Should NOT support application/test+optin (opt-in only, not in SPI)"); } + @Test + public void testLoadConfigWithDefaults() throws Exception { + // Test the loadConfig method that merges JSON config with defaults + URL configUrl = getClass().getResource("/configs/embedded-limits-test.json"); + Path configPath = Path.of(configUrl.toURI()); + + TikaLoader loader = TikaLoader.load(configPath); + + // Create defaults - some values will be overridden by JSON, others kept + EmbeddedLimits defaults = new EmbeddedLimits(); + // Default values from EmbeddedLimits: maxDepth=UNLIMITED, maxCount=UNLIMITED, throwOnMax*=false + + // Load with defaults - JSON has: maxDepth=5, throwOnMaxDepth=true, maxCount=100, throwOnMaxCount=false + EmbeddedLimits config = loader.loadConfig(EmbeddedLimits.class, defaults); + + assertNotNull(config, "Config should not be null"); + assertEquals(5, config.getMaxDepth(), "maxDepth should be from JSON"); + assertTrue(config.isThrowOnMaxDepth(), "throwOnMaxDepth should be from JSON"); + assertEquals(100, config.getMaxCount(), "maxCount should be from JSON"); + assertFalse(config.isThrowOnMaxCount(), "throwOnMaxCount should be from JSON"); + + // Verify original defaults object was NOT modified + assertEquals(EmbeddedLimits.UNLIMITED, defaults.getMaxDepth(), "Original defaults should be unchanged"); + } + + @Test + public void testLoadConfigMissingKeyReturnsDefaults() throws Exception { + // Test that loadConfig returns defaults when key is not in config + URL configUrl = getClass().getResource("/configs/test-loader-config.json"); + Path configPath = Path.of(configUrl.toURI()); + + TikaLoader loader = TikaLoader.load(configPath); + + // Create defaults + EmbeddedLimits defaults = new EmbeddedLimits(10, true, 500, false); + + // Load with defaults - this config doesn't have embedded-limits + EmbeddedLimits config = loader.loadConfig(EmbeddedLimits.class, defaults); + + // Should return the defaults since key is missing + assertEquals(10, config.getMaxDepth(), "Should return defaults when key missing"); + assertTrue(config.isThrowOnMaxDepth(), "Should return defaults when key missing"); + assertEquals(500, config.getMaxCount(), "Should return defaults when key missing"); + assertFalse(config.isThrowOnMaxCount(), "Should return defaults when key missing"); + } + // TODO: TIKA-SERIALIZATION-FOLLOWUP - Jackson may need configuration to fail on unknown properties @Disabled("TIKA-SERIALIZATION-FOLLOWUP") @Test diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java index dcd99f1b01..f22ae013ce 100644 --- a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java +++ b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java @@ -19,6 +19,7 @@ package org.apache.tika.serialization; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.StringWriter; @@ -429,4 +430,67 @@ public class TestParseContextSerialization { assertEquals(10000, basicFactory.getWriteLimit()); assertFalse(basicFactory.isThrowOnWriteLimitReached()); } + + /** + * Test that duplicate context keys within a single JSON document are detected and rejected. + * Both BasicContentHandlerFactory and UppercasingContentHandlerFactory resolve to + * ContentHandlerFactory.class as their context key, so configuring both should fail. + */ + @Test + public void testDuplicateContextKeyDetection() throws Exception { + // Both of these resolve to ContentHandlerFactory.class as the context key + String json = """ + { + "basic-content-handler-factory": { + "type": "XML", + "writeLimit": 50000 + }, + "uppercasing-content-handler-factory": {} + } + """; + + ObjectMapper mapper = createMapper(); + + // Should throw an exception due to duplicate context key + Exception ex = assertThrows(Exception.class, () -> + mapper.readValue(json, ParseContext.class)); + + // Verify the error message mentions the duplicate + assertTrue(ex.getMessage().contains("Duplicate") || + (ex.getCause() != null && ex.getCause().getMessage().contains("Duplicate")), + "Exception should mention duplicate context key: " + ex.getMessage()); + assertTrue(ex.getMessage().contains("ContentHandlerFactory") || + (ex.getCause() != null && ex.getCause().getMessage().contains("ContentHandlerFactory")), + "Exception should mention the conflicting key: " + ex.getMessage()); + } + + /** + * Test that a single component per context key is allowed (no false positives). + */ + @Test + public void testNoDuplicateWhenDifferentContextKeys() throws Exception { + // These have different context keys, so both should be allowed + String json = """ + { + "basic-content-handler-factory": { + "type": "TEXT", + "writeLimit": 10000 + }, + "skip-embedded-document-selector": {} + } + """; + + ObjectMapper mapper = createMapper(); + ParseContext deserialized = mapper.readValue(json, ParseContext.class); + + // Both should be present as JSON configs + assertTrue(deserialized.hasJsonConfig("basic-content-handler-factory")); + assertTrue(deserialized.hasJsonConfig("skip-embedded-document-selector")); + + // Resolve and verify both work + ParseContextUtils.resolveAll(deserialized, Thread.currentThread().getContextClassLoader()); + + assertNotNull(deserialized.get(ContentHandlerFactory.class)); + assertNotNull(deserialized.get(DocumentSelector.class)); + } }
