This is an automated email from the ASF dual-hosted git repository. ndipiazza pushed a commit to branch TIKA-4851-revert in repository https://gitbox.apache.org/repos/asf/tika.git
commit 15d62208e54599173255b1a9bf03873904473ef9 Author: Nicholas DiPiazza <[email protected]> AuthorDate: Fri Dec 19 13:38:25 2025 -0600 Revert "TIKA-4581 - Round trip pojos that exist in the registry in the ParseContext (#2463)" This reverts commit 2972d0fcd69c3d4de24fccaebc09de304efaf558. --- .../main/resources/META-INF/tika/other-configs.idx | 5 - tika-pipes/tika-pipes-core/pom.xml | 13 --- .../extractor/EmbeddedDocumentBytesConfig.java | 3 - .../tika/config/loader/ComponentRegistry.java | 36 ++++++- .../tika/serialization/ComponentNameResolver.java | 36 ------- .../serialization/ParseContextDeserializer.java | 45 +++++++-- .../tika/serialization/ParseContextSerializer.java | 50 +++++---- .../tika/serialization/TikaAbstractTypeMixins.java | 112 --------------------- .../TestParseContextSerialization.java | 23 +---- 9 files changed, 102 insertions(+), 221 deletions(-) diff --git a/tika-pipes/tika-pipes-api/src/main/resources/META-INF/tika/other-configs.idx b/tika-pipes/tika-pipes-api/src/main/resources/META-INF/tika/other-configs.idx deleted file mode 100644 index 0c9f7d254..000000000 --- a/tika-pipes/tika-pipes-api/src/main/resources/META-INF/tika/other-configs.idx +++ /dev/null @@ -1,5 +0,0 @@ -# Component registry for tika-pipes-api -# Format: friendly-name=fully.qualified.ClassName -# this has to be manually generated for now because of the dependency graph - -handler-config=org.apache.tika.pipes.api.HandlerConfig diff --git a/tika-pipes/tika-pipes-core/pom.xml b/tika-pipes/tika-pipes-core/pom.xml index b5c873221..fbebcdf3a 100644 --- a/tika-pipes/tika-pipes-core/pom.xml +++ b/tika-pipes/tika-pipes-core/pom.xml @@ -98,19 +98,6 @@ </archive> </configuration> </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-compiler-plugin</artifactId> - <configuration> - <annotationProcessorPaths> - <path> - <groupId>org.apache.tika</groupId> - <artifactId>tika-annotation-processor</artifactId> - <version>${project.version}</version> - </path> - </annotationProcessorPaths> - </configuration> - </plugin> </plugins> </build> </project> diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/EmbeddedDocumentBytesConfig.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/EmbeddedDocumentBytesConfig.java index c02b78067..6a449b5bf 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/EmbeddedDocumentBytesConfig.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/extractor/EmbeddedDocumentBytesConfig.java @@ -19,9 +19,6 @@ package org.apache.tika.pipes.core.extractor; import java.io.Serializable; import java.util.Objects; -import org.apache.tika.config.TikaComponent; - -@TikaComponent(name = "embedded-document-bytes-config") public class EmbeddedDocumentBytesConfig implements Serializable { /** diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java index 700d93761..ce3593f0c 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/ComponentRegistry.java @@ -43,11 +43,27 @@ import org.apache.tika.exception.TikaConfigException; * <li>Optional explicit context key for ParseContext</li> * </ul> * <p> - * Modules that can't use @TikaComponent (due to dependency constraints) can provide - * their own META-INF/tika/*.idx files to register components. + * Also includes built-in aliases for external dependencies that cannot be + * annotated with @TikaComponent. */ public class ComponentRegistry { + /** + * Built-in aliases for external dependencies. + * Maps component names to fully qualified class names. + */ + private static final Map<String, String> BUILTIN_ALIASES = createBuiltinAliases(); + + private static Map<String, String> createBuiltinAliases() { + Map<String, String> aliases = new HashMap<>(); + // HandlerConfig is in tika-pipes-api which can't depend on tika-core for @TikaComponent + aliases.put("handler-config", "org.apache.tika.pipes.api.HandlerConfig"); + // EmbeddedDocumentBytesConfig is in tika-pipes-core which can't depend on tika-core for @TikaComponent + aliases.put("embedded-document-bytes-config", + "org.apache.tika.pipes.core.extractor.EmbeddedDocumentBytesConfig"); + return Collections.unmodifiableMap(aliases); + } + private final Map<String, ComponentInfo> components; private final Map<Class<?>, String> classToName; // Reverse lookup private final ClassLoader classLoader; @@ -149,9 +165,25 @@ public class ComponentRegistry { throw new TikaConfigException("Failed to load component index: " + resourcePath, e); } + // Load built-in aliases for external dependencies + loadBuiltinAliases(result); + return result; } + private void loadBuiltinAliases(Map<String, ComponentInfo> result) { + for (Map.Entry<String, String> alias : BUILTIN_ALIASES.entrySet()) { + try { + Class<?> clazz = Class.forName(alias.getValue(), false, classLoader); + boolean selfConfiguring = SelfConfiguring.class.isAssignableFrom(clazz); + result.put(alias.getKey(), new ComponentInfo(clazz, selfConfiguring, null)); + } catch (ClassNotFoundException e) { + // External dependency not on classpath - skip this alias + // This is expected behavior, not an error + } + } + } + private void loadFromUrl(URL url, Map<String, ComponentInfo> result) throws TikaConfigException { try (InputStream in = url.openStream(); BufferedReader reader = new BufferedReader( diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java index 0b7d9a700..739ed9944 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ComponentNameResolver.java @@ -17,10 +17,8 @@ package org.apache.tika.serialization; import java.util.Map; -import java.util.Optional; import java.util.concurrent.ConcurrentHashMap; -import org.apache.tika.config.loader.ComponentInfo; import org.apache.tika.config.loader.ComponentRegistry; import org.apache.tika.exception.TikaConfigException; @@ -87,38 +85,4 @@ public final class ComponentNameResolver { } return null; } - - /** - * Checks if a component with the given name is registered in any registry. - * - * @param name the component name to check - * @return true if the component is registered - */ - public static boolean hasComponent(String name) { - for (ComponentRegistry registry : REGISTRIES.values()) { - if (registry.hasComponent(name)) { - return true; - } - } - return false; - } - - /** - * Gets the component info for a given friendly name. - * - * @param name the friendly name to look up - * @return Optional containing the ComponentInfo, or empty if not found - */ - public static Optional<ComponentInfo> getComponentInfo(String name) { - for (ComponentRegistry registry : REGISTRIES.values()) { - if (registry.hasComponent(name)) { - try { - return Optional.of(registry.getComponentInfo(name)); - } catch (TikaConfigException e) { - // continue to next registry - } - } - } - return Optional.empty(); - } } diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java index d57578677..d5d9fc601 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextDeserializer.java @@ -20,7 +20,6 @@ import static org.apache.tika.serialization.ParseContextSerializer.PARSE_CONTEXT import java.io.IOException; import java.util.Iterator; -import java.util.Optional; import com.fasterxml.jackson.core.JacksonException; import com.fasterxml.jackson.core.JsonParser; @@ -34,7 +33,9 @@ import org.slf4j.LoggerFactory; import org.apache.tika.config.ConfigContainer; import org.apache.tika.config.SelfConfiguring; import org.apache.tika.config.loader.ComponentInfo; +import org.apache.tika.config.loader.ComponentRegistry; import org.apache.tika.config.loader.TikaObjectMapperFactory; +import org.apache.tika.exception.TikaConfigException; import org.apache.tika.parser.ParseContext; /** @@ -56,6 +57,25 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { private static final Logger LOG = LoggerFactory.getLogger(ParseContextDeserializer.class); private static final ObjectMapper MAPPER = TikaObjectMapperFactory.getMapper(); + // Lazily loaded registry for looking up friendly names + private static volatile ComponentRegistry registry; + + private static ComponentRegistry getRegistry() { + if (registry == null) { + synchronized (ParseContextDeserializer.class) { + if (registry == null) { + try { + registry = new ComponentRegistry("other-configs", + ParseContextDeserializer.class.getClassLoader()); + } catch (TikaConfigException e) { + LOG.warn("Failed to load component registry for deserialization", e); + } + } + } + } + return registry; + } + @Override public ParseContext deserialize(JsonParser jsonParser, DeserializationContext deserializationContext) @@ -108,19 +128,22 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { } // If not found as FQCN, check registry for friendly name - // Use ComponentNameResolver to ensure consistency with TikaObjectMapperFactory's registries boolean isSelfConfiguring = false; Class<?> contextKey = null; // The key to use when adding to ParseContext if (keyClass == null) { - Optional<ComponentInfo> infoOpt = ComponentNameResolver.getComponentInfo(fieldName); - if (infoOpt.isPresent()) { - ComponentInfo info = infoOpt.get(); - keyClass = info.componentClass(); - isSelfConfiguring = info.selfConfiguring(); - contextKey = info.contextKey(); - LOG.debug("Resolved friendly name '{}' to class {} (selfConfiguring={}, contextKey={})", - fieldName, keyClass.getName(), isSelfConfiguring, - contextKey != null ? contextKey.getName() : "null"); + ComponentRegistry reg = getRegistry(); + if (reg != null && reg.hasComponent(fieldName)) { + try { + ComponentInfo info = reg.getComponentInfo(fieldName); + keyClass = info.componentClass(); + isSelfConfiguring = info.selfConfiguring(); + contextKey = info.contextKey(); + LOG.debug("Resolved friendly name '{}' to class {} (selfConfiguring={}, contextKey={})", + fieldName, keyClass.getName(), isSelfConfiguring, + contextKey != null ? contextKey.getName() : "null"); + } catch (TikaConfigException e) { + LOG.debug("Failed to get component info for '{}': {}", fieldName, e.getMessage()); + } } } else { // For FQCN resolution, check SelfConfiguring directly diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java index e07559fab..bca2ef54a 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextSerializer.java @@ -29,7 +29,9 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.tika.config.ConfigContainer; +import org.apache.tika.config.loader.ComponentRegistry; import org.apache.tika.config.loader.TikaObjectMapperFactory; +import org.apache.tika.exception.TikaConfigException; import org.apache.tika.parser.ParseContext; /** @@ -55,12 +57,27 @@ public class ParseContextSerializer extends JsonSerializer<ParseContext> { private static final Logger LOG = LoggerFactory.getLogger(ParseContextSerializer.class); public static final String PARSE_CONTEXT = "parseContext"; - // Full mapper with polymorphic type handling (includes WrapperObjectSerializer) private static final ObjectMapper MAPPER = TikaObjectMapperFactory.getMapper(); - // Plain mapper without WrapperObjectSerializer - for types with friendly names - // where the wrapper is added at the field name level by this serializer - private static final ObjectMapper PLAIN_MAPPER = new ObjectMapper(); + // Lazily loaded registry for looking up friendly names + private static volatile ComponentRegistry registry; + + private static ComponentRegistry getRegistry() { + if (registry == null) { + synchronized (ParseContextSerializer.class) { + if (registry == null) { + try { + registry = new ComponentRegistry("other-configs", + ParseContextSerializer.class.getClassLoader()); + } catch (TikaConfigException e) { + LOG.warn("Failed to load component registry for serialization", e); + // Return null - objects without friendly names won't be serialized + } + } + } + } + return registry; + } @Override public void serialize(ParseContext parseContext, JsonGenerator jsonGenerator, @@ -81,6 +98,7 @@ public class ParseContextSerializer extends JsonSerializer<ParseContext> { // Then, serialize objects from ParseContext that have registered friendly names // or are stored under Tika type keys (for polymorphic custom subclasses) + ComponentRegistry reg = getRegistry(); Map<String, Object> contextMap = parseContext.getContextMap(); for (Map.Entry<String, Object> entry : contextMap.entrySet()) { // Skip ConfigContainer - already handled above @@ -94,8 +112,7 @@ public class ParseContextSerializer extends JsonSerializer<ParseContext> { } // Try to get friendly name for this object's class - // Use ComponentNameResolver to ensure consistency with TikaObjectMapperFactory's registries - String friendlyName = ComponentNameResolver.getFriendlyName(value.getClass()); + String friendlyName = (reg != null) ? reg.getFriendlyName(value.getClass()) : null; // Determine key: prefer friendly name, fall back to FQCN for Tika types String key; @@ -112,18 +129,17 @@ public class ParseContextSerializer extends JsonSerializer<ParseContext> { if (!writtenKeys.contains(key)) { jsonGenerator.writeFieldName(key); - if (friendlyName != null) { - // Type has friendly name - use plain mapper to write properties directly - // (key already serves as the type identifier) - PLAIN_MAPPER.writeValue(jsonGenerator, value); - } else { - // No friendly name - add wrapper with FQCN and use MAPPER for - // polymorphic type handling of nested types - jsonGenerator.writeStartObject(); - jsonGenerator.writeFieldName(value.getClass().getName()); - MAPPER.writeValue(jsonGenerator, value); - jsonGenerator.writeEndObject(); + // Write wrapper object format with type info for polymorphic deserialization + // Format: {"concrete-class-name": {properties...}} + jsonGenerator.writeStartObject(); + String typeName = (friendlyName != null) ? friendlyName : + ComponentNameResolver.getFriendlyName(value.getClass()); + if (typeName == null) { + typeName = value.getClass().getName(); } + jsonGenerator.writeFieldName(typeName); + MAPPER.writeValue(jsonGenerator, value); + jsonGenerator.writeEndObject(); writtenKeys.add(key); } } diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaAbstractTypeMixins.java b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaAbstractTypeMixins.java index 2a11b0e76..7c68042aa 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/TikaAbstractTypeMixins.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/TikaAbstractTypeMixins.java @@ -19,7 +19,6 @@ package org.apache.tika.serialization; import java.io.IOException; import java.lang.reflect.Modifier; -import com.fasterxml.jackson.core.JsonGenerator; import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.databind.BeanDescription; import com.fasterxml.jackson.databind.DeserializationConfig; @@ -27,13 +26,9 @@ import com.fasterxml.jackson.databind.DeserializationContext; import com.fasterxml.jackson.databind.JsonDeserializer; import com.fasterxml.jackson.databind.JsonMappingException; import com.fasterxml.jackson.databind.JsonNode; -import com.fasterxml.jackson.databind.JsonSerializer; import com.fasterxml.jackson.databind.ObjectMapper; -import com.fasterxml.jackson.databind.SerializationConfig; -import com.fasterxml.jackson.databind.SerializerProvider; import com.fasterxml.jackson.databind.deser.BeanDeserializerModifier; import com.fasterxml.jackson.databind.module.SimpleModule; -import com.fasterxml.jackson.databind.ser.BeanSerializerModifier; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -69,15 +64,12 @@ public final class TikaAbstractTypeMixins { /** * Registers the abstract type handling module on the given ObjectMapper. - * This includes both serializers (to add type wrappers) and deserializers - * (to resolve type wrappers). * * @param mapper the ObjectMapper to configure */ public static void registerDeserializers(ObjectMapper mapper) { SimpleModule module = new SimpleModule("TikaAbstractTypes"); module.setDeserializerModifier(new AbstractTypeDeserializerModifier(mapper)); - module.setSerializerModifier(new AbstractTypeSerializerModifier(mapper)); mapper.registerModule(module); } @@ -206,108 +198,4 @@ public final class TikaAbstractTypeMixins { } } } - - /** - * Modifier that intercepts serialization of values declared as abstract types - * and wraps them with type information. - */ - private static class AbstractTypeSerializerModifier extends BeanSerializerModifier { - - private final ObjectMapper mapper; - - AbstractTypeSerializerModifier(ObjectMapper mapper) { - this.mapper = mapper; - } - - @Override - public JsonSerializer<?> modifySerializer(SerializationConfig config, - BeanDescription beanDesc, - JsonSerializer<?> serializer) { - Class<?> beanClass = beanDesc.getBeanClass(); - - // Skip types that shouldn't use wrapper format - if (shouldSkip(beanClass)) { - return serializer; - } - - // For concrete Tika types, wrap with type name if they extend/implement an abstract type - // This ensures polymorphic types in lists get properly wrapped - if (isTikaPolymorphicType(beanClass)) { - LOG.debug("Registering wrapper serializer for polymorphic type: {}", - beanClass.getName()); - return new WrapperObjectSerializer<>(serializer, mapper); - } - - return serializer; - } - - private boolean shouldSkip(Class<?> beanClass) { - // Skip primitives and their wrappers - if (beanClass.isPrimitive()) { - return true; - } - - // Skip common JDK types - String name = beanClass.getName(); - if (name.startsWith("java.") || name.startsWith("javax.")) { - return true; - } - - // Skip arrays - if (beanClass.isArray()) { - return true; - } - - // Skip abstract types (we want to wrap concrete implementations, not the abstract types themselves) - if (beanClass.isInterface() || Modifier.isAbstract(beanClass.getModifiers())) { - return true; - } - - return false; - } - - /** - * Checks if this class should be wrapped with type information during serialization. - * Only types registered in the component registry are wrapped - this excludes - * container types (like CompositeMetadataFilter) that are not in the registry. - */ - private boolean isTikaPolymorphicType(Class<?> beanClass) { - // Only wrap types that have a registered friendly name in the registry - return ComponentNameResolver.getFriendlyName(beanClass) != null; - } - } - - /** - * Serializer that wraps objects with their type name. - * Output format: {"type-name": {...properties...}} - */ - private static class WrapperObjectSerializer<T> extends JsonSerializer<T> { - - private final JsonSerializer<T> delegate; - private final ObjectMapper mapper; - - @SuppressWarnings("unchecked") - WrapperObjectSerializer(JsonSerializer<?> delegate, ObjectMapper mapper) { - this.delegate = (JsonSerializer<T>) delegate; - this.mapper = mapper; - } - - @Override - public void serialize(T value, JsonGenerator gen, SerializerProvider serializers) - throws IOException { - if (value == null) { - gen.writeNull(); - return; - } - - // Get the friendly name (guaranteed to exist since we only wrap registered types) - String typeName = ComponentNameResolver.getFriendlyName(value.getClass()); - - // Write wrapper: {"type-name": {...}} - gen.writeStartObject(); - gen.writeFieldName(typeName); - delegate.serialize(value, gen, serializers); - gen.writeEndObject(); - } - } } diff --git a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java index 5292ece26..3b06f4079 100644 --- a/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java +++ b/tika-serialization/src/test/java/org/apache/tika/serialization/TestParseContextSerialization.java @@ -23,7 +23,6 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.StringWriter; import java.io.Writer; -import java.util.List; import com.fasterxml.jackson.core.JsonGenerator; import com.fasterxml.jackson.databind.JsonNode; @@ -39,7 +38,6 @@ import org.apache.tika.extractor.SkipEmbeddedDocumentSelector; import org.apache.tika.metadata.filter.AttachmentCountingListFilter; import org.apache.tika.metadata.filter.CompositeMetadataFilter; import org.apache.tika.metadata.filter.MetadataFilter; -import org.apache.tika.metadata.filter.MockUpperCaseFilter; import org.apache.tika.parser.ParseContext; /** @@ -302,7 +300,7 @@ public class TestParseContextSerialization { } @Test - public void testMetadataListConfigContainer() throws Exception { + public void testMetadataList() throws Exception { ConfigContainer configContainer = new ConfigContainer(); configContainer.set("metadata-filters", """ [ @@ -324,25 +322,6 @@ public class TestParseContextSerialization { assertEquals(AttachmentCountingListFilter.class, deserFilter.getFilters().get(0).getClass()); } - - @Test - public void testMetadataListPOJO() throws Exception { - CompositeMetadataFilter metadataFilter = new CompositeMetadataFilter(List.of(new AttachmentCountingListFilter(), new MockUpperCaseFilter())); - - ParseContext parseContext = new ParseContext(); - parseContext.set(MetadataFilter.class, metadataFilter); - - ObjectMapper mapper = createMapper(); - String json = mapper.writeValueAsString(parseContext); - - ParseContext deser = mapper.readValue(json, ParseContext.class); - MetadataFilter resolvedFilter = deser.get(MetadataFilter.class); - assertNotNull(resolvedFilter, "MetadataFilter should be resolved"); - assertEquals(CompositeMetadataFilter.class, resolvedFilter.getClass()); - CompositeMetadataFilter deserFilter = (CompositeMetadataFilter) resolvedFilter; - assertEquals(AttachmentCountingListFilter.class, deserFilter.getFilters().get(0).getClass()); - } - @Test public void testContextKeyDeserialization() throws Exception { // Test that components with @TikaComponent(contextKey=...) are stored
