This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4585-simplify-serialization in repository https://gitbox.apache.org/repos/asf/tika.git
commit 47df95adc04778a99e0ed7272889e8dac268f70f Author: tallison <[email protected]> AuthorDate: Sat Dec 20 15:48:41 2025 -0500 TIKA-4585 -- further progress - WIP --- .../java/org/apache/tika/parser/ParseContext.java | 10 ++- tika-pipes/tika-pipes-core/pom.xml | 5 ++ .../serialization/FetchEmitTupleDeserializer.java | 4 +- .../tika/serialization/ParseContextUtils.java | 2 +- .../serdes/ParseContextDeserializer.java | 75 ++++++++++++++++++---- .../serdes/ParseContextSerializer.java | 57 ++++++++++++++-- 6 files changed, 131 insertions(+), 22 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java index db4a0e157..137fa9bca 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java @@ -134,7 +134,15 @@ public class ParseContext implements Serializable { * @since Apache Tika 4.0 */ public void setJsonConfig(String name, String json) { - setJsonConfig(name, json != null ? () -> json : null); + setJsonConfig(name, json != null ? new StringJsonConfig(json) : null); + } + + /** + * A simple Serializable implementation of JsonConfig that holds a JSON string. + * This is used internally to ensure JSON configs can be serialized via Java serialization. + */ + private record StringJsonConfig(String json) implements JsonConfig, Serializable { + private static final long serialVersionUID = 1L; } /** diff --git a/tika-pipes/tika-pipes-core/pom.xml b/tika-pipes/tika-pipes-core/pom.xml index b5c873221..d3b5fe2f9 100644 --- a/tika-pipes/tika-pipes-core/pom.xml +++ b/tika-pipes/tika-pipes-core/pom.xml @@ -32,6 +32,11 @@ <url>https://tika.apache.org/</url> <dependencies> + <dependency> + <groupId>${project.groupId}</groupId> + <artifactId>tika-annotation-processor</artifactId> + <version>${project.version}</version> + </dependency> <dependency> <groupId>${project.groupId}</groupId> <artifactId>tika-pipes-api</artifactId> diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/FetchEmitTupleDeserializer.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/FetchEmitTupleDeserializer.java index be76398d7..a1d531f5f 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/FetchEmitTupleDeserializer.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/FetchEmitTupleDeserializer.java @@ -36,6 +36,7 @@ import com.fasterxml.jackson.core.JsonParser; import com.fasterxml.jackson.databind.DeserializationContext; import com.fasterxml.jackson.databind.JsonDeserializer; import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; @@ -50,6 +51,7 @@ public class FetchEmitTupleDeserializer extends JsonDeserializer<FetchEmitTuple> @Override public FetchEmitTuple deserialize(JsonParser jsonParser, DeserializationContext deserializationContext) throws IOException, JacksonException { JsonNode root = jsonParser.readValueAsTree(); + ObjectMapper mapper = (ObjectMapper) jsonParser.getCodec(); String id = readVal(ID, root, null, true); String fetcherId = readVal(FETCHER, root, null, true); @@ -60,7 +62,7 @@ public class FetchEmitTupleDeserializer extends JsonDeserializer<FetchEmitTuple> long fetchRangeEnd = readLong(FETCH_RANGE_END, root, -1l, false); Metadata metadata = readMetadata(root); JsonNode parseContextNode = root.get(PARSE_CONTEXT); - ParseContext parseContext = parseContextNode == null ? new ParseContext() : ParseContextDeserializer.readParseContext(parseContextNode); + ParseContext parseContext = parseContextNode == null ? new ParseContext() : ParseContextDeserializer.readParseContext(parseContextNode, mapper); // Resolve all friendly-named components from jsonConfigs to actual objects ParseContextUtils.resolveAll(parseContext, FetchEmitTupleDeserializer.class.getClassLoader()); FetchEmitTuple.ON_PARSE_EXCEPTION onParseException = readOnParseException(root); diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java index d31d5cd50..e530c1951 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java @@ -152,7 +152,7 @@ public class ParseContextUtils { // Determine the context key Class<?> contextKey = determineContextKey(info, friendlyName); - // Deserialize and cache + // Deserialize and cache in resolvedConfigs, also add to context Object instance = MAPPER.readValue(jsonConfig.json(), info.componentClass()); context.setResolvedConfig(friendlyName, instance); context.set((Class) contextKey, instance); diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java index 2e566f4fa..3839ce57a 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java @@ -17,6 +17,7 @@ package org.apache.tika.serialization.serdes; import static org.apache.tika.serialization.serdes.ParseContextSerializer.PARSE_CONTEXT; +import static org.apache.tika.serialization.serdes.ParseContextSerializer.TYPED; import java.io.IOException; import java.util.Iterator; @@ -26,26 +27,35 @@ import com.fasterxml.jackson.databind.DeserializationContext; import com.fasterxml.jackson.databind.JsonDeserializer; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import org.apache.tika.parser.ParseContext; -import org.apache.tika.serialization.ConfigDeserializer; +import org.apache.tika.serialization.ComponentNameResolver; /** * Deserializes ParseContext from JSON. * <p> - * Each field in the JSON object is stored as a JSON config in the ParseContext. - * Resolution to typed objects happens later via {@link ConfigDeserializer}. + * Handles two types of entries: + * <ul> + * <li>"typed" section: Deserialized directly to typed objects in the context map</li> + * <li>Other entries: Stored as JSON configs for lazy resolution</li> + * </ul> * <p> * Example input: * <pre> * { - * "pdf-parser": {"ocrStrategy": "AUTO"}, - * "handler-config": {"type": "XML", "parseMode": "RMETA"} + * "typed": { + * "handler-config": {"type": "XML", "parseMode": "RMETA"} + * }, + * "metadata-filters": ["mock-upper-case-filter"] * } * </pre> */ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { + private static final Logger LOG = LoggerFactory.getLogger(ParseContextDeserializer.class); + @Override public ParseContext deserialize(JsonParser jsonParser, DeserializationContext ctxt) throws IOException { @@ -56,12 +66,12 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { /** * Deserializes a ParseContext from a JsonNode. * <p> - * Each field is stored as a JSON config string in the ParseContext's jsonConfigs map. - * The configs can later be resolved to typed objects via {@link ConfigDeserializer}. + * The "typed" section is deserialized directly to typed objects in the context map. + * All other fields are stored as JSON config strings for lazy resolution. * * @param jsonNode the JSON node containing the ParseContext data - * @param mapper the ObjectMapper for serializing field values back to JSON strings - * @return the deserialized ParseContext with jsonConfigs populated + * @param mapper the ObjectMapper for deserializing typed objects + * @return the deserialized ParseContext * @throws IOException if deserialization fails */ public static ParseContext readParseContext(JsonNode jsonNode, ObjectMapper mapper) @@ -78,16 +88,55 @@ public class ParseContextDeserializer extends JsonDeserializer<ParseContext> { return parseContext; } - // Store each field as a JSON config Iterator<String> fieldNames = contextNode.fieldNames(); while (fieldNames.hasNext()) { String name = fieldNames.next(); JsonNode value = contextNode.get(name); - // Store the JSON string for later resolution - String json = mapper.writeValueAsString(value); - parseContext.setJsonConfig(name, json); + + if (TYPED.equals(name)) { + // Deserialize typed objects directly to context map + deserializeTypedObjects(value, parseContext, mapper); + } else { + // Store as JSON config for lazy resolution + String json = mapper.writeValueAsString(value); + parseContext.setJsonConfig(name, json); + } } return parseContext; } + + /** + * Deserializes the "typed" section into typed objects in the context map. + */ + @SuppressWarnings("unchecked") + private static void deserializeTypedObjects(JsonNode typedNode, ParseContext parseContext, + ObjectMapper mapper) throws IOException { + if (!typedNode.isObject()) { + return; + } + + Iterator<String> fieldNames = typedNode.fieldNames(); + while (fieldNames.hasNext()) { + String componentName = fieldNames.next(); + JsonNode configNode = typedNode.get(componentName); + + try { + // Look up the class for this component name + Class<?> configClass = ComponentNameResolver.resolveClass( + componentName, ParseContextDeserializer.class.getClassLoader()); + + // Deserialize and add to context + Object config = mapper.treeToValue(configNode, configClass); + parseContext.set((Class) configClass, config); + + LOG.debug("Deserialized typed object '{}' -> {}", componentName, configClass.getName()); + } catch (ClassNotFoundException e) { + LOG.warn("Could not find class for typed component '{}', storing as JSON config", + componentName); + // Fall back to storing as JSON config + parseContext.setJsonConfig(componentName, mapper.writeValueAsString(configNode)); + } + } + } } diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java index f0648e5f8..0e966f046 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java @@ -21,42 +21,87 @@ import java.util.Map; import com.fasterxml.jackson.core.JsonGenerator; import com.fasterxml.jackson.databind.JsonSerializer; +import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.SerializerProvider; import org.apache.tika.config.JsonConfig; import org.apache.tika.parser.ParseContext; +import org.apache.tika.serialization.ComponentNameResolver; /** * Serializes ParseContext to JSON. * <p> - * Serializes the jsonConfigs map directly - each entry becomes a field in the output. - * The resolvedConfigs cache is intentionally ignored to preserve round-trip fidelity. + * Typed objects from the context map are serialized under a "typed" key. + * JSON configs are serialized at the top level. * <p> * Example output: * <pre> * { - * "pdf-parser": {"ocrStrategy": "AUTO"}, - * "handler-config": {"type": "XML", "parseMode": "RMETA"} + * "typed": { + * "handler-config": {"type": "XML", "parseMode": "RMETA"} + * }, + * "metadata-filters": ["mock-upper-case-filter"] * } * </pre> */ public class ParseContextSerializer extends JsonSerializer<ParseContext> { public static final String PARSE_CONTEXT = "parseContext"; + public static final String TYPED = "typed"; @Override public void serialize(ParseContext parseContext, JsonGenerator gen, SerializerProvider serializers) throws IOException { gen.writeStartObject(); - // Serialize all JSON configs - this is the source of truth for round-trip + ObjectMapper mapper = (ObjectMapper) gen.getCodec(); + + // First, serialize typed objects from the context map under "typed" key + Map<String, Object> contextMap = parseContext.getContextMap(); + boolean hasTypedObjects = false; + + for (Map.Entry<String, Object> entry : contextMap.entrySet()) { + String className = entry.getKey(); + String componentName = findComponentName(className); + if (componentName != null) { + if (!hasTypedObjects) { + gen.writeFieldName(TYPED); + gen.writeStartObject(); + hasTypedObjects = true; + } + gen.writeFieldName(componentName); + gen.writeRawValue(mapper.writeValueAsString(entry.getValue())); + } + } + + if (hasTypedObjects) { + gen.writeEndObject(); + } + + // Then, serialize JSON configs at the top level Map<String, JsonConfig> jsonConfigs = parseContext.getJsonConfigs(); for (Map.Entry<String, JsonConfig> entry : jsonConfigs.entrySet()) { gen.writeFieldName(entry.getKey()); - // Write the JSON config as raw value (it's already valid JSON) gen.writeRawValue(entry.getValue().json()); } gen.writeEndObject(); } + + /** + * Finds the component name for a class. + * Uses ComponentNameResolver for registry lookup. Only classes registered + * in a component registry will be serialized. + * + * @param className the fully qualified class name + * @return the component name, or null if not registered + */ + private String findComponentName(String className) { + try { + Class<?> clazz = Class.forName(className); + return ComponentNameResolver.getFriendlyName(clazz); + } catch (ClassNotFoundException e) { + return null; + } + } }
