This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4585-simplify-serialization
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 47df95adc04778a99e0ed7272889e8dac268f70f
Author: tallison <[email protected]>
AuthorDate: Sat Dec 20 15:48:41 2025 -0500

    TIKA-4585 -- further progress - WIP
---
 .../java/org/apache/tika/parser/ParseContext.java  | 10 ++-
 tika-pipes/tika-pipes-core/pom.xml                 |  5 ++
 .../serialization/FetchEmitTupleDeserializer.java  |  4 +-
 .../tika/serialization/ParseContextUtils.java      |  2 +-
 .../serdes/ParseContextDeserializer.java           | 75 ++++++++++++++++++----
 .../serdes/ParseContextSerializer.java             | 57 ++++++++++++++--
 6 files changed, 131 insertions(+), 22 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java 
b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
index db4a0e157..137fa9bca 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java
@@ -134,7 +134,15 @@ public class ParseContext implements Serializable {
      * @since Apache Tika 4.0
      */
     public void setJsonConfig(String name, String json) {
-        setJsonConfig(name, json != null ? () -> json : null);
+        setJsonConfig(name, json != null ? new StringJsonConfig(json) : null);
+    }
+
+    /**
+     * A simple Serializable implementation of JsonConfig that holds a JSON string.
+     * This is used internally to ensure JSON configs can be serialized via Java serialization.
+     * The record's generated {@code json()} accessor returns the wrapped string unchanged.
+     */
+    private record StringJsonConfig(String json) implements JsonConfig, 
Serializable {
+        private static final long serialVersionUID = 1L;
     }
 
     /**
diff --git a/tika-pipes/tika-pipes-core/pom.xml 
b/tika-pipes/tika-pipes-core/pom.xml
index b5c873221..d3b5fe2f9 100644
--- a/tika-pipes/tika-pipes-core/pom.xml
+++ b/tika-pipes/tika-pipes-core/pom.xml
@@ -32,6 +32,11 @@
   <url>https://tika.apache.org/</url>
 
   <dependencies>
+    <dependency>
+      <groupId>${project.groupId}</groupId>
+      <artifactId>tika-annotation-processor</artifactId>
+      <version>${project.version}</version>
+    </dependency>
     <dependency>
       <groupId>${project.groupId}</groupId>
       <artifactId>tika-pipes-api</artifactId>
diff --git 
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/FetchEmitTupleDeserializer.java
 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/FetchEmitTupleDeserializer.java
index be76398d7..a1d531f5f 100644
--- 
a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/FetchEmitTupleDeserializer.java
+++ 
b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/serialization/FetchEmitTupleDeserializer.java
@@ -36,6 +36,7 @@ import com.fasterxml.jackson.core.JsonParser;
 import com.fasterxml.jackson.databind.DeserializationContext;
 import com.fasterxml.jackson.databind.JsonDeserializer;
 import com.fasterxml.jackson.databind.JsonNode;
+import com.fasterxml.jackson.databind.ObjectMapper;
 
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
@@ -50,6 +51,7 @@ public class FetchEmitTupleDeserializer extends 
JsonDeserializer<FetchEmitTuple>
     @Override
     public FetchEmitTuple deserialize(JsonParser jsonParser, 
DeserializationContext deserializationContext) throws IOException, 
JacksonException {
         JsonNode root = jsonParser.readValueAsTree();
+        ObjectMapper mapper = (ObjectMapper) jsonParser.getCodec();
 
         String id = readVal(ID, root, null, true);
         String fetcherId = readVal(FETCHER, root, null, true);
@@ -60,7 +62,7 @@ public class FetchEmitTupleDeserializer extends 
JsonDeserializer<FetchEmitTuple>
         long fetchRangeEnd = readLong(FETCH_RANGE_END, root, -1l, false);
         Metadata metadata = readMetadata(root);
         JsonNode parseContextNode = root.get(PARSE_CONTEXT);
-        ParseContext parseContext = parseContextNode == null ? new 
ParseContext() : ParseContextDeserializer.readParseContext(parseContextNode);
+        ParseContext parseContext = parseContextNode == null ? new 
ParseContext() : ParseContextDeserializer.readParseContext(parseContextNode, 
mapper);
         // Resolve all friendly-named components from jsonConfigs to actual 
objects
         ParseContextUtils.resolveAll(parseContext, 
FetchEmitTupleDeserializer.class.getClassLoader());
         FetchEmitTuple.ON_PARSE_EXCEPTION onParseException = 
readOnParseException(root);
diff --git 
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
 
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
index d31d5cd50..e530c1951 100644
--- 
a/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
+++ 
b/tika-serialization/src/main/java/org/apache/tika/serialization/ParseContextUtils.java
@@ -152,7 +152,7 @@ public class ParseContextUtils {
                     // Determine the context key
                     Class<?> contextKey = determineContextKey(info, 
friendlyName);
 
-                    // Deserialize and cache
+                    // Deserialize and cache in resolvedConfigs, also add to 
context
                     Object instance = MAPPER.readValue(jsonConfig.json(), 
info.componentClass());
                     context.setResolvedConfig(friendlyName, instance);
                     context.set((Class) contextKey, instance);
diff --git 
a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
 
b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
index 2e566f4fa..3839ce57a 100644
--- 
a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
+++ 
b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextDeserializer.java
@@ -17,6 +17,7 @@
 package org.apache.tika.serialization.serdes;
 
 import static 
org.apache.tika.serialization.serdes.ParseContextSerializer.PARSE_CONTEXT;
+import static 
org.apache.tika.serialization.serdes.ParseContextSerializer.TYPED;
 
 import java.io.IOException;
 import java.util.Iterator;
@@ -26,26 +27,35 @@ import 
com.fasterxml.jackson.databind.DeserializationContext;
 import com.fasterxml.jackson.databind.JsonDeserializer;
 import com.fasterxml.jackson.databind.JsonNode;
 import com.fasterxml.jackson.databind.ObjectMapper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
 import org.apache.tika.parser.ParseContext;
-import org.apache.tika.serialization.ConfigDeserializer;
+import org.apache.tika.serialization.ComponentNameResolver;
 
 /**
  * Deserializes ParseContext from JSON.
  * <p>
- * Each field in the JSON object is stored as a JSON config in the 
ParseContext.
- * Resolution to typed objects happens later via {@link ConfigDeserializer}.
+ * Handles two types of entries:
+ * <ul>
+ *   <li>"typed" section: Deserialized directly to typed objects in the 
context map</li>
+ *   <li>Other entries: Stored as JSON configs for lazy resolution</li>
+ * </ul>
  * <p>
  * Example input:
  * <pre>
  * {
- *   "pdf-parser": {"ocrStrategy": "AUTO"},
- *   "handler-config": {"type": "XML", "parseMode": "RMETA"}
+ *   "typed": {
+ *     "handler-config": {"type": "XML", "parseMode": "RMETA"}
+ *   },
+ *   "metadata-filters": ["mock-upper-case-filter"]
  * }
  * </pre>
  */
 public class ParseContextDeserializer extends JsonDeserializer<ParseContext> {
 
+    private static final Logger LOG = 
LoggerFactory.getLogger(ParseContextDeserializer.class);
+
     @Override
     public ParseContext deserialize(JsonParser jsonParser, 
DeserializationContext ctxt)
             throws IOException {
@@ -56,12 +66,12 @@ public class ParseContextDeserializer extends 
JsonDeserializer<ParseContext> {
     /**
      * Deserializes a ParseContext from a JsonNode.
      * <p>
-     * Each field is stored as a JSON config string in the ParseContext's 
jsonConfigs map.
-     * The configs can later be resolved to typed objects via {@link 
ConfigDeserializer}.
+     * The "typed" section is deserialized directly to typed objects in the 
context map.
+     * All other fields are stored as JSON config strings for lazy resolution.
      *
      * @param jsonNode the JSON node containing the ParseContext data
-     * @param mapper   the ObjectMapper for serializing field values back to 
JSON strings
-     * @return the deserialized ParseContext with jsonConfigs populated
+     * @param mapper   the ObjectMapper for deserializing typed objects
+     * @return the deserialized ParseContext
      * @throws IOException if deserialization fails
      */
     public static ParseContext readParseContext(JsonNode jsonNode, 
ObjectMapper mapper)
@@ -78,16 +88,55 @@ public class ParseContextDeserializer extends 
JsonDeserializer<ParseContext> {
             return parseContext;
         }
 
-        // Store each field as a JSON config
         Iterator<String> fieldNames = contextNode.fieldNames();
         while (fieldNames.hasNext()) {
             String name = fieldNames.next();
             JsonNode value = contextNode.get(name);
-            // Store the JSON string for later resolution
-            String json = mapper.writeValueAsString(value);
-            parseContext.setJsonConfig(name, json);
+
+            if (TYPED.equals(name)) {
+                // Deserialize typed objects directly to context map
+                deserializeTypedObjects(value, parseContext, mapper);
+            } else {
+                // Store as JSON config for lazy resolution
+                String json = mapper.writeValueAsString(value);
+                parseContext.setJsonConfig(name, json);
+            }
         }
 
         return parseContext;
     }
+
+    /**
+     * Deserializes the "typed" section into typed objects in the context map.
+     * <p>
+     * Every field name of {@code typedNode} is treated as a component name and passed to
+     * {@code ComponentNameResolver.resolveClass} together with this class's class loader.
+     * The field value is then bound to the resolved class with {@code mapper.treeToValue}
+     * and stored in the context using that same resolved class as the key.
+     * <p>
+     * If the resolver throws {@link ClassNotFoundException}, a warning is logged and the
+     * entry is stored as a JSON config under the same name instead.
+     * A {@code typedNode} that is not a JSON object is ignored without any logging.
+     *
+     * @param typedNode    the value found under the "typed" field
+     * @param parseContext the context that receives the typed objects (or fallback JSON configs)
+     * @param mapper       used to bind each value to its class and to re-serialize unresolved values
+     * @throws IOException propagated from the mapper when binding or re-serialization fails
+     */
+    @SuppressWarnings("unchecked")
+    private static void deserializeTypedObjects(JsonNode typedNode, 
ParseContext parseContext,
+                                                 ObjectMapper mapper) throws 
IOException {
+        if (!typedNode.isObject()) {
+            return;
+        }
+
+        Iterator<String> fieldNames = typedNode.fieldNames();
+        while (fieldNames.hasNext()) {
+            String componentName = fieldNames.next();
+            JsonNode configNode = typedNode.get(componentName);
+
+            try {
+                // Resolve the component name to a class; an unknown name surfaces as ClassNotFoundException
+                Class<?> configClass = ComponentNameResolver.resolveClass(
+                        componentName, 
ParseContextDeserializer.class.getClassLoader());
+
+                // Bind the JSON to that class and register the instance keyed by the resolved class (raw cast needed for the generic setter)
+                Object config = mapper.treeToValue(configNode, configClass);
+                parseContext.set((Class) configClass, config);
+
+                LOG.debug("Deserialized typed object '{}' -> {}", 
componentName, configClass.getName());
+            } catch (ClassNotFoundException e) {
+                LOG.warn("Could not find class for typed component '{}', 
storing as JSON config",
+                        componentName);
+                // Fall back: keep the entry's JSON text as a JSON config under the same name
+                parseContext.setJsonConfig(componentName, 
mapper.writeValueAsString(configNode));
+            }
+        }
+    }
 }
diff --git 
a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java
 
b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java
index f0648e5f8..0e966f046 100644
--- 
a/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java
+++ 
b/tika-serialization/src/main/java/org/apache/tika/serialization/serdes/ParseContextSerializer.java
@@ -21,42 +21,87 @@ import java.util.Map;
 
 import com.fasterxml.jackson.core.JsonGenerator;
 import com.fasterxml.jackson.databind.JsonSerializer;
+import com.fasterxml.jackson.databind.ObjectMapper;
 import com.fasterxml.jackson.databind.SerializerProvider;
 
 import org.apache.tika.config.JsonConfig;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.serialization.ComponentNameResolver;
 
 /**
  * Serializes ParseContext to JSON.
  * <p>
- * Serializes the jsonConfigs map directly - each entry becomes a field in the 
output.
- * The resolvedConfigs cache is intentionally ignored to preserve round-trip 
fidelity.
+ * Typed objects from the context map are serialized under a "typed" key.
+ * JSON configs are serialized at the top level.
  * <p>
  * Example output:
  * <pre>
  * {
- *   "pdf-parser": {"ocrStrategy": "AUTO"},
- *   "handler-config": {"type": "XML", "parseMode": "RMETA"}
+ *   "typed": {
+ *     "handler-config": {"type": "XML", "parseMode": "RMETA"}
+ *   },
+ *   "metadata-filters": ["mock-upper-case-filter"]
  * }
  * </pre>
  */
 public class ParseContextSerializer extends JsonSerializer<ParseContext> {
 
     public static final String PARSE_CONTEXT = "parseContext";
+    public static final String TYPED = "typed";
 
     @Override
     public void serialize(ParseContext parseContext, JsonGenerator gen,
                          SerializerProvider serializers) throws IOException {
         gen.writeStartObject();
 
-        // Serialize all JSON configs - this is the source of truth for 
round-trip
+        ObjectMapper mapper = (ObjectMapper) gen.getCodec();
+
+        // First, serialize typed objects from the context map under "typed" 
key
+        Map<String, Object> contextMap = parseContext.getContextMap();
+        boolean hasTypedObjects = false;
+
+        for (Map.Entry<String, Object> entry : contextMap.entrySet()) {
+            String className = entry.getKey();
+            String componentName = findComponentName(className);
+            if (componentName != null) {
+                if (!hasTypedObjects) {
+                    gen.writeFieldName(TYPED);
+                    gen.writeStartObject();
+                    hasTypedObjects = true;
+                }
+                gen.writeFieldName(componentName);
+                gen.writeRawValue(mapper.writeValueAsString(entry.getValue()));
+            }
+        }
+
+        if (hasTypedObjects) {
+            gen.writeEndObject();
+        }
+
+        // Then, serialize JSON configs at the top level
         Map<String, JsonConfig> jsonConfigs = parseContext.getJsonConfigs();
         for (Map.Entry<String, JsonConfig> entry : jsonConfigs.entrySet()) {
             gen.writeFieldName(entry.getKey());
-            // Write the JSON config as raw value (it's already valid JSON)
             gen.writeRawValue(entry.getValue().json());
         }
 
         gen.writeEndObject();
     }
+
+    /**
+     * Finds the component name for a class.
+     * Loads the class with {@code Class.forName(className)} and asks
+     * ComponentNameResolver for its friendly name. The caller only serializes
+     * entries for which this method returns a non-null name.
+     *
+     * @param className the fully qualified class name
+     * @return the name reported by ComponentNameResolver (the caller treats null
+     *         as "not registered"), or null if the class cannot be loaded
+     */
+    private String findComponentName(String className) {
+        try {
+            Class<?> clazz = Class.forName(className);
+            return ComponentNameResolver.getFriendlyName(clazz);
+        } catch (ClassNotFoundException e) {
+            return null;
+        }
+    }
 }

Reply via email to