This is an automated email from the ASF dual-hosted git repository. ndipiazza pushed a commit to branch TIKA-4851-revert in repository https://gitbox.apache.org/repos/asf/tika.git
commit 44c6c56a4f020b1722b8d8ef6cd5c77d8fde42e3 Author: Nicholas DiPiazza <[email protected]> AuthorDate: Fri Dec 19 13:38:24 2025 -0600 Revert "Clean up jackson settings on metadata list serialization/deserialization (#2464)" This reverts commit fbf5f82527d95dba3c95b346d73d1bb4102c34d1. --- .../org/apache/tika/config/GlobalSettings.java | 57 ++++++++++++++ .../apache/tika/config/loader/TikaJsonConfig.java | 2 +- .../org/apache/tika/config/loader/TikaLoader.java | 60 +++------------ .../apache/tika/serialization/JsonMetadata.java | 89 +++++++++------------- .../tika/serialization/JsonMetadataList.java | 74 ++++++------------ .../test/resources/configs/tika-config-json.json | 6 +- .../test/resources/configs/tika-config-json.json | 4 +- 7 files changed, 129 insertions(+), 163 deletions(-) diff --git a/tika-serialization/src/main/java/org/apache/tika/config/GlobalSettings.java b/tika-serialization/src/main/java/org/apache/tika/config/GlobalSettings.java index 7493000ae..7d07c3b9e 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/GlobalSettings.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/GlobalSettings.java @@ -25,6 +25,7 @@ import com.fasterxml.jackson.annotation.JsonProperty; * <p>Example JSON: * <pre> * { + * "maxJsonStringFieldLength": 50000000, * "xml-reader-utils": { * "maxEntityExpansions": 1000, * "maxNumReuses": 100, @@ -35,6 +36,20 @@ import com.fasterxml.jackson.annotation.JsonProperty; */ public class GlobalSettings { + /** + * Static maximum length for JSON string fields. + * Default: 20,000,000 (Jackson's default) + * This is static because it's a global setting that affects all JSON parsing. + */ + private static Integer maxJsonStringFieldLength = 20_000_000; + + /** + * Instance field for deserialization from JSON. + * The value is copied to the static field when set. + */ + @JsonProperty("maxJsonStringFieldLength") + private Integer instanceMaxJsonStringFieldLength = 20_000_000; + /** * Service loader configuration for handling initialization problems. */ @@ -47,6 +62,48 @@ public class GlobalSettings { @JsonProperty("xml-reader-utils") private XmlReaderUtilsConfig xmlReaderUtils; + /** + * Gets the static maximum JSON string field length. + * + * @return the max length, or null if not set + */ + public static Integer getMaxJsonStringFieldLength() { + return maxJsonStringFieldLength; + } + + /** + * Sets the static maximum JSON string field length. + * This affects all JSON parsing globally. + * + * @param length the max length to set + */ + public static void setMaxJsonStringFieldLength(Integer length) { + maxJsonStringFieldLength = length; + } + + /** + * Instance getter for deserialization. + * Returns the instance value which may differ from the static value. + * + * @return the instance max length + */ + public Integer getInstanceMaxJsonStringFieldLength() { + return instanceMaxJsonStringFieldLength; + } + + /** + * Instance setter for deserialization. + * Automatically updates the static field when set. + * + * @param length the max length to set + */ + public void setInstanceMaxJsonStringFieldLength(Integer length) { + this.instanceMaxJsonStringFieldLength = length; + if (length != null) { + setMaxJsonStringFieldLength(length); + } + } + public ServiceLoaderConfig getServiceLoader() { return serviceLoader; } diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java index 8ce14a30f..2eeb8bc7a 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaJsonConfig.java @@ -103,7 +103,7 @@ public class TikaJsonConfig { */ private static final Set<String> KNOWN_KEYS = Set.of( // Globals - "metadata-list", + "maxJsonStringFieldLength", "service-loader", "xml-reader-utils", // Core Tika component keys diff --git a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java index b82beb7d8..52e17d9d1 100644 --- a/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java +++ b/tika-serialization/src/main/java/org/apache/tika/config/loader/TikaLoader.java @@ -21,8 +21,6 @@ import java.nio.file.Path; import java.util.Collections; import java.util.List; -import com.fasterxml.jackson.core.StreamReadConstraints; -import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.tika.config.GlobalSettings; @@ -41,8 +39,6 @@ import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.Parser; import org.apache.tika.renderer.CompositeRenderer; import org.apache.tika.renderer.Renderer; -import org.apache.tika.serialization.JsonMetadata; -import org.apache.tika.serialization.JsonMetadataList; /** * Main entry point for loading Tika components from JSON configuration. @@ -400,18 +396,17 @@ public class TikaLoader { * * <p>Settings include: * <ul> - * <li>metadata-list - Jackson StreamReadConstraints for JsonMetadata/JsonMetadataList serialization</li> - * <li>service-loader - Service loader configuration</li> + * <li>maxJsonStringFieldLength - Maximum JSON string field length (static, affects all JSON parsing)</li> + * <li>service-loader.initializableProblemHandler - How to handle initialization problems</li> * <li>xml-reader-utils - XML parser security settings</li> * </ul> * * <p>Example JSON: * <pre> * { - * "metadata-list": { - * "maxStringLength": 50000000, - * "maxNestingDepth": 10, - * "maxNumberLength": 500 + * "maxJsonStringFieldLength": 50000000, + * "service-loader": { + * "initializableProblemHandler": "ignore" * }, * "xml-reader-utils": { * "maxEntityExpansions": 1000, @@ -428,8 +423,11 @@ public class TikaLoader { if (globalSettings == null) { globalSettings = new GlobalSettings(); - // Load metadata-list config for JsonMetadata/JsonMetadataList serialization - loadMetadataListConfig(); + // Load maxJsonStringFieldLength from top level and set it statically + if (config.getRootNode().has("maxJsonStringFieldLength")) { + GlobalSettings.setMaxJsonStringFieldLength( + config.getRootNode().get("maxJsonStringFieldLength").asInt()); + } // Load service-loader config (official Tika config at root level) GlobalSettings.ServiceLoaderConfig serviceLoaderConfig = @@ -448,44 +446,6 @@ public class TikaLoader { return globalSettings; } - /** - * Loads the metadata-list configuration section and applies it to - * JsonMetadata and JsonMetadataList serializers. - * <p> - * Configuration uses Jackson's StreamReadConstraints property names: - * <pre> - * { - * "metadata-list": { - * "maxStringLength": 20000000, - * "maxNestingDepth": 10, - * "maxNumberLength": 500 - * } - * } - * </pre> - */ - private void loadMetadataListConfig() { - JsonNode metadataListNode = config.getRootNode().get("metadata-list"); - if (metadataListNode == null) { - return; - } - - StreamReadConstraints.Builder builder = StreamReadConstraints.builder(); - - if (metadataListNode.has("maxStringLength")) { - builder.maxStringLength(metadataListNode.get("maxStringLength").asInt()); - } - if (metadataListNode.has("maxNestingDepth")) { - builder.maxNestingDepth(metadataListNode.get("maxNestingDepth").asInt()); - } - if (metadataListNode.has("maxNumberLength")) { - builder.maxNumberLength(metadataListNode.get("maxNumberLength").asInt()); - } - - StreamReadConstraints constraints = builder.build(); - JsonMetadata.setStreamReadConstraints(constraints); - JsonMetadataList.setStreamReadConstraints(constraints); - } - /** * Gets the global settings if they have been loaded. * diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java index 504fb4f19..e9adec234 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadata.java @@ -26,69 +26,22 @@ import com.fasterxml.jackson.core.StreamReadConstraints; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.module.SimpleModule; +import org.apache.tika.config.GlobalSettings; import org.apache.tika.metadata.Metadata; public class JsonMetadata { static volatile boolean PRETTY_PRINT = false; - /** - * Default stream read constraints for metadata serialization. - */ - private static final StreamReadConstraints DEFAULT_CONSTRAINTS = StreamReadConstraints - .builder() - .maxNestingDepth(10) - .maxStringLength(20_000_000) - .maxNumberLength(500) - .build(); - - private static volatile StreamReadConstraints streamReadConstraints = DEFAULT_CONSTRAINTS; - private static volatile ObjectMapper OBJECT_MAPPER; - private static volatile ObjectMapper PRETTY_SERIALIZER; + private static ObjectMapper OBJECT_MAPPER; + private static final ObjectMapper PRETTY_SERIALIZER; static { - rebuildObjectMappers(); - } - - private static void rebuildObjectMappers() { - JsonFactory factory = new JsonFactory(); - factory.setStreamReadConstraints(streamReadConstraints); - - ObjectMapper mapper = new ObjectMapper(factory); - SimpleModule baseModule = new SimpleModule(); - baseModule.addDeserializer(Metadata.class, new MetadataDeserializer()); - baseModule.addSerializer(Metadata.class, new MetadataSerializer()); - mapper.registerModule(baseModule); - OBJECT_MAPPER = mapper; - - ObjectMapper prettyMapper = new ObjectMapper(factory); + OBJECT_MAPPER = buildObjectMapper(StreamReadConstraints.DEFAULT_MAX_STRING_LEN); + PRETTY_SERIALIZER = new ObjectMapper(); SimpleModule prettySerializerModule = new SimpleModule(); prettySerializerModule.addSerializer(Metadata.class, new MetadataSerializer(true)); - prettyMapper.registerModule(prettySerializerModule); - PRETTY_SERIALIZER = prettyMapper; - } - - /** - * Sets the stream read constraints for JSON parsing of metadata. - * This affects all subsequent calls to {@link #fromJson(Reader)}. - * <p> - * Typically called by TikaLoader during initialization based on the - * "metadata-list" configuration section. - * - * @param constraints the constraints to use - */ - public static synchronized void setStreamReadConstraints(StreamReadConstraints constraints) { - streamReadConstraints = constraints; - rebuildObjectMappers(); - } - - /** - * Gets the current stream read constraints. - * - * @return the current constraints - */ - public static StreamReadConstraints getStreamReadConstraints() { - return streamReadConstraints; + PRETTY_SERIALIZER.registerModule(prettySerializerModule); } /** @@ -109,20 +62,46 @@ public class JsonMetadata { } /** - * Read metadata from reader. This does not close the reader. + * Read metadata from reader. + * <p> + * This does not close the reader. + * <p> + * This will reset the OBJECT_MAPPER if the max string length differs from that in TikaConfig. * * @param reader reader to read from - * @return Metadata or null if reader is null + * @return Metadata or null if nothing could be read from the reader * @throws IOException in case of parse failure or IO failure with Reader */ public static Metadata fromJson(Reader reader) throws IOException { if (reader == null) { return null; } + if (OBJECT_MAPPER + .getFactory() + .streamReadConstraints() + .getMaxStringLength() != GlobalSettings.getMaxJsonStringFieldLength()) { + OBJECT_MAPPER = buildObjectMapper(GlobalSettings.getMaxJsonStringFieldLength()); + } return OBJECT_MAPPER.readValue(reader, Metadata.class); } public static void setPrettyPrinting(boolean prettyPrint) { PRETTY_PRINT = prettyPrint; } + + static ObjectMapper buildObjectMapper(int maxStringLen) { + JsonFactory factory = new JsonFactory(); + factory.setStreamReadConstraints(StreamReadConstraints + .builder() + .maxNestingDepth(10) + .maxStringLength(maxStringLen) + .maxNumberLength(500) + .build()); + ObjectMapper objectMapper = new ObjectMapper(factory); + SimpleModule baseModule = new SimpleModule(); + baseModule.addDeserializer(Metadata.class, new MetadataDeserializer()); + baseModule.addSerializer(Metadata.class, new MetadataSerializer()); + objectMapper.registerModule(baseModule); + return objectMapper; + } } diff --git a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java index 7611cdfea..71427947b 100644 --- a/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java +++ b/tika-serialization/src/main/java/org/apache/tika/serialization/JsonMetadataList.java @@ -16,6 +16,8 @@ */ package org.apache.tika.serialization; +import static org.apache.tika.serialization.JsonMetadata.buildObjectMapper; + import java.io.IOException; import java.io.Reader; import java.io.Writer; @@ -27,69 +29,36 @@ import com.fasterxml.jackson.core.type.TypeReference; import com.fasterxml.jackson.databind.ObjectMapper; import com.fasterxml.jackson.databind.module.SimpleModule; +import org.apache.tika.config.GlobalSettings; import org.apache.tika.metadata.Metadata; public class JsonMetadataList { static volatile boolean PRETTY_PRINT = false; - /** - * Default stream read constraints for metadata list serialization. - */ - private static final StreamReadConstraints DEFAULT_CONSTRAINTS = StreamReadConstraints - .builder() - .maxNestingDepth(10) - .maxStringLength(20_000_000) - .maxNumberLength(500) - .build(); - - private static volatile StreamReadConstraints streamReadConstraints = DEFAULT_CONSTRAINTS; - private static volatile ObjectMapper OBJECT_MAPPER; - private static volatile ObjectMapper PRETTY_SERIALIZER; + private static ObjectMapper OBJECT_MAPPER; + private static final ObjectMapper PRETTY_SERIALIZER; static { - rebuildObjectMappers(); - } - - private static void rebuildObjectMappers() { JsonFactory factory = new JsonFactory(); - factory.setStreamReadConstraints(streamReadConstraints); - - ObjectMapper mapper = new ObjectMapper(factory); + factory.setStreamReadConstraints(StreamReadConstraints + .builder() + .maxNestingDepth(10) + .maxStringLength(GlobalSettings.getMaxJsonStringFieldLength()) + .maxNumberLength(500) +// .maxDocumentLength(1000000) + .build()); + OBJECT_MAPPER = new ObjectMapper(factory); SimpleModule baseModule = new SimpleModule(); baseModule.addDeserializer(Metadata.class, new MetadataDeserializer()); baseModule.addSerializer(Metadata.class, new MetadataSerializer()); - mapper.registerModule(baseModule); - OBJECT_MAPPER = mapper; + OBJECT_MAPPER.registerModule(baseModule); - ObjectMapper prettyMapper = new ObjectMapper(factory); + PRETTY_SERIALIZER = new ObjectMapper(factory); SimpleModule prettySerializerModule = new SimpleModule(); prettySerializerModule.addSerializer(Metadata.class, new MetadataSerializer(true)); - prettyMapper.registerModule(prettySerializerModule); - PRETTY_SERIALIZER = prettyMapper; - } - - /** - * Sets the stream read constraints for JSON parsing of metadata lists. - * This affects all subsequent calls to {@link #fromJson(Reader)}. - * <p> - * Typically called by TikaLoader during initialization based on the - * "metadata-list" configuration section. - * - * @param constraints the constraints to use - */ - public static synchronized void setStreamReadConstraints(StreamReadConstraints constraints) { - streamReadConstraints = constraints; - rebuildObjectMappers(); - } + PRETTY_SERIALIZER.registerModule(prettySerializerModule); - /** - * Gets the current stream read constraints. - * - * @return the current constraints - */ - public static StreamReadConstraints getStreamReadConstraints() { - return streamReadConstraints; } /** @@ -120,16 +89,21 @@ public class JsonMetadataList { } /** - * Read metadata from reader. This does not close the reader. + * Read metadata from reader. This does not close the reader * - * @param reader the reader to read from - * @return Metadata list or null if reader is null + * @param reader + * @return Metadata or null if nothing could be read from the reader * @throws IOException in case of parse failure or IO failure with Reader */ public static List<Metadata> fromJson(Reader reader) throws IOException { if (reader == null) { return null; } + if (OBJECT_MAPPER.getFactory().streamReadConstraints().getMaxStringLength() + != GlobalSettings.getMaxJsonStringFieldLength()) { + OBJECT_MAPPER = buildObjectMapper(GlobalSettings.getMaxJsonStringFieldLength()); + } + return OBJECT_MAPPER.readValue(reader, new TypeReference<List<Metadata>>(){}); } diff --git a/tika-serialization/src/test/resources/configs/tika-config-json.json b/tika-serialization/src/test/resources/configs/tika-config-json.json index 3650aab64..8d1e5feb0 100644 --- a/tika-serialization/src/test/resources/configs/tika-config-json.json +++ b/tika-serialization/src/test/resources/configs/tika-config-json.json @@ -1,5 +1,3 @@ { - "metadata-list": { - "maxStringLength": 50000000 - } -} + "maxJsonStringFieldLength": 50000000 +} \ No newline at end of file diff --git a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-json.json b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-json.json index 3650aab64..419a225e6 100644 --- a/tika-server/tika-server-standard/src/test/resources/configs/tika-config-json.json +++ b/tika-server/tika-server-standard/src/test/resources/configs/tika-config-json.json @@ -1,5 +1,3 @@ { - "metadata-list": { - "maxStringLength": 50000000 - } + "maxJsonStringFieldLength": 50000000 }
