This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4585-simplify-serialization in repository https://gitbox.apache.org/repos/asf/tika.git
commit 05e3fc4d9056330f51002aee96519970127bea49 Author: tallison <[email protected]> AuthorDate: Sat Dec 20 23:29:18 2025 -0500 TIKA-4585 -- further progress --- .../test/java/org/apache/tika/cli/TikaCLITest.java | 13 +++--- .../java/org/apache/tika/parser/ParseContext.java | 13 +++++- .../filter/AttachmentCountingListFilter.java | 47 ++++++++++++++++++++++ .../tika/metadata/filter/MockUpperCaseFilter.java | 40 ++++++++++++++++++ .../tika/server/core/resource/TikaResource.java | 14 ++++++- .../core/TikaServerPipesIntegrationTest.java | 2 +- .../standard/RecursiveMetadataResourceTest.java | 5 ++- .../tika/server/standard/TikaParsersTest.java | 10 +++-- .../apache/tika/server/standard/TikaPipesTest.java | 4 +- 9 files changed, 128 insertions(+), 20 deletions(-) diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java index 73e167d60..f09338059 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java @@ -38,8 +38,6 @@ import java.util.HashSet; import java.util.List; import java.util.Set; -import org.jetbrains.annotations.NotNull; -import org.jetbrains.annotations.Nullable; import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Disabled; @@ -237,7 +235,7 @@ public class TikaCLITest { public void testJsonMetadataPrettyPrintOutput() throws Exception { String json = getParamOutContent("--json", "-r", resourcePrefix + "testJsonMultipleInts.html"); - assertTrue(json.contains("org.apache.tika.parser.CompositeParser\", \"org.apache.tika.parser.html.JSoupParser")); + assertTrue(json.contains("org.apache.tika.parser.DefaultParser\", \"org.apache.tika.parser.html.JSoupParser")); //test pretty-print alphabetic sort of keys int enc = json.indexOf("\"Content-Encoding\""); int fb = json.indexOf("fb:admins"); @@ -413,22 +411,23 @@ public class TikaCLITest { final Set<String> names = new HashSet<>(); Files.walkFileTree(extractDir, new FileVisitor<Path>() { @Override - public @NotNull FileVisitResult preVisitDirectory(Path path, @NotNull BasicFileAttributes basicFileAttributes) throws IOException { + public FileVisitResult preVisitDirectory(Path path, BasicFileAttributes basicFileAttributes) throws IOException { return FileVisitResult.CONTINUE; } - public @NotNull FileVisitResult visitFile(Path path, @NotNull BasicFileAttributes basicFileAttributes) throws IOException { + @Override + public FileVisitResult visitFile(Path path, BasicFileAttributes basicFileAttributes) throws IOException { names.add(extractDir.relativize(path).toString().replace('\\', '/')); return FileVisitResult.CONTINUE; } @Override - public @NotNull FileVisitResult visitFileFailed(Path path, @NotNull IOException e) throws IOException { + public FileVisitResult visitFileFailed(Path path, IOException e) throws IOException { return FileVisitResult.CONTINUE; } @Override - public @NotNull FileVisitResult postVisitDirectory(Path path, @Nullable IOException e) throws IOException { + public FileVisitResult postVisitDirectory(Path path, IOException e) throws IOException { return FileVisitResult.CONTINUE; } }); diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java index 137fa9bca..c4587c1fd 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java +++ b/tika-core/src/main/java/org/apache/tika/parser/ParseContext.java @@ -51,8 +51,9 @@ public class ParseContext implements Serializable { /** * Cache of resolved objects from jsonConfigs, keyed by component name. * This is ignored during serialization to preserve round-trip fidelity. + * Note: Not final because Java serialization bypasses constructor initialization. */ - private final transient Map<String, Object> resolvedConfigs = new HashMap<>(); + private transient Map<String, Object> resolvedConfigs = new HashMap<>(); /** * Adds the given value to the context as an implementation of the given @@ -120,7 +121,9 @@ public class ParseContext implements Serializable { jsonConfigs.put(name, config); } else { jsonConfigs.remove(name); - resolvedConfigs.remove(name); + if (resolvedConfigs != null) { + resolvedConfigs.remove(name); + } } } @@ -178,6 +181,9 @@ public class ParseContext implements Serializable { */ @SuppressWarnings("unchecked") public <T> T getResolvedConfig(String name) { + if (resolvedConfigs == null) { + return null; + } return (T) resolvedConfigs.get(name); } @@ -191,6 +197,9 @@ public class ParseContext implements Serializable { * @since Apache Tika 4.0 */ public void setResolvedConfig(String name, Object config) { + if (resolvedConfigs == null) { + resolvedConfigs = new HashMap<>(); + } if (config != null) { resolvedConfigs.put(name, config); } else { diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/metadata/filter/AttachmentCountingListFilter.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/metadata/filter/AttachmentCountingListFilter.java new file mode 100644 index 000000000..fdf4f1c22 --- /dev/null +++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/metadata/filter/AttachmentCountingListFilter.java @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.metadata.filter; + +import java.util.List; + +import org.apache.tika.config.TikaComponent; +import org.apache.tika.exception.TikaException; +import org.apache.tika.metadata.Metadata; + +@TikaComponent +public class AttachmentCountingListFilter extends MetadataFilter { + + private Integer count = 0; + + @Override + public List<Metadata> filter(List<Metadata> metadataList) throws TikaException { + if (metadataList == null || metadataList.isEmpty()) { + return metadataList; + } + metadataList.get(0).set("X-TIKA:attachment_count", Integer.toString(metadataList.size() - 1)); + count += metadataList.size(); + return metadataList; + } + + public Integer getCount() { + return count; + } + + public void setCount(Integer count) { + this.count = count; + } +} diff --git a/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java new file mode 100644 index 000000000..a57b5dd7d --- /dev/null +++ b/tika-pipes/tika-pipes-integration-tests/src/test/java/org/apache/tika/metadata/filter/MockUpperCaseFilter.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.tika.metadata.filter; + +import java.util.Locale; + +import org.apache.tika.config.TikaComponent; +import org.apache.tika.metadata.Metadata; + +/** + * Mock Filter for testing uppercasing of all values + */ +@TikaComponent +public class MockUpperCaseFilter extends MetadataFilterBase { + + @Override + protected void filter(Metadata metadata) { + for (String n : metadata.names()) { + String[] vals = metadata.getValues(n); + metadata.remove(n); + for (String val : vals) { + metadata.add(n, val.toUpperCase(Locale.US)); + } + } + } +} diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java index 71327764a..4729e66a4 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java @@ -62,6 +62,7 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.apache.tika.Tika; +import org.apache.tika.config.JsonConfig; import org.apache.tika.config.TikaTaskTimeout; import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.exception.EncryptedDocumentException; @@ -78,8 +79,8 @@ import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ExpandedTitleContentHandler; import org.apache.tika.sax.RichTextContentHandler; import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler; -import org.apache.tika.serialization.serdes.ParseContextDeserializer; import org.apache.tika.serialization.ParseContextUtils; +import org.apache.tika.serialization.serdes.ParseContextDeserializer; import org.apache.tika.server.core.InputStreamFactory; import org.apache.tika.server.core.ServerStatus; import org.apache.tika.server.core.TikaServerConfig; @@ -148,8 +149,17 @@ public class TikaResource { ObjectMapper mapper = new ObjectMapper(); JsonNode root = mapper.readTree(configJson); // Use root directly - the JSON should contain parser configs at the top level - ParseContext configuredContext = ParseContextDeserializer.readParseContext(root); + ParseContext configuredContext = ParseContextDeserializer.readParseContext(root, mapper); + + // Copy jsonConfigs first (for SelfConfiguring parsers like PDFParser) + for (Map.Entry<String, JsonConfig> entry : configuredContext.getJsonConfigs().entrySet()) { + context.setJsonConfig(entry.getKey(), entry.getValue()); + } + + // Then resolve all configs to typed objects ParseContextUtils.resolveAll(configuredContext, Thread.currentThread().getContextClassLoader()); + + // Copy resolved typed objects from contextMap for (Map.Entry<String, Object> entry : configuredContext.getContextMap().entrySet()) { try { Class<?> clazz = Class.forName(entry.getKey()); diff --git a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerPipesIntegrationTest.java b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerPipesIntegrationTest.java index 5ba494c44..e7957c9df 100644 --- a/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerPipesIntegrationTest.java +++ b/tika-server/tika-server-core/src/test/java/org/apache/tika/server/core/TikaServerPipesIntegrationTest.java @@ -222,7 +222,7 @@ public class TikaServerPipesIntegrationTest extends IntegrationTestBase { private String getJsonStringWithTimeout(String fileName, long timeoutMillis) throws IOException { ParseContext parseContext = new ParseContext(); parseContext.set(HandlerConfig.class, DEFAULT_HANDLER_CONFIG); - parseContext.addConfig("tika-task-timeout", "{\"timeoutMillis\":" + timeoutMillis + "}"); + parseContext.setJsonConfig("tika-task-timeout", "{\"timeoutMillis\":" + timeoutMillis + "}"); FetchEmitTuple t = new FetchEmitTuple(fileName, new FetchKey(CXFTestBase.FETCHER_ID, fileName), diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java index 33aae76b8..8dcd90a29 100644 --- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java +++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/RecursiveMetadataResourceTest.java @@ -118,9 +118,10 @@ public class RecursiveMetadataResourceTest extends CXFTestBase { .get(0) .getValues(TikaCoreProperties.TIKA_PARSED_BY); //make sure the CompressorParser doesn't show up here - assertEquals(2, parsedBy.length); + assertEquals(3, parsedBy.length); assertEquals("org.apache.tika.parser.CompositeParser", parsedBy[0]); - assertEquals("org.apache.tika.parser.microsoft.ooxml.OOXMLParser", parsedBy[1]); + assertEquals("org.apache.tika.parser.DefaultParser", parsedBy[1]); + assertEquals("org.apache.tika.parser.microsoft.ooxml.OOXMLParser", parsedBy[2]); //test that the rest is as it should be assertEquals(12, metadataList.size()); diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaParsersTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaParsersTest.java index 64a5e090e..1eae7ba45 100644 --- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaParsersTest.java +++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaParsersTest.java @@ -100,9 +100,9 @@ public class TikaParsersTest extends CXFTestBase { String text = getStringFromInputStream((InputStream) response.getEntity()); assertContains("Composite", text); - assertContains("<h3>OpusParser", text); - assertContains("<h3>PackageParser", text); - assertContains("<h3>OOXMLParser", text); + assertContains("<h4>OpusParser", text); + assertContains("<h4>PackageParser", text); + assertContains("<h4>OOXMLParser", text); assertContains(OpusParser.class.getName(), text); assertContains(PackageParser.class.getName(), text); @@ -145,7 +145,9 @@ public class TikaParsersTest extends CXFTestBase { assertEquals(Boolean.TRUE, json.get("composite")); // At least 20 child parsers which aren't composite, except for CompositeExternalParser - List<Object> children = (List) json.get("children"); + List<Object> wrapper = (List) json.get("children"); + Map<String, Object> firstItem = (Map) wrapper.get(0); + List<Object> children = (List) firstItem.get("children"); assertTrue(children.size() >= 2); boolean hasOpus = false, hasOOXML = false, hasZip = false; int nonComposite = 0; diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java index ac82d820a..3761fb0bf 100644 --- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java +++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaPipesTest.java @@ -182,7 +182,7 @@ public class TikaPipesTest extends CXFTestBase { public void testConcatenated() throws Exception { ParseContext parseContext = new ParseContext(); // Use addConfig with JSON for handler-config - parseContext.addConfig("handler-config", + parseContext.setJsonConfig("handler-config", "{\"type\": \"TEXT\", \"parseMode\": \"CONCATENATE\", \"writeLimit\": -1, \"maxEmbeddedResources\": -1, \"throwOnWriteLimitReached\": true}"); FetchEmitTuple t = new FetchEmitTuple("myId", new FetchKey(FETCHER_ID, "test_recursive_embedded.docx"), @@ -212,7 +212,7 @@ public class TikaPipesTest extends CXFTestBase { public void testPDFConfig() throws Exception { ParseContext parseContext = new ParseContext(); // Configure PDFParser via JSON config (pdf-parser is self-configuring) - parseContext.addConfig("pdf-parser", "{\"sortByPosition\": true}"); + parseContext.setJsonConfig("pdf-parser", "{\"sortByPosition\": true}"); FetchEmitTuple t = new FetchEmitTuple("myId", new FetchKey(FETCHER_ID, TEST_TWO_BOXES_PDF), new EmitKey(EMITTER_JSON_ID, ""), new Metadata(), parseContext);
