This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4581-rm-metadata-filter-where-unneeded in repository https://gitbox.apache.org/repos/asf/tika.git
commit 3fd062d2c5e19368aaac86153b327c044fc83f8f Author: tallison <[email protected]> AuthorDate: Wed Dec 17 17:20:53 2025 -0500 TIKA-4581 - rm metadata filter where it isn't needed any more --- .../src/main/java/org/apache/tika/cli/TikaCLI.java | 3 +- .../tika/sax/RecursiveParserWrapperHandler.java | 22 +---------- .../tika/parser/RecursiveParserWrapperTest.java | 46 ---------------------- .../apache/tika/parser/image/JpegParserTest.java | 4 +- .../core/resource/RecursiveMetadataResource.java | 8 +--- 5 files changed, 6 insertions(+), 77 deletions(-) diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index 411bd070a..a0f0ddb57 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -553,8 +553,7 @@ public class TikaCLI { private void handleRecursiveJson(URL url, OutputStream output) throws IOException, SAXException, TikaException { Metadata metadata = new Metadata(); RecursiveParserWrapper wrapper = new RecursiveParserWrapper(parser); - RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(getContentHandlerFactory(type), -1, - tikaLoader.loadMetadataFilters()); + RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(getContentHandlerFactory(type), -1); try (TikaInputStream tis = TikaInputStream.get(url, metadata)) { wrapper.parse(tis, handler, metadata, context); } diff --git a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java index b65fdbd61..154d5733a 100644 --- a/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java +++ b/tika-core/src/main/java/org/apache/tika/sax/RecursiveParserWrapperHandler.java @@ -26,11 +26,8 @@ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; -import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.metadata.filter.MetadataFilter; -import org.apache.tika.metadata.filter.NoOpFilter; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.utils.ParserUtils; @@ -51,13 +48,12 @@ import org.apache.tika.utils.ParserUtils; public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrapperHandler { protected final List<Metadata> metadataList = new LinkedList<>(); - private final MetadataFilter metadataFilter; /** * Create a handler with no limit on the number of embedded resources */ public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory) { - this(contentHandlerFactory, -1, NoOpFilter.NOOP_FILTER); + super(contentHandlerFactory, -1); } /** @@ -68,13 +64,7 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe */ public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, int maxEmbeddedResources) { - this(contentHandlerFactory, maxEmbeddedResources, NoOpFilter.NOOP_FILTER); - } - - public RecursiveParserWrapperHandler(ContentHandlerFactory contentHandlerFactory, - int maxEmbeddedResources, MetadataFilter metadataFilter) { super(contentHandlerFactory, maxEmbeddedResources); - this.metadataFilter = metadataFilter; } /** @@ -102,11 +92,6 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe throws SAXException { super.endEmbeddedDocument(contentHandler, metadata); addContent(contentHandler, metadata); - try { - metadataFilter.filter(List.of(metadata)); - } catch (TikaException e) { - throw new SAXException(e); - } if (metadata.size() > 0) { metadataList.add(ParserUtils.cloneMetadata(metadata)); @@ -122,11 +107,6 @@ public class RecursiveParserWrapperHandler extends AbstractRecursiveParserWrappe public void endDocument(ContentHandler contentHandler, Metadata metadata) throws SAXException { super.endDocument(contentHandler, metadata); addContent(contentHandler, metadata); - try { - metadataFilter.filter(List.of(metadata)); - } catch (TikaException e) { - throw new SAXException(e); - } if (metadata.size() > 0) { metadataList.add(0, ParserUtils.cloneMetadata(metadata)); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java index 3cabccdfa..e009d65b9 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java @@ -20,7 +20,6 @@ import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; -import static org.junit.jupiter.api.Assertions.fail; import java.io.IOException; import java.io.InputStream; @@ -33,16 +32,13 @@ import org.apache.commons.io.input.ClosedInputStream; import org.apache.commons.io.input.ProxyInputStream; import org.junit.jupiter.api.Test; -import org.apache.tika.TikaLoaderHelper; import org.apache.tika.TikaTest; -import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.digest.DigestDef; import org.apache.tika.digest.Digester; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; -import org.apache.tika.metadata.filter.MetadataFilter; import org.apache.tika.parser.digestutils.CommonsDigester; import org.apache.tika.sax.AbstractRecursiveParserWrapperHandler; import org.apache.tika.sax.BasicContentHandlerFactory; @@ -393,48 +389,6 @@ public class RecursiveParserWrapperTest extends TikaTest { } - @Test - public void testIncludeFilter() throws Exception { - //TIKA-3137 - ParseContext context = new ParseContext(); - Metadata metadata = new Metadata(); - TikaLoader tikaLoader = TikaLoaderHelper.getLoader("TIKA-3137-include.json"); - Parser p = tikaLoader.loadAutoDetectParser(); - MetadataFilter metadataFilter = tikaLoader.loadMetadataFilters(); - RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p, true); - String path = "/test-documents/test_recursive_embedded.docx"; - ContentHandlerFactory contentHandlerFactory = - new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.TEXT, -1); - - RecursiveParserWrapperHandler handler = - new RecursiveParserWrapperHandler(contentHandlerFactory, -1, - metadataFilter); - try (TikaInputStream tis = getResourceAsStream(path)) { - wrapper.parse(tis, handler, metadata, context); - } - List<Metadata> metadataList = handler.getMetadataList(); - assertEquals(5, metadataList.size()); - - Set<String> expectedKeys = new HashSet<>(); - expectedKeys.add("X-TIKA:content"); - expectedKeys.add("extended-properties:Application"); - expectedKeys.add("Content-Type"); - for (Metadata m : metadataList) { - if (m.get(Metadata.CONTENT_TYPE).equals("image/emf")) { - fail("emf should have been filtered out"); - } - if (m.get(Metadata.CONTENT_TYPE).startsWith("text/plain")) { - fail("text/plain should have been filtered out"); - } - assertTrue(m.names().length >= 2); - for (String n : m.names()) { - if (!expectedKeys.contains(n)) { - fail("didn't expect " + n); - } - } - } - } - @SuppressWarnings("deprecation") private List<Metadata> getMetadata(Metadata metadata, ContentHandlerFactory contentHandlerFactory, diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/image/JpegParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/image/JpegParserTest.java index c811b9356..11af0ccef 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/image/JpegParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/image/JpegParserTest.java @@ -45,12 +45,12 @@ public class JpegParserTest extends TikaTest { RecursiveParserWrapper wrapper = new RecursiveParserWrapper(p); RecursiveParserWrapperHandler handler = - new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1), 1000, metadataFilter); + new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1), 1000); try (InputStream is = getResourceAsStream("/test-documents/testJPEG_GEO_2.jpg")) { wrapper.parse(TikaInputStream.get(is), handler, new Metadata(), new ParseContext()); } List<Metadata> metadataList = handler.getMetadataList(); - + metadataList = metadataFilter.filter(metadataList); Metadata metadata = metadataList.get(0); // Geo tags should be there with 5dp, and not rounded assertEquals("51.575762", metadata.get(Metadata.LATITUDE)); diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java index 3f71ae67e..61d6f1440 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/RecursiveMetadataResource.java @@ -72,9 +72,7 @@ public class RecursiveMetadataResource { BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.getType(); RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(type, handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(), context), - handlerConfig.getMaxEmbeddedResources(), TikaResource - .getTikaLoader() - .loadMetadataFilters()); + handlerConfig.getMaxEmbeddedResources()); try { TikaResource.parse(wrapper, LOG, "/rmeta", tis, handler, metadata, context); } catch (TikaServerParseException e) { @@ -179,9 +177,7 @@ public class RecursiveMetadataResource { BasicContentHandlerFactory.HANDLER_TYPE type = handlerConfig.getType(); RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler(new BasicContentHandlerFactory(type, handlerConfig.getWriteLimit(), handlerConfig.isThrowOnWriteLimitReached(), context), - handlerConfig.getMaxEmbeddedResources(), TikaResource - .getTikaLoader() - .loadMetadataFilters()); + handlerConfig.getMaxEmbeddedResources()); try { TikaResource.parse(wrapper, LOG, "/rmeta/config", tis, handler, metadata, context); } catch (TikaServerParseException e) {
