This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4472 in repository https://gitbox.apache.org/repos/asf/tika.git
commit ab0db181680382d550996e39a28b70e942a2a8ef Author: tallison <[email protected]> AuthorDate: Thu Aug 21 10:46:45 2025 -0400 TIKA-4472 -- extract macros by default in tika-app when parsing a single file --- .../src/main/java/org/apache/tika/cli/TikaCLI.java | 38 +++++----------- .../resources/tika-config-default-single-file.xml | 49 +++++++++++++++++++++ .../java/org/apache/tika/cli/TikaCLIAsyncTest.java | 2 +- .../test/java/org/apache/tika/cli/TikaCLITest.java | 18 +++++++- .../test/resources/test-data/testPDFPackage.pdf | Bin 0 -> 92359 bytes .../test/resources/test-data/testPPT_macros.ppt | Bin 0 -> 88064 bytes 6 files changed, 79 insertions(+), 28 deletions(-) diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index aefc03660..96276935b 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -95,7 +95,6 @@ import org.apache.tika.parser.ParserDecorator; import org.apache.tika.parser.PasswordProvider; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.parser.digestutils.CommonsDigester; -import org.apache.tika.parser.pdf.PDFParserConfig; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.ContentHandlerFactory; @@ -339,21 +338,6 @@ public class TikaCLI { return false; } - private void configurePDFExtractSettings() { - if (configFilePath == null && context.get(PDFParserConfig.class) == null) { - PDFParserConfig pdfParserConfig = new PDFParserConfig(); - pdfParserConfig.setExtractInlineImages(true); - pdfParserConfig.setExtractIncrementalUpdateInfo(true); - pdfParserConfig.setParseIncrementalUpdates(true); - String warn = "As a convenience, TikaCLI has turned on extraction of\n" + - "inline images and parsing of incremental updates for the PDFParser (TIKA-2374, " + - "TIKA-4017 and TIKA-4354).\n" + - "This is not the default behavior in Tika generally or in tika-server."; - LOG.info(warn); - context.set(PDFParserConfig.class, pdfParserConfig); - } - } - public void process(String arg) throws Exception { if (arg.equals("-?") || arg.equals("--help")) { pipeMode = false; @@ -478,7 +462,6 @@ public class TikaCLI { } else { url = new URL(arg); } - configurePDFExtractSettings(); if (recursiveJSON) { handleRecursiveJson(url, System.out); } else { @@ -669,17 +652,21 @@ public class TikaCLI { } private void configure() throws TikaException, IOException, SAXException { - + if (configFilePath != null) { + config = new TikaConfig(new File(configFilePath)); + } else { + String warn = "As a convenience, TikaCLI has turned on several non-default features\n" + + "as specified in tika-app/src/main/resources/tika-config-default-single-file.xml.\n" + + "See: TIKA-2374, TIKA-4017, TIKA-4354 and TIKA-4472).\n" + + "This is not the default behavior in Tika generally or in tika-server."; + LOG.info(warn); + try (InputStream is = getClass().getResourceAsStream("/tika-config-default-single-file.xml")) { + config = new TikaConfig(is); + } + } if (networkURI != null) { parser = new NetworkParser(networkURI); - config = TikaConfig.getDefaultConfig(); } else { - if (configFilePath != null) { - config = new TikaConfig(new File(configFilePath)); - } else { - config = TikaConfig.getDefaultConfig(); - } - parser = new AutoDetectParser(config); if (digester != null) { parser = new DigestingParser(parser, digester, false); @@ -1080,7 +1067,6 @@ public class TikaCLI { private class FileEmbeddedDocumentExtractor implements EmbeddedDocumentExtractor { - private final TikaConfig config = TikaConfig.getDefaultConfig(); private final EmbeddedStreamTranslator embeddedStreamTranslator = new DefaultEmbeddedStreamTranslator(); private int count = 0; diff --git a/tika-app/src/main/resources/tika-config-default-single-file.xml b/tika-app/src/main/resources/tika-config-default-single-file.xml new file mode 100644 index 000000000..696b555a8 --- /dev/null +++ b/tika-app/src/main/resources/tika-config-default-single-file.xml @@ -0,0 +1,49 @@ +<?xml version="1.0" encoding="UTF-8" standalone="no" ?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. +--> +<properties> + <service-loader initializableProblemHandler="throw"/> + <parsers> + <parser class="org.apache.tika.parser.DefaultParser"> + <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/> + <parser-exclude class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"/> + <parser-exclude class="org.apache.tika.parser.microsoft.OfficeParser"/> + </parser> + <parser class="org.apache.tika.parser.pdf.PDFParser"> + <params> + <param name="extractActions" type="bool">true</param> + <param name="extractInlineImages" type="bool">true</param> + <param name="extractIncrementalUpdateInfo" type="bool">true</param> + <param name="parseIncrementalUpdates" type="bool">true</param> + </params> + </parser> + <parser class="org.apache.tika.parser.microsoft.ooxml.OOXMLParser"> + <params> + <param name="includeDeletedContent" type="bool">true</param> + <param name="includeMoveFromContent" type="bool">true</param> + <param name="extractMacros" type="bool">true</param> + </params> + </parser> + <parser class="org.apache.tika.parser.microsoft.OfficeParser"> + <params> + <param name="extractMacros" type="bool">true</param> + </params> + </parser> + </parsers> +</properties> diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java index 072f2c7d7..56e1289b8 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLIAsyncTest.java @@ -120,7 +120,7 @@ public class TikaCLIAsyncTest { json++; } } - assertEquals(18, json); + assertEquals(20, json); } private void checkForPrettyPrint(File f) throws IOException { diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java index 13bf4153d..2195685d7 100644 --- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java +++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java @@ -232,7 +232,8 @@ public class TikaCLITest { public void testJsonMetadataPrettyPrintOutput() throws Exception { String json = getParamOutContent("--json", "-r", resourcePrefix + "testJsonMultipleInts.html"); - assertTrue(json.contains("\"X-TIKA:Parsed-By\" : [ \"org.apache.tika.parser.DefaultParser\", " + "\"org.apache.tika.parser.html.JSoupParser\" ],")); + assertTrue(json.contains("\"X-TIKA:Parsed-By\" : [ \"org.apache.tika.parser.CompositeParser\", " + + "\"org.apache.tika.parser.DefaultParser\", \"org.apache.tika.parser.html.JSoupParser\" ],")); //test pretty-print alphabetic sort of keys int enc = json.indexOf("\"Content-Encoding\""); int fb = json.indexOf("fb:admins"); @@ -249,6 +250,21 @@ public class TikaCLITest { assertTrue(json.contains("embeddedResourceType\":\"VERSION\"")); } + @Test + public void testExtractJavascript() throws Exception { + String json = getParamOutContent("-J", resourcePrefix + "testPDFPackage.pdf"); + assertTrue(json.contains("type=\\\"PDActionJavaScript\\\"")); + assertTrue(json.contains("MACRO")); + assertTrue(json.contains("NAMES_TREE")); + } + + @Test + public void testMacros() throws Exception { + String json = getParamOutContent("-J", resourcePrefix + "testPPT_macros.ppt"); + assertTrue(json.contains("MACRO")); + assertTrue(json.contains("Module1")); + } + /** * Tests -l option of the cli * diff --git a/tika-app/src/test/resources/test-data/testPDFPackage.pdf b/tika-app/src/test/resources/test-data/testPDFPackage.pdf new file mode 100644 index 000000000..0cd2d487a Binary files /dev/null and b/tika-app/src/test/resources/test-data/testPDFPackage.pdf differ diff --git a/tika-app/src/test/resources/test-data/testPPT_macros.ppt b/tika-app/src/test/resources/test-data/testPPT_macros.ppt new file mode 100644 index 000000000..7af9008dd Binary files /dev/null and b/tika-app/src/test/resources/test-data/testPPT_macros.ppt differ
