This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch transformerfactory-dry in repository https://gitbox.apache.org/repos/asf/tika.git
commit 0157809a6fbe9bd978ab024f40277078348af658 Author: tallison <[email protected]> AuthorDate: Thu Jul 31 17:35:37 2025 -0400 Add secure processing to transformer handlers throughout the codebase to avoid static analysis reports. --- .../src/main/java/org/apache/tika/cli/TikaCLI.java | 7 +++- .../src/main/java/org/apache/tika/gui/TikaGUI.java | 9 +++-- .../java/org/apache/tika/utils/XMLReaderUtils.java | 45 ++++++++++++++++++++-- .../apache/tika/pipes/grpc/TikaGrpcServerImpl.java | 13 +++++-- .../apache/tika/parser/html/HtmlParserTest.java | 3 +- .../tika/parser/microsoft/OutlookParserTest.java | 7 ++-- .../parser/microsoft/ooxml/OOXMLParserTest.java | 3 +- .../org/apache/tika/sax/BoilerpipeHandlerTest.java | 3 +- .../tika/server/core/resource/TikaResource.java | 5 ++- 9 files changed, 74 insertions(+), 21 deletions(-) diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java index b23474d11..ad859c561 100644 --- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java +++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java @@ -106,6 +106,7 @@ import org.apache.tika.sax.WriteOutContentHandler; import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler; import org.apache.tika.serialization.JsonMetadata; import org.apache.tika.serialization.JsonMetadataList; +import org.apache.tika.utils.XMLReaderUtils; import org.apache.tika.xmp.XMPMetadata; /** @@ -298,9 +299,11 @@ public class TikaCLI { * @throws TransformerConfigurationException if the transformer can not be created * @see <a href="https://issues.apache.org/jira/browse/TIKA-277">TIKA-277</a> */ - private static TransformerHandler getTransformerHandler(OutputStream output, String method, String encoding, boolean prettyPrint) throws TransformerConfigurationException { - SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); + private static TransformerHandler getTransformerHandler(OutputStream output, String method, String encoding, boolean prettyPrint) + throws TransformerConfigurationException, TikaException { + SAXTransformerFactory factory = XMLReaderUtils.getSAXTransformerFactory(); TransformerHandler handler = factory.newTransformerHandler(); + handler .getTransformer() .setOutputProperty(OutputKeys.METHOD, method); diff --git a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java index d314b472c..e80e6f2d1 100644 --- a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java +++ b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java @@ -91,6 +91,7 @@ import org.apache.tika.sax.TeeContentHandler; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler; import org.apache.tika.serialization.JsonMetadataList; +import org.apache.tika.utils.XMLReaderUtils; /** * Simple Swing GUI for Apache Tika. You can drag and drop files on top @@ -498,8 +499,8 @@ public class TikaGUI extends JFrame implements ActionListener, HyperlinkListener * @return HTML content handler * @throws TransformerConfigurationException if an error occurs */ - private ContentHandler getHtmlHandler(Writer writer) throws TransformerConfigurationException { - SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); + private ContentHandler getHtmlHandler(Writer writer) throws TransformerConfigurationException, TikaException { + SAXTransformerFactory factory = XMLReaderUtils.getSAXTransformerFactory(); TransformerHandler handler = factory.newTransformerHandler(); handler .getTransformer() @@ -573,8 +574,8 @@ public class TikaGUI extends JFrame implements ActionListener, HyperlinkListener return new BoilerpipeContentHandler(writer); } - private ContentHandler getXmlContentHandler(Writer writer) throws TransformerConfigurationException { - SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); + private ContentHandler getXmlContentHandler(Writer writer) throws TransformerConfigurationException, TikaException { + SAXTransformerFactory factory = XMLReaderUtils.getSAXTransformerFactory(); TransformerHandler handler = factory.newTransformerHandler(); handler .getTransformer() diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java index 9c27cfc1d..99ba32cc2 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java @@ -42,6 +42,7 @@ import javax.xml.transform.Transformer; import javax.xml.transform.TransformerConfigurationException; import javax.xml.transform.TransformerFactory; import javax.xml.transform.TransformerFactoryConfigurationError; +import javax.xml.transform.sax.SAXTransformerFactory; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -373,13 +374,51 @@ public class XMLReaderUtils implements Serializable { * @since Apache Tika 1.17 */ public static Transformer getTransformer() throws TikaException { + TransformerFactory transformerFactory = getTransformerFactory(); try { + return transformerFactory.newTransformer(); + } catch (TransformerConfigurationException e) { + throw new TikaException("Transformer not available", e); + } + } + + /** + * Returns a TransformerFactory. The factory is configured with + * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing} and other + * settings to prevent XXE. + * + * @return TransformerFactory + * @throws TikaException + */ + public static TransformerFactory getTransformerFactory() throws TikaException { + try { + TransformerFactory transformerFactory = TransformerFactory.newInstance(); transformerFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); trySetTransformerAttribute(transformerFactory, XMLConstants.ACCESS_EXTERNAL_DTD, ""); - trySetTransformerAttribute(transformerFactory, XMLConstants.ACCESS_EXTERNAL_STYLESHEET, - ""); - return transformerFactory.newTransformer(); + trySetTransformerAttribute(transformerFactory, XMLConstants.ACCESS_EXTERNAL_STYLESHEET, ""); + return transformerFactory; + } catch (TransformerConfigurationException | TransformerFactoryConfigurationError e) { + throw new TikaException("Transformer not available", e); + } + } + + /** + * Returns a SAXTransformerFactory. The factory is configured with + * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing} and other + * settings to prevent XXE. + * + * @return TransformerFactory + * @throws TikaException + */ + public static SAXTransformerFactory getSAXTransformerFactory() throws TikaException { + try { + + SAXTransformerFactory transformerFactory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); + transformerFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); + trySetTransformerAttribute(transformerFactory, XMLConstants.ACCESS_EXTERNAL_DTD, ""); + trySetTransformerAttribute(transformerFactory, XMLConstants.ACCESS_EXTERNAL_STYLESHEET, ""); + return transformerFactory; } catch (TransformerConfigurationException | TransformerFactoryConfigurationError e) { throw new TikaException("Transformer not available", e); } diff --git a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java index 4eb5f0b01..63ad8a256 100644 --- a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java +++ b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java @@ -67,6 +67,7 @@ import org.apache.tika.TikaGrpc; import org.apache.tika.config.Initializable; import org.apache.tika.config.Param; import org.apache.tika.exception.TikaConfigException; +import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.pipes.FetchEmitTuple; @@ -78,6 +79,7 @@ import org.apache.tika.pipes.fetcher.AbstractFetcher; import org.apache.tika.pipes.fetcher.FetchKey; import org.apache.tika.pipes.fetcher.config.AbstractConfig; import org.apache.tika.pipes.fetcher.config.FetcherConfigContainer; +import org.apache.tika.utils.XMLReaderUtils; class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase { private static final Logger LOG = LoggerFactory.getLogger(TikaGrpcServerImpl.class); @@ -115,11 +117,14 @@ class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase { expiringFetcherStore = new ExpiringFetcherStore(pipesConfig.getStaleFetcherTimeoutSeconds(), pipesConfig.getStaleFetcherDelaySeconds()); this.tikaConfigPath = tikaConfigPath; - updateTikaConfig(); + try { + updateTikaConfig(); + } catch (TikaException e) { + throw new TikaConfigException("Problem updating tikaConfig", e); + } } - private void updateTikaConfig() - throws ParserConfigurationException, IOException, SAXException, TransformerException { + private void updateTikaConfig() throws ParserConfigurationException, IOException, SAXException, TransformerException, TikaException { Document tikaConfigDoc = DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(tikaConfigPath); @@ -149,7 +154,7 @@ class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase { FileWriter writer = new FileWriter(tikaConfigPath, StandardCharsets.UTF_8); StreamResult result = new StreamResult(writer); - TransformerFactory transformerFactory = TransformerFactory.newInstance(); + TransformerFactory transformerFactory = XMLReaderUtils.getTransformerFactory(); Transformer transformer = transformerFactory.newTransformer(); transformer.transform(source, result); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java index 3fb542076..25f9ddeb1 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java @@ -82,6 +82,7 @@ import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.TeeContentHandler; import org.apache.tika.sax.WriteOutContentHandler; +import org.apache.tika.utils.XMLReaderUtils; public class HtmlParserTest extends TikaTest { @@ -710,7 +711,7 @@ public class HtmlParserTest extends TikaTest { * @throws Exception */ private ContentHandler makeHtmlTransformer(Writer writer) throws Exception { - SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); + SAXTransformerFactory factory = XMLReaderUtils.getSAXTransformerFactory(); TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html"); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no"); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java index a40e02b51..f7017379a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java @@ -46,6 +46,7 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.utils.XMLReaderUtils; /** * Test case for parsing Outlook files. @@ -180,7 +181,7 @@ public class OutlookParserTest extends TikaTest { // Check the HTML version StringWriter sw = new StringWriter(); - SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); + SAXTransformerFactory factory = XMLReaderUtils.getSAXTransformerFactory(); TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml"); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes"); @@ -229,7 +230,7 @@ public class OutlookParserTest extends TikaTest { // Check the HTML version StringWriter sw = new StringWriter(); - SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); + SAXTransformerFactory factory = XMLReaderUtils.getSAXTransformerFactory(); TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml"); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes"); @@ -272,7 +273,7 @@ public class OutlookParserTest extends TikaTest { // Check the HTML version StringWriter sw = new StringWriter(); - SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); + SAXTransformerFactory factory = XMLReaderUtils.getSAXTransformerFactory(); TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml"); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes"); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index a8169e7d8..fef9ef648 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -75,6 +75,7 @@ import org.apache.tika.parser.microsoft.OfficeParser; import org.apache.tika.parser.microsoft.OfficeParserConfig; import org.apache.tika.parser.microsoft.OfficeParserTest; import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.utils.XMLReaderUtils; public class OOXMLParserTest extends MultiThreadedTikaTest { @@ -860,7 +861,7 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { public void testEmbeddedPDF() throws Exception { Metadata metadata = new Metadata(); StringWriter sw = new StringWriter(); - SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); + SAXTransformerFactory factory = XMLReaderUtils.getSAXTransformerFactory(); TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml"); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no"); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/BoilerpipeHandlerTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/BoilerpipeHandlerTest.java index e66384cf6..855ee0a4d 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/BoilerpipeHandlerTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/BoilerpipeHandlerTest.java @@ -36,6 +36,7 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.html.JSoupParser; import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler; +import org.apache.tika.utils.XMLReaderUtils; public class BoilerpipeHandlerTest extends TikaTest { /** @@ -158,7 +159,7 @@ public class BoilerpipeHandlerTest extends TikaTest { * @throws Exception */ private ContentHandler makeHtmlTransformer(Writer writer) throws Exception { - SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); + SAXTransformerFactory factory = XMLReaderUtils.getSAXTransformerFactory(); TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html"); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no"); diff --git a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java index 0be09be21..257a9e79e 100644 --- a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java +++ b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java @@ -87,6 +87,7 @@ import org.apache.tika.server.core.ServerStatus; import org.apache.tika.server.core.TikaServerConfig; import org.apache.tika.server.core.TikaServerParseException; import org.apache.tika.utils.ExceptionUtils; +import org.apache.tika.utils.XMLReaderUtils; @Path("/tika") public class TikaResource { @@ -635,7 +636,7 @@ public class TikaResource { ContentHandler content; try { - SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); + SAXTransformerFactory factory = XMLReaderUtils.getSAXTransformerFactory(); TransformerHandler handler = factory.newTransformerHandler(); handler .getTransformer() @@ -651,7 +652,7 @@ public class TikaResource { .setOutputProperty(OutputKeys.VERSION, "1.1"); handler.setResult(new StreamResult(writer)); content = new ExpandedTitleContentHandler(handler); - } catch (TransformerConfigurationException e) { + } catch (TransformerConfigurationException | TikaException e) { throw new WebApplicationException(e); }
