This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch transformerfactory-dry
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 0157809a6fbe9bd978ab024f40277078348af658
Author: tallison <[email protected]>
AuthorDate: Thu Jul 31 17:35:37 2025 -0400

    Add secure processing to transformer handlers throughout the codebase to avoid static analysis reports.
---
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |  7 +++-
 .../src/main/java/org/apache/tika/gui/TikaGUI.java |  9 +++--
 .../java/org/apache/tika/utils/XMLReaderUtils.java | 45 ++++++++++++++++++++--
 .../apache/tika/pipes/grpc/TikaGrpcServerImpl.java | 13 +++++--
 .../apache/tika/parser/html/HtmlParserTest.java    |  3 +-
 .../tika/parser/microsoft/OutlookParserTest.java   |  7 ++--
 .../parser/microsoft/ooxml/OOXMLParserTest.java    |  3 +-
 .../org/apache/tika/sax/BoilerpipeHandlerTest.java |  3 +-
 .../tika/server/core/resource/TikaResource.java    |  5 ++-
 9 files changed, 74 insertions(+), 21 deletions(-)

diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java 
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index b23474d11..ad859c561 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -106,6 +106,7 @@ import org.apache.tika.sax.WriteOutContentHandler;
 import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler;
 import org.apache.tika.serialization.JsonMetadata;
 import org.apache.tika.serialization.JsonMetadataList;
+import org.apache.tika.utils.XMLReaderUtils;
 import org.apache.tika.xmp.XMPMetadata;
 
 /**
@@ -298,9 +299,11 @@ public class TikaCLI {
      * @throws TransformerConfigurationException if the transformer can not be 
created
      * @see <a 
href="https://issues.apache.org/jira/browse/TIKA-277">TIKA-277</a>
      */
-    private static TransformerHandler getTransformerHandler(OutputStream 
output, String method, String encoding, boolean prettyPrint) throws 
TransformerConfigurationException {
-        SAXTransformerFactory factory = (SAXTransformerFactory) 
SAXTransformerFactory.newInstance();
+    private static TransformerHandler getTransformerHandler(OutputStream 
output, String method, String encoding, boolean prettyPrint)
+            throws TransformerConfigurationException, TikaException {
+        SAXTransformerFactory factory = 
XMLReaderUtils.getSAXTransformerFactory();
         TransformerHandler handler = factory.newTransformerHandler();
+
         handler
                 .getTransformer()
                 .setOutputProperty(OutputKeys.METHOD, method);
diff --git a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java 
b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
index d314b472c..e80e6f2d1 100644
--- a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
+++ b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
@@ -91,6 +91,7 @@ import org.apache.tika.sax.TeeContentHandler;
 import org.apache.tika.sax.XHTMLContentHandler;
 import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler;
 import org.apache.tika.serialization.JsonMetadataList;
+import org.apache.tika.utils.XMLReaderUtils;
 
 /**
  * Simple Swing GUI for Apache Tika. You can drag and drop files on top
@@ -498,8 +499,8 @@ public class TikaGUI extends JFrame implements 
ActionListener, HyperlinkListener
      * @return HTML content handler
      * @throws TransformerConfigurationException if an error occurs
      */
-    private ContentHandler getHtmlHandler(Writer writer) throws 
TransformerConfigurationException {
-        SAXTransformerFactory factory = (SAXTransformerFactory) 
SAXTransformerFactory.newInstance();
+    private ContentHandler getHtmlHandler(Writer writer) throws 
TransformerConfigurationException, TikaException {
+        SAXTransformerFactory factory = 
XMLReaderUtils.getSAXTransformerFactory();
         TransformerHandler handler = factory.newTransformerHandler();
         handler
                 .getTransformer()
@@ -573,8 +574,8 @@ public class TikaGUI extends JFrame implements 
ActionListener, HyperlinkListener
         return new BoilerpipeContentHandler(writer);
     }
 
-    private ContentHandler getXmlContentHandler(Writer writer) throws 
TransformerConfigurationException {
-        SAXTransformerFactory factory = (SAXTransformerFactory) 
SAXTransformerFactory.newInstance();
+    private ContentHandler getXmlContentHandler(Writer writer) throws 
TransformerConfigurationException, TikaException {
+        SAXTransformerFactory factory = 
XMLReaderUtils.getSAXTransformerFactory();
         TransformerHandler handler = factory.newTransformerHandler();
         handler
                 .getTransformer()
diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java 
b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index 9c27cfc1d..99ba32cc2 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -42,6 +42,7 @@ import javax.xml.transform.Transformer;
 import javax.xml.transform.TransformerConfigurationException;
 import javax.xml.transform.TransformerFactory;
 import javax.xml.transform.TransformerFactoryConfigurationError;
+import javax.xml.transform.sax.SAXTransformerFactory;
 
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -373,13 +374,51 @@ public class XMLReaderUtils implements Serializable {
      * @since Apache Tika 1.17
      */
     public static Transformer getTransformer() throws TikaException {
+        TransformerFactory transformerFactory = getTransformerFactory();
         try {
+            return transformerFactory.newTransformer();
+        } catch (TransformerConfigurationException e) {
+            throw new TikaException("Transformer not available", e);
+        }
+    }
+
+    /**
+     * Returns a TransformerFactory. The factory is configured with
+     * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing} 
and other
+     * settings to prevent XXE.
+     *
+     * @return TransformerFactory
+     * @throws TikaException
+     */
+    public static TransformerFactory getTransformerFactory() throws 
TikaException {
+        try {
+
             TransformerFactory transformerFactory = 
TransformerFactory.newInstance();
             
transformerFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
             trySetTransformerAttribute(transformerFactory, 
XMLConstants.ACCESS_EXTERNAL_DTD, "");
-            trySetTransformerAttribute(transformerFactory, 
XMLConstants.ACCESS_EXTERNAL_STYLESHEET,
-                    "");
-            return transformerFactory.newTransformer();
+            trySetTransformerAttribute(transformerFactory, 
XMLConstants.ACCESS_EXTERNAL_STYLESHEET, "");
+            return transformerFactory;
+        } catch (TransformerConfigurationException | 
TransformerFactoryConfigurationError e) {
+            throw new TikaException("Transformer not available", e);
+        }
+    }
+
+    /**
+     * Returns a SAXTransformerFactory. The factory is configured with
+     * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing} 
and other
+     * settings to prevent XXE.
+     *
+     * @return SAXTransformerFactory
+     * @throws TikaException
+     */
+    public static SAXTransformerFactory getSAXTransformerFactory() throws 
TikaException {
+        try {
+
+            SAXTransformerFactory transformerFactory = (SAXTransformerFactory) 
SAXTransformerFactory.newInstance();
+            
transformerFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
+            trySetTransformerAttribute(transformerFactory, 
XMLConstants.ACCESS_EXTERNAL_DTD, "");
+            trySetTransformerAttribute(transformerFactory, 
XMLConstants.ACCESS_EXTERNAL_STYLESHEET, "");
+            return transformerFactory;
         } catch (TransformerConfigurationException | 
TransformerFactoryConfigurationError e) {
             throw new TikaException("Transformer not available", e);
         }
diff --git 
a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java 
b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java
index 4eb5f0b01..63ad8a256 100644
--- a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java
+++ b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java
@@ -67,6 +67,7 @@ import org.apache.tika.TikaGrpc;
 import org.apache.tika.config.Initializable;
 import org.apache.tika.config.Param;
 import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.pipes.FetchEmitTuple;
@@ -78,6 +79,7 @@ import org.apache.tika.pipes.fetcher.AbstractFetcher;
 import org.apache.tika.pipes.fetcher.FetchKey;
 import org.apache.tika.pipes.fetcher.config.AbstractConfig;
 import org.apache.tika.pipes.fetcher.config.FetcherConfigContainer;
+import org.apache.tika.utils.XMLReaderUtils;
 
 class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase {
     private static final Logger LOG = 
LoggerFactory.getLogger(TikaGrpcServerImpl.class);
@@ -115,11 +117,14 @@ class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase {
         expiringFetcherStore = new 
ExpiringFetcherStore(pipesConfig.getStaleFetcherTimeoutSeconds(),
                 pipesConfig.getStaleFetcherDelaySeconds());
         this.tikaConfigPath = tikaConfigPath;
-        updateTikaConfig();
+        try {
+            updateTikaConfig();
+        } catch (TikaException e) {
+            throw new TikaConfigException("Problem updating tikaConfig", e);
+        }
     }
 
-    private void updateTikaConfig()
-            throws ParserConfigurationException, IOException, SAXException, 
TransformerException {
+    private void updateTikaConfig() throws ParserConfigurationException, 
IOException, SAXException, TransformerException, TikaException {
         Document tikaConfigDoc =
                 
DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(tikaConfigPath);
 
@@ -149,7 +154,7 @@ class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase {
         FileWriter writer = new FileWriter(tikaConfigPath, 
StandardCharsets.UTF_8);
         StreamResult result = new StreamResult(writer);
 
-        TransformerFactory transformerFactory = 
TransformerFactory.newInstance();
+        TransformerFactory transformerFactory = 
XMLReaderUtils.getTransformerFactory();
         Transformer transformer = transformerFactory.newTransformer();
         transformer.transform(source, result);
     }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index 3fb542076..25f9ddeb1 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -82,6 +82,7 @@ import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.TeeContentHandler;
 import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
 
 public class HtmlParserTest extends TikaTest {
 
@@ -710,7 +711,7 @@ public class HtmlParserTest extends TikaTest {
      * @throws Exception
      */
     private ContentHandler makeHtmlTransformer(Writer writer) throws Exception 
{
-        SAXTransformerFactory factory = (SAXTransformerFactory) 
SAXTransformerFactory.newInstance();
+        SAXTransformerFactory factory = 
XMLReaderUtils.getSAXTransformerFactory();
         TransformerHandler handler = factory.newTransformerHandler();
         handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
         handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index a40e02b51..f7017379a 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -46,6 +46,7 @@ import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BasicContentHandlerFactory;
 import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
 
 /**
  * Test case for parsing Outlook files.
@@ -180,7 +181,7 @@ public class OutlookParserTest extends TikaTest {
 
         // Check the HTML version
         StringWriter sw = new StringWriter();
-        SAXTransformerFactory factory = (SAXTransformerFactory) 
SAXTransformerFactory.newInstance();
+        SAXTransformerFactory factory = 
XMLReaderUtils.getSAXTransformerFactory();
         TransformerHandler handler = factory.newTransformerHandler();
         handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
         handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
@@ -229,7 +230,7 @@ public class OutlookParserTest extends TikaTest {
 
         // Check the HTML version
         StringWriter sw = new StringWriter();
-        SAXTransformerFactory factory = (SAXTransformerFactory) 
SAXTransformerFactory.newInstance();
+        SAXTransformerFactory factory = 
XMLReaderUtils.getSAXTransformerFactory();
         TransformerHandler handler = factory.newTransformerHandler();
         handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
         handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
@@ -272,7 +273,7 @@ public class OutlookParserTest extends TikaTest {
 
         // Check the HTML version
         StringWriter sw = new StringWriter();
-        SAXTransformerFactory factory = (SAXTransformerFactory) 
SAXTransformerFactory.newInstance();
+        SAXTransformerFactory factory = 
XMLReaderUtils.getSAXTransformerFactory();
         TransformerHandler handler = factory.newTransformerHandler();
         handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
         handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index a8169e7d8..fef9ef648 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -75,6 +75,7 @@ import org.apache.tika.parser.microsoft.OfficeParser;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.parser.microsoft.OfficeParserTest;
 import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
 
 public class OOXMLParserTest extends MultiThreadedTikaTest {
 
@@ -860,7 +861,7 @@ public class OOXMLParserTest extends MultiThreadedTikaTest {
     public void testEmbeddedPDF() throws Exception {
         Metadata metadata = new Metadata();
         StringWriter sw = new StringWriter();
-        SAXTransformerFactory factory = (SAXTransformerFactory) 
SAXTransformerFactory.newInstance();
+        SAXTransformerFactory factory = 
XMLReaderUtils.getSAXTransformerFactory();
         TransformerHandler handler = factory.newTransformerHandler();
         handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
         handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/BoilerpipeHandlerTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/BoilerpipeHandlerTest.java
index e66384cf6..855ee0a4d 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/BoilerpipeHandlerTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/BoilerpipeHandlerTest.java
@@ -36,6 +36,7 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.html.JSoupParser;
 import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
 
 public class BoilerpipeHandlerTest extends TikaTest {
     /**
@@ -158,7 +159,7 @@ public class BoilerpipeHandlerTest extends TikaTest {
      * @throws Exception
      */
     private ContentHandler makeHtmlTransformer(Writer writer) throws Exception 
{
-        SAXTransformerFactory factory = (SAXTransformerFactory) 
SAXTransformerFactory.newInstance();
+        SAXTransformerFactory factory = 
XMLReaderUtils.getSAXTransformerFactory();
         TransformerHandler handler = factory.newTransformerHandler();
         handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
         handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
diff --git 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index 0be09be21..257a9e79e 100644
--- 
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++ 
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -87,6 +87,7 @@ import org.apache.tika.server.core.ServerStatus;
 import org.apache.tika.server.core.TikaServerConfig;
 import org.apache.tika.server.core.TikaServerParseException;
 import org.apache.tika.utils.ExceptionUtils;
+import org.apache.tika.utils.XMLReaderUtils;
 
 @Path("/tika")
 public class TikaResource {
@@ -635,7 +636,7 @@ public class TikaResource {
             ContentHandler content;
 
             try {
-                SAXTransformerFactory factory = (SAXTransformerFactory) 
SAXTransformerFactory.newInstance();
+                SAXTransformerFactory factory = 
XMLReaderUtils.getSAXTransformerFactory();
                 TransformerHandler handler = factory.newTransformerHandler();
                 handler
                         .getTransformer()
@@ -651,7 +652,7 @@ public class TikaResource {
                         .setOutputProperty(OutputKeys.VERSION, "1.1");
                 handler.setResult(new StreamResult(writer));
                 content = new ExpandedTitleContentHandler(handler);
-            } catch (TransformerConfigurationException e) {
+            } catch (TransformerConfigurationException | TikaException e) {
                 throw new WebApplicationException(e);
             }
 

Reply via email to