This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new cbe3c9b67 Add secure processing to transformer handlers throughout the
codebase to avoid static analysis reports. (#2292)
cbe3c9b67 is described below
commit cbe3c9b67a80237c03faaf85e9dfe189f5423a76
Author: Tim Allison <[email protected]>
AuthorDate: Fri Aug 1 10:19:34 2025 -0400
Add secure processing to transformer handlers throughout the codebase to
avoid static analysis reports. (#2292)
---
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 7 +++-
.../src/main/java/org/apache/tika/gui/TikaGUI.java | 9 +++--
.../java/org/apache/tika/utils/XMLReaderUtils.java | 45 ++++++++++++++++++++--
.../apache/tika/pipes/grpc/TikaGrpcServerImpl.java | 13 +++++--
.../apache/tika/parser/html/HtmlParserTest.java | 3 +-
.../tika/parser/microsoft/OutlookParserTest.java | 7 ++--
.../parser/microsoft/ooxml/OOXMLParserTest.java | 3 +-
.../org/apache/tika/sax/BoilerpipeHandlerTest.java | 3 +-
.../tika/server/core/resource/TikaResource.java | 5 ++-
9 files changed, 74 insertions(+), 21 deletions(-)
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index b23474d11..ad859c561 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -106,6 +106,7 @@ import org.apache.tika.sax.WriteOutContentHandler;
import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler;
import org.apache.tika.serialization.JsonMetadata;
import org.apache.tika.serialization.JsonMetadataList;
+import org.apache.tika.utils.XMLReaderUtils;
import org.apache.tika.xmp.XMPMetadata;
/**
@@ -298,9 +299,11 @@ public class TikaCLI {
* @throws TransformerConfigurationException if the transformer can not be
created
* @see <a
href="https://issues.apache.org/jira/browse/TIKA-277">TIKA-277</a>
*/
- private static TransformerHandler getTransformerHandler(OutputStream
output, String method, String encoding, boolean prettyPrint) throws
TransformerConfigurationException {
- SAXTransformerFactory factory = (SAXTransformerFactory)
SAXTransformerFactory.newInstance();
+ private static TransformerHandler getTransformerHandler(OutputStream
output, String method, String encoding, boolean prettyPrint)
+ throws TransformerConfigurationException, TikaException {
+ SAXTransformerFactory factory =
XMLReaderUtils.getSAXTransformerFactory();
TransformerHandler handler = factory.newTransformerHandler();
+
handler
.getTransformer()
.setOutputProperty(OutputKeys.METHOD, method);
diff --git a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
index d314b472c..e80e6f2d1 100644
--- a/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
+++ b/tika-app/src/main/java/org/apache/tika/gui/TikaGUI.java
@@ -91,6 +91,7 @@ import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler;
import org.apache.tika.serialization.JsonMetadataList;
+import org.apache.tika.utils.XMLReaderUtils;
/**
* Simple Swing GUI for Apache Tika. You can drag and drop files on top
@@ -498,8 +499,8 @@ public class TikaGUI extends JFrame implements
ActionListener, HyperlinkListener
* @return HTML content handler
* @throws TransformerConfigurationException if an error occurs
*/
- private ContentHandler getHtmlHandler(Writer writer) throws
TransformerConfigurationException {
- SAXTransformerFactory factory = (SAXTransformerFactory)
SAXTransformerFactory.newInstance();
+ private ContentHandler getHtmlHandler(Writer writer) throws
TransformerConfigurationException, TikaException {
+ SAXTransformerFactory factory =
XMLReaderUtils.getSAXTransformerFactory();
TransformerHandler handler = factory.newTransformerHandler();
handler
.getTransformer()
@@ -573,8 +574,8 @@ public class TikaGUI extends JFrame implements
ActionListener, HyperlinkListener
return new BoilerpipeContentHandler(writer);
}
- private ContentHandler getXmlContentHandler(Writer writer) throws
TransformerConfigurationException {
- SAXTransformerFactory factory = (SAXTransformerFactory)
SAXTransformerFactory.newInstance();
+ private ContentHandler getXmlContentHandler(Writer writer) throws
TransformerConfigurationException, TikaException {
+ SAXTransformerFactory factory =
XMLReaderUtils.getSAXTransformerFactory();
TransformerHandler handler = factory.newTransformerHandler();
handler
.getTransformer()
diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index 9c27cfc1d..99ba32cc2 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -42,6 +42,7 @@ import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
+import javax.xml.transform.sax.SAXTransformerFactory;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -373,13 +374,51 @@ public class XMLReaderUtils implements Serializable {
* @since Apache Tika 1.17
*/
public static Transformer getTransformer() throws TikaException {
+ TransformerFactory transformerFactory = getTransformerFactory();
try {
+ return transformerFactory.newTransformer();
+ } catch (TransformerConfigurationException e) {
+ throw new TikaException("Transformer not available", e);
+ }
+ }
+
+ /**
+ * Returns a TransformerFactory. The factory is configured with
+ * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}
and other
+ * settings to prevent XXE.
+ *
+ * @return TransformerFactory
+ * @throws TikaException if the factory cannot be created or configured
+ */
+ public static TransformerFactory getTransformerFactory() throws
TikaException {
+ try {
+
TransformerFactory transformerFactory =
TransformerFactory.newInstance();
transformerFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
trySetTransformerAttribute(transformerFactory,
XMLConstants.ACCESS_EXTERNAL_DTD, "");
- trySetTransformerAttribute(transformerFactory,
XMLConstants.ACCESS_EXTERNAL_STYLESHEET,
- "");
- return transformerFactory.newTransformer();
+ trySetTransformerAttribute(transformerFactory,
XMLConstants.ACCESS_EXTERNAL_STYLESHEET, "");
+ return transformerFactory;
+ } catch (TransformerConfigurationException |
TransformerFactoryConfigurationError e) {
+ throw new TikaException("Transformer not available", e);
+ }
+ }
+
+ /**
+ * Returns a SAXTransformerFactory. The factory is configured with
+ * {@link XMLConstants#FEATURE_SECURE_PROCESSING secure XML processing}
and other
+ * settings to prevent XXE.
+ *
+ * @return SAXTransformerFactory
+ * @throws TikaException if the factory cannot be created or configured
+ */
+ public static SAXTransformerFactory getSAXTransformerFactory() throws
TikaException {
+ try {
+
+ SAXTransformerFactory transformerFactory = (SAXTransformerFactory)
SAXTransformerFactory.newInstance();
+
transformerFactory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true);
+ trySetTransformerAttribute(transformerFactory,
XMLConstants.ACCESS_EXTERNAL_DTD, "");
+ trySetTransformerAttribute(transformerFactory,
XMLConstants.ACCESS_EXTERNAL_STYLESHEET, "");
+ return transformerFactory;
} catch (TransformerConfigurationException |
TransformerFactoryConfigurationError e) {
throw new TikaException("Transformer not available", e);
}
diff --git
a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java
b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java
index 4eb5f0b01..63ad8a256 100644
--- a/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java
+++ b/tika-grpc/src/main/java/org/apache/tika/pipes/grpc/TikaGrpcServerImpl.java
@@ -67,6 +67,7 @@ import org.apache.tika.TikaGrpc;
import org.apache.tika.config.Initializable;
import org.apache.tika.config.Param;
import org.apache.tika.exception.TikaConfigException;
+import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.pipes.FetchEmitTuple;
@@ -78,6 +79,7 @@ import org.apache.tika.pipes.fetcher.AbstractFetcher;
import org.apache.tika.pipes.fetcher.FetchKey;
import org.apache.tika.pipes.fetcher.config.AbstractConfig;
import org.apache.tika.pipes.fetcher.config.FetcherConfigContainer;
+import org.apache.tika.utils.XMLReaderUtils;
class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase {
private static final Logger LOG =
LoggerFactory.getLogger(TikaGrpcServerImpl.class);
@@ -115,11 +117,14 @@ class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase {
expiringFetcherStore = new
ExpiringFetcherStore(pipesConfig.getStaleFetcherTimeoutSeconds(),
pipesConfig.getStaleFetcherDelaySeconds());
this.tikaConfigPath = tikaConfigPath;
- updateTikaConfig();
+ try {
+ updateTikaConfig();
+ } catch (TikaException e) {
+ throw new TikaConfigException("Problem updating tikaConfig", e);
+ }
}
- private void updateTikaConfig()
- throws ParserConfigurationException, IOException, SAXException,
TransformerException {
+ private void updateTikaConfig() throws ParserConfigurationException,
IOException, SAXException, TransformerException, TikaException {
Document tikaConfigDoc =
DocumentBuilderFactory.newInstance().newDocumentBuilder().parse(tikaConfigPath);
@@ -149,7 +154,7 @@ class TikaGrpcServerImpl extends TikaGrpc.TikaImplBase {
FileWriter writer = new FileWriter(tikaConfigPath,
StandardCharsets.UTF_8);
StreamResult result = new StreamResult(writer);
- TransformerFactory transformerFactory =
TransformerFactory.newInstance();
+ TransformerFactory transformerFactory =
XMLReaderUtils.getTransformerFactory();
Transformer transformer = transformerFactory.newTransformer();
transformer.transform(source, result);
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
index 3fb542076..25f9ddeb1 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/test/java/org/apache/tika/parser/html/HtmlParserTest.java
@@ -82,6 +82,7 @@ import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.apache.tika.sax.WriteOutContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
public class HtmlParserTest extends TikaTest {
@@ -710,7 +711,7 @@ public class HtmlParserTest extends TikaTest {
* @throws Exception
*/
private ContentHandler makeHtmlTransformer(Writer writer) throws Exception
{
- SAXTransformerFactory factory = (SAXTransformerFactory)
SAXTransformerFactory.newInstance();
+ SAXTransformerFactory factory =
XMLReaderUtils.getSAXTransformerFactory();
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
index a40e02b51..f7017379a 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/OutlookParserTest.java
@@ -46,6 +46,7 @@ import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BasicContentHandlerFactory;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
/**
* Test case for parsing Outlook files.
@@ -180,7 +181,7 @@ public class OutlookParserTest extends TikaTest {
// Check the HTML version
StringWriter sw = new StringWriter();
- SAXTransformerFactory factory = (SAXTransformerFactory)
SAXTransformerFactory.newInstance();
+ SAXTransformerFactory factory =
XMLReaderUtils.getSAXTransformerFactory();
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
@@ -229,7 +230,7 @@ public class OutlookParserTest extends TikaTest {
// Check the HTML version
StringWriter sw = new StringWriter();
- SAXTransformerFactory factory = (SAXTransformerFactory)
SAXTransformerFactory.newInstance();
+ SAXTransformerFactory factory =
XMLReaderUtils.getSAXTransformerFactory();
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
@@ -272,7 +273,7 @@ public class OutlookParserTest extends TikaTest {
// Check the HTML version
StringWriter sw = new StringWriter();
- SAXTransformerFactory factory = (SAXTransformerFactory)
SAXTransformerFactory.newInstance();
+ SAXTransformerFactory factory =
XMLReaderUtils.getSAXTransformerFactory();
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "yes");
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
index a8169e7d8..fef9ef648 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java
@@ -75,6 +75,7 @@ import org.apache.tika.parser.microsoft.OfficeParser;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.parser.microsoft.OfficeParserTest;
import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
public class OOXMLParserTest extends MultiThreadedTikaTest {
@@ -860,7 +861,7 @@ public class OOXMLParserTest extends MultiThreadedTikaTest {
public void testEmbeddedPDF() throws Exception {
Metadata metadata = new Metadata();
StringWriter sw = new StringWriter();
- SAXTransformerFactory factory = (SAXTransformerFactory)
SAXTransformerFactory.newInstance();
+ SAXTransformerFactory factory =
XMLReaderUtils.getSAXTransformerFactory();
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "xml");
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/BoilerpipeHandlerTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/BoilerpipeHandlerTest.java
index e66384cf6..855ee0a4d 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/BoilerpipeHandlerTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/sax/BoilerpipeHandlerTest.java
@@ -36,6 +36,7 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.html.JSoupParser;
import org.apache.tika.sax.boilerpipe.BoilerpipeContentHandler;
+import org.apache.tika.utils.XMLReaderUtils;
public class BoilerpipeHandlerTest extends TikaTest {
/**
@@ -158,7 +159,7 @@ public class BoilerpipeHandlerTest extends TikaTest {
* @throws Exception
*/
private ContentHandler makeHtmlTransformer(Writer writer) throws Exception
{
- SAXTransformerFactory factory = (SAXTransformerFactory)
SAXTransformerFactory.newInstance();
+ SAXTransformerFactory factory =
XMLReaderUtils.getSAXTransformerFactory();
TransformerHandler handler = factory.newTransformerHandler();
handler.getTransformer().setOutputProperty(OutputKeys.METHOD, "html");
handler.getTransformer().setOutputProperty(OutputKeys.INDENT, "no");
diff --git
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
index 0be09be21..257a9e79e 100644
---
a/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
+++
b/tika-server/tika-server-core/src/main/java/org/apache/tika/server/core/resource/TikaResource.java
@@ -87,6 +87,7 @@ import org.apache.tika.server.core.ServerStatus;
import org.apache.tika.server.core.TikaServerConfig;
import org.apache.tika.server.core.TikaServerParseException;
import org.apache.tika.utils.ExceptionUtils;
+import org.apache.tika.utils.XMLReaderUtils;
@Path("/tika")
public class TikaResource {
@@ -635,7 +636,7 @@ public class TikaResource {
ContentHandler content;
try {
- SAXTransformerFactory factory = (SAXTransformerFactory)
SAXTransformerFactory.newInstance();
+ SAXTransformerFactory factory =
XMLReaderUtils.getSAXTransformerFactory();
TransformerHandler handler = factory.newTransformerHandler();
handler
.getTransformer()
@@ -651,7 +652,7 @@ public class TikaResource {
.setOutputProperty(OutputKeys.VERSION, "1.1");
handler.setResult(new StreamResult(writer));
content = new ExpandedTitleContentHandler(handler);
- } catch (TransformerConfigurationException e) {
+ } catch (TransformerConfigurationException | TikaException e) {
throw new WebApplicationException(e);
}