This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4688-fix-truncated-ooxml
in repository https://gitbox.apache.org/repos/asf/tika.git
commit ca8a9eb633740d8a9bee2f94e6dfa86a48f548a7 Author: tallison <[email protected]> AuthorDate: Wed Mar 11 08:57:49 2026 -0400 TIKA-4688 -- fix truncated ooxml regression --- .../parser/microsoft/ooxml/OOXMLExtractorFactory.java | 11 ++++++++++- .../tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java | 16 ++++++++++++++++ .../org/apache/tika/pipes/core/server/ParseHandler.java | 4 ++++ 3 files changed, 30 insertions(+), 1 deletion(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java index 64ca412311..8558f37c21 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/OOXMLExtractorFactory.java @@ -58,6 +58,7 @@ import org.apache.tika.parser.microsoft.ooxml.xps.XPSTextExtractor; import org.apache.tika.parser.microsoft.ooxml.xslf.XSLFEventBasedPowerPointExtractor; import org.apache.tika.parser.microsoft.ooxml.xwpf.XWPFEventBasedWordExtractor; + /** * Figures out the correct {@link OOXMLExtractor} for the supplied document and * returns it. @@ -93,7 +94,15 @@ public class OOXMLExtractorFactory { if (tis.getOpenContainer() instanceof OPCPackageWrapper) { pkg = ((OPCPackageWrapper) tis.getOpenContainer()).getOPCPackage(); } else { - pkg = OPCPackage.open(tis.getPath().toString(), PackageAccess.READ); + // POI 5.x can throw InvalidOperationException (a RuntimeException + // extending OpenXML4JRuntimeException) for truncated/corrupt zip files. 
+ // The detector should have salvaged if needed, but catch broadly here + // as a safety net. + try { + pkg = OPCPackage.open(tis.getPath().toString(), PackageAccess.READ); + } catch (RuntimeException e) { + throw new TikaException("Error opening OOXML file", e); + } tis.setOpenContainer(new OPCPackageWrapper(pkg)); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java index 179f3106f4..0a82cfe0df 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/TruncatedOOXMLTest.java @@ -56,6 +56,22 @@ public class TruncatedOOXMLTest extends TikaTest { } + @Test + public void testWordTruncNoCentralDirectory() throws Exception { + // Truncated enough that the zip central directory is missing, + // but [Content_Types].xml and document.xml are intact. + // This exercises the ZipSalvager + OPCPackage fallback path. 
+ List<Metadata> metadataList = + getRecursiveMetadata(truncate("testWORD_various.docx", 13500), true); + assertEquals(1, metadataList.size()); + Metadata metadata = metadataList.get(0); + String content = metadata.get(TikaCoreProperties.TIKA_CONTENT); + assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document", + metadata.get(Metadata.CONTENT_TYPE)); + assertContains("This is the header", content); + assertContains("Suddenly some Japanese", content); + } + @Test public void testTruncation() throws Exception { diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java index 79d233ba4e..c97c1311df 100644 --- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java +++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java @@ -44,6 +44,7 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AutoDetectParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.ParseRecord; +import org.apache.tika.parser.ParsingIntent; import org.apache.tika.parser.RecursiveParserWrapper; import org.apache.tika.pipes.api.FetchEmitTuple; import org.apache.tika.pipes.api.ParseMode; @@ -143,6 +144,9 @@ class ParseHandler { LOG.warn("problem digesting: " + t.getId(), e); } } + // Signal to detectors that parsing will follow, so they can prepare + // resources (e.g., ZipSalvager for truncated zips) + parseContext.set(ParsingIntent.class, ParsingIntent.WILL_PARSE); try { MediaType mt = detector.detect(tis, metadata, parseContext); metadata.set(Metadata.CONTENT_TYPE, mt.toString());
