This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4459 in repository https://gitbox.apache.org/repos/asf/tika.git
commit e517bcb362acf580ab79c894f715cc102c215fee Author: tallison <[email protected]> AuthorDate: Wed Jul 30 14:52:05 2025 -0400 TIKA-4459 -- force stream to zip file to handle encrypted od* documents correctly --- .../apache/tika/parser/odf/OpenDocumentParser.java | 65 +++++----------------- .../org/apache/tika/parser/odf/ODFParserTest.java | 7 +++ 2 files changed, 20 insertions(+), 52 deletions(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java index 222819a6f..1ed7f7649 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java @@ -20,16 +20,13 @@ import static java.nio.charset.StandardCharsets.UTF_8; import java.io.IOException; import java.io.InputStream; -import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.Enumeration; import java.util.HashSet; -import java.util.List; import java.util.Set; import java.util.zip.ZipEntry; import java.util.zip.ZipFile; -import java.util.zip.ZipInputStream; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.CloseShieldInputStream; @@ -40,7 +37,6 @@ import org.xml.sax.helpers.DefaultHandler; import org.apache.tika.config.Field; import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.exception.TikaException; -import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; import 
org.apache.tika.metadata.Metadata; @@ -134,21 +130,21 @@ public class OpenDocumentParser implements Parser { // Open the Zip stream // Use a File if we can, and an already open zip is even better ZipFile zipFile = null; - ZipInputStream zipStream = null; + TikaInputStream tmpTis = null; if (stream instanceof TikaInputStream) { TikaInputStream tis = (TikaInputStream) stream; Object container = ((TikaInputStream) stream).getOpenContainer(); if (container instanceof ZipFile) { zipFile = (ZipFile) container; - } else if (tis.hasFile()) { - zipFile = new ZipFile(tis.getFile()); } else { - zipStream = new ZipInputStream(stream); + zipFile = new ZipFile(tis.getFile()); + tis.setOpenContainer(zipFile); } } else { - zipStream = new ZipInputStream(stream); + tmpTis = TikaInputStream.get(stream); + tmpTis.setOpenContainer(new ZipFile(tmpTis.getFile())); + zipFile = (ZipFile) tmpTis.getOpenContainer(); } - // Prepare to handle the content XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler, metadata); xhtml.startDocument(); @@ -157,19 +153,13 @@ public class OpenDocumentParser implements Parser { EndDocumentShieldingContentHandler handler = new EndDocumentShieldingContentHandler(xhtml); try { - if (zipFile != null) { - try { - handleZipFile(zipFile, metadata, context, handler, embeddedDocumentUtil); - } finally { - //Do we want to close silently == catch an exception here? - zipFile.close(); - } - } else { - try { - handleZipStream(zipStream, metadata, context, handler, embeddedDocumentUtil); - } finally { - //Do we want to close silently == catch an exception here? - zipStream.close(); + try { + handleZipFile(zipFile, metadata, context, handler, embeddedDocumentUtil); + } finally { + //Do we want to close silently == catch an exception here? 
+ if (tmpTis != null) { + //tmpTis handles closing of the open zip container + tmpTis.close(); } } } catch (SAXException e) { @@ -194,35 +184,6 @@ public class OpenDocumentParser implements Parser { return extractMacros; } - private void handleZipStream(ZipInputStream zipStream, Metadata metadata, ParseContext context, - EndDocumentShieldingContentHandler handler, - EmbeddedDocumentUtil embeddedDocumentUtil) - throws IOException, TikaException, SAXException { - ZipEntry entry = zipStream.getNextEntry(); - if (entry == null) { - throw new IOException("No entries found in ZipInputStream"); - } - List<SAXException> exceptions = new ArrayList<>(); - do { - try { - handleZipEntry(entry, zipStream, metadata, context, handler, - embeddedDocumentUtil); - } catch (SAXException e) { - WriteLimitReachedException.throwIfWriteLimitReached(e); - if (e.getCause() instanceof EncryptedDocumentException) { - throw (EncryptedDocumentException)e.getCause(); - } else { - exceptions.add(e); - } - } - entry = zipStream.getNextEntry(); - } while (entry != null); - - if (exceptions.size() > 0) { - throw exceptions.get(0); - } - } - private void handleZipFile(ZipFile zipFile, Metadata metadata, ParseContext context, EndDocumentShieldingContentHandler handler, EmbeddedDocumentUtil embeddedDocumentUtil) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java index da7028a91..72d7d683a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java @@ -25,6 +25,7 @@ 
import java.io.ByteArrayInputStream; import java.io.IOException; import java.io.InputStream; import java.nio.charset.StandardCharsets; +import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.Paths; import java.util.Arrays; @@ -415,6 +416,12 @@ public class ODFParserTest extends TikaTest { getRecursiveMetadata(p, false); }); + assertThrows(EncryptedDocumentException.class, () -> { + try (InputStream is = Files.newInputStream(p)) { + getRecursiveMetadata(is, false); + } + }); + List<Metadata> metadataList = getRecursiveMetadata(p, true); assertEquals("true", metadataList.get(0).get(TikaCoreProperties.IS_ENCRYPTED)); }
