This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new e5154cb2d TIKA-4459 -- force stream to zip file to handle encrypted
od* documents correctly (#2291)
e5154cb2d is described below
commit e5154cb2d87a0697ccb79014d1a6cde38b25a596
Author: Tim Allison <[email protected]>
AuthorDate: Thu Jul 31 17:37:30 2025 -0400
TIKA-4459 -- force stream to zip file to handle encrypted od* documents
correctly (#2291)
---
.../apache/tika/parser/odf/OpenDocumentParser.java | 65 +++++-----------------
.../org/apache/tika/parser/odf/ODFParserTest.java | 7 +++
2 files changed, 20 insertions(+), 52 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index 222819a6f..1ed7f7649 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -20,16 +20,13 @@ import static java.nio.charset.StandardCharsets.UTF_8;
import java.io.IOException;
import java.io.InputStream;
-import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Enumeration;
import java.util.HashSet;
-import java.util.List;
import java.util.Set;
import java.util.zip.ZipEntry;
import java.util.zip.ZipFile;
-import java.util.zip.ZipInputStream;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
@@ -40,7 +37,6 @@ import org.xml.sax.helpers.DefaultHandler;
import org.apache.tika.config.Field;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.exception.WriteLimitReachedException;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
@@ -134,21 +130,21 @@ public class OpenDocumentParser implements Parser {
// Open the Zip stream
// Use a File if we can, and an already open zip is even better
ZipFile zipFile = null;
- ZipInputStream zipStream = null;
+ TikaInputStream tmpTis = null;
if (stream instanceof TikaInputStream) {
TikaInputStream tis = (TikaInputStream) stream;
Object container = ((TikaInputStream) stream).getOpenContainer();
if (container instanceof ZipFile) {
zipFile = (ZipFile) container;
- } else if (tis.hasFile()) {
- zipFile = new ZipFile(tis.getFile());
} else {
- zipStream = new ZipInputStream(stream);
+ zipFile = new ZipFile(tis.getFile());
+ tis.setOpenContainer(zipFile);
}
} else {
- zipStream = new ZipInputStream(stream);
+ tmpTis = TikaInputStream.get(stream);
+ tmpTis.setOpenContainer(new ZipFile(tmpTis.getFile()));
+ zipFile = (ZipFile) tmpTis.getOpenContainer();
}
-
// Prepare to handle the content
XHTMLContentHandler xhtml = new XHTMLContentHandler(baseHandler,
metadata);
xhtml.startDocument();
@@ -157,19 +153,13 @@ public class OpenDocumentParser implements Parser {
EndDocumentShieldingContentHandler handler = new
EndDocumentShieldingContentHandler(xhtml);
try {
- if (zipFile != null) {
- try {
- handleZipFile(zipFile, metadata, context, handler,
embeddedDocumentUtil);
- } finally {
- //Do we want to close silently == catch an exception here?
- zipFile.close();
- }
- } else {
- try {
- handleZipStream(zipStream, metadata, context, handler,
embeddedDocumentUtil);
- } finally {
- //Do we want to close silently == catch an exception here?
- zipStream.close();
+ try {
+ handleZipFile(zipFile, metadata, context, handler,
embeddedDocumentUtil);
+ } finally {
+ //Do we want to close silently == catch an exception here?
+ if (tmpTis != null) {
+ //tmpTis handles closing of the open zip container
+ tmpTis.close();
}
}
} catch (SAXException e) {
@@ -194,35 +184,6 @@ public class OpenDocumentParser implements Parser {
return extractMacros;
}
- private void handleZipStream(ZipInputStream zipStream, Metadata metadata,
ParseContext context,
- EndDocumentShieldingContentHandler handler,
- EmbeddedDocumentUtil embeddedDocumentUtil)
- throws IOException, TikaException, SAXException {
- ZipEntry entry = zipStream.getNextEntry();
- if (entry == null) {
- throw new IOException("No entries found in ZipInputStream");
- }
- List<SAXException> exceptions = new ArrayList<>();
- do {
- try {
- handleZipEntry(entry, zipStream, metadata, context, handler,
- embeddedDocumentUtil);
- } catch (SAXException e) {
- WriteLimitReachedException.throwIfWriteLimitReached(e);
- if (e.getCause() instanceof EncryptedDocumentException) {
- throw (EncryptedDocumentException)e.getCause();
- } else {
- exceptions.add(e);
- }
- }
- entry = zipStream.getNextEntry();
- } while (entry != null);
-
- if (exceptions.size() > 0) {
- throw exceptions.get(0);
- }
- }
-
private void handleZipFile(ZipFile zipFile, Metadata metadata,
ParseContext context,
EndDocumentShieldingContentHandler handler,
EmbeddedDocumentUtil embeddedDocumentUtil)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
index da7028a91..72d7d683a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/test/java/org/apache/tika/parser/odf/ODFParserTest.java
@@ -25,6 +25,7 @@ import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Arrays;
@@ -415,6 +416,12 @@ public class ODFParserTest extends TikaTest {
getRecursiveMetadata(p, false);
});
+ assertThrows(EncryptedDocumentException.class, () -> {
+ try (InputStream is = Files.newInputStream(p)) {
+ getRecursiveMetadata(is, false);
+ }
+ });
+
List<Metadata> metadataList = getRecursiveMetadata(p, true);
assertEquals("true",
metadataList.get(0).get(TikaCoreProperties.IS_ENCRYPTED));
}