This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_3x by this push:
new ae486d7176 TIKA-4630 -- improve tracking of internal paths (#2548)
ae486d7176 is described below
commit ae486d71763d84936d1fe3f74c8b8b5f12b0ae7c
Author: Tim Allison <[email protected]>
AuthorDate: Fri Jan 23 14:55:21 2026 -0500
TIKA-4630 -- improve tracking of internal paths (#2548)
---
.../apache/tika/metadata/TikaCoreProperties.java | 9 ++++-
.../src/test/java/org/apache/tika/TikaTest.java | 4 +-
.../apache/tika/parser/odf/OpenDocumentParser.java | 2 +
.../apache/tika/parser/pkg/CompressorParser.java | 46 ++++++++++++++++------
.../org/apache/tika/parser/pkg/PackageParser.java | 4 +-
.../tika/parser/RecursiveParserWrapperTest.java | 45 +++++++++++++++++++++
.../org/apache/tika/parser/pdf/PDFParserTest.java | 2 +-
7 files changed, 95 insertions(+), 17 deletions(-)
diff --git
a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index 3d7d34d4ee..348c863f23 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -189,12 +189,19 @@ public interface TikaCoreProperties {
Property.internalTextBag(TIKA_META_PREFIX + "origResourceName");
/**
* This should be used to store the path (relative or full)
- * of the source file, including the file name,
+ * of the source/container file, including the file name,
* e.g. doc/path/to/my_pdf.pdf
* <p>
* This can also be used for a primary key within a database.
*/
Property SOURCE_PATH = Property.internalText(TIKA_META_PREFIX +
"sourcePath");
+
+ /**
+ * This records an embedded file's path, including the file name, as that
+ * path is stored within its container file. For example, a zip file may
+ * include an msg with this path: /my-emails/important/this.msg
+ */
+ Property INTERNAL_PATH = Property.internalText(TIKA_META_PREFIX +
"internalPath");
+
/**
* This is currently used to identify Content-Type that may be
* included within a document, such as in html documents
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java
b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 817f907d94..a039d04e6f 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -490,8 +490,10 @@ public abstract class TikaTest {
RecursiveParserWrapperHandler handler = new
RecursiveParserWrapperHandler(
new
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+ Metadata metadata = new Metadata();
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
FilenameUtils.getName(filePath));
try (InputStream is = getResourceAsStream("/test-documents/" +
filePath)) {
- wrapper.parse(is, handler, new Metadata(), context);
+ wrapper.parse(is, handler, metadata, context);
}
return handler.getMetadataList();
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index 1ed7f76492..78ecd8916f 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -251,6 +251,7 @@ public class OpenDocumentParser implements Parser {
if (embeddedName.contains("Thumbnails/") ||
embeddedName.contains("Pictures/")) {
Metadata embeddedMetadata = new Metadata();
+ embeddedMetadata.set(TikaCoreProperties.INTERNAL_PATH,
embeddedName);
TikaInputStream stream = TikaInputStream.get(zip);
embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
entry.getName());
@@ -296,6 +297,7 @@ public class OpenDocumentParser implements Parser {
Metadata embeddedMetadata = new Metadata();
embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+ embeddedMetadata.set(TikaCoreProperties.INTERNAL_PATH, embeddedName);
handler = new OpenDocumentMacroHandler(handler, context);
XMLReaderUtils.parseSAX(CloseShieldInputStream.wrap(is),
new EmbeddedContentHandler(handler), context);
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index c9bfdaca78..aeebd8881e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -49,6 +49,7 @@ import
org.apache.commons.compress.compressors.CompressorStreamFactory;
import
org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
import
org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipParameters;
import org.apache.commons.compress.compressors.gzip.GzipUtils;
import org.apache.commons.compress.compressors.lzma.LZMACompressorInputStream;
import
org.apache.commons.compress.compressors.pack200.Pack200CompressorInputStream;
@@ -71,6 +72,7 @@ import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
/**
* Parser for various compression formats.
@@ -208,21 +210,12 @@ public class CompressorParser implements Parser {
XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
xhtml.startDocument();
-
try {
Metadata entrydata = new Metadata();
- String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
- if (name != null) {
- if (name.endsWith(".tbz") || name.endsWith(".tbz2")) {
- name = name.substring(0, name.lastIndexOf(".")) + ".tar";
- } else if (name.endsWith(".bz") || name.endsWith(".bz2") ||
name.endsWith(".xz") ||
- name.endsWith(".zlib") || name.endsWith(".pack") ||
name.endsWith(".br")) {
- name = name.substring(0, name.lastIndexOf("."));
- } else if (name.length() > 0) {
- name = GzipUtils.getUncompressedFileName(name);
- }
- entrydata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
+ if (cis instanceof GzipCompressorInputStream) {
+ extractGzipMetadata((GzipCompressorInputStream) cis,
entrydata);
}
+ setName(metadata, entrydata);
// Use the delegate parser to parse the compressed document
EmbeddedDocumentExtractor extractor =
@@ -237,6 +230,35 @@ public class CompressorParser implements Parser {
xhtml.endDocument();
}
+ /**
+ * Copies the file name reported by the gzip stream's {@link GzipParameters}
+ * into {@link TikaCoreProperties#INTERNAL_PATH} on the given metadata.
+ * Nothing is set when the stream has no parameters or the name is blank.
+ *
+ * @param gzcis gzip stream whose parameters are read
+ * @param metadata metadata object that receives INTERNAL_PATH
+ */
+ private void extractGzipMetadata(GzipCompressorInputStream gzcis, Metadata
metadata) {
+ GzipParameters gzipParameters = gzcis.getMetaData();
+ if (gzipParameters == null) {
+ return;
+ }
+ String name = gzipParameters.getFileName();
+ if (!StringUtils.isBlank(name)) {
+ metadata.set(TikaCoreProperties.INTERNAL_PATH, name);
+ }
+ //TODO: modification, OS, comment
+ }
+
+    /**
+     * Derives the resource name of the decompressed entry from the container's
+     * resource name and stores it on the entry's metadata.
+     * <p>
+     * Tarball shorthand suffixes (.tgz, .tbz, .tbz2) are replaced with .tar.
+     * Plain compression suffixes (.gz, .bz, .bz2, .xz, .zlib, .pack, .br) are
+     * stripped. Any other name is passed to
+     * {@link GzipUtils#getUncompressedFileName(String)}.
+     * If the parent's resource name is missing or blank, the entry metadata
+     * is left untouched.
+     *
+     * @param parentMetadata metadata of the compressed container; its
+     *                       RESOURCE_NAME_KEY is read
+     * @param metadata       metadata of the embedded entry; receives the derived
+     *                       RESOURCE_NAME_KEY
+     */
+    private void setName(Metadata parentMetadata, Metadata metadata) {
+        String name = parentMetadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+        //if parent's name is blank stop now
+        if (StringUtils.isBlank(name)) {
+            return;
+        }
+        if (name.endsWith(".tgz") || name.endsWith(".tbz") || name.endsWith(".tbz2")) {
+            name = name.substring(0, name.lastIndexOf(".")) + ".tar";
+        } else if (name.endsWith(".gz") || name.endsWith(".bz") || name.endsWith(".bz2") ||
+                name.endsWith(".xz") || name.endsWith(".zlib") || name.endsWith(".pack") ||
+                name.endsWith(".br")) {
+            //every suffix tested above starts with a dot, so lastIndexOf(".") is never -1;
+            //matching on a bare "gz" would hit names like "loggz" and make substring throw
+            name = name.substring(0, name.lastIndexOf("."));
+        } else {
+            //name is known to be non-blank here; let commons-compress handle the
+            //remaining gzip-style names (e.g. .svgz, -gz)
+            name = GzipUtils.getUncompressedFileName(name);
+        }
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
+    }
+
/**
* @param metadata
* @return CompressorStream name based on the content-type value
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index bfd2a11682..d7599bf29e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -221,13 +221,13 @@ public class PackageParser extends
AbstractEncodingDetectorParser {
if (name != null && name.length() > 0) {
name = name.replace("\\", "/");
entrydata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
+ entrydata.set(TikaCoreProperties.INTERNAL_PATH, name);
+ entrydata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, name);
AttributesImpl attributes = new AttributesImpl();
attributes.addAttribute("", "class", "class", "CDATA", "embedded");
attributes.addAttribute("", "id", "id", "CDATA", name);
xhtml.startElement("div", attributes);
xhtml.endElement("div");
-
- entrydata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, name);
}
return entrydata;
}
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 17b18646a9..69a0eacccb 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -24,9 +24,11 @@ import static org.junit.jupiter.api.Assertions.fail;
import java.io.IOException;
import java.io.InputStream;
+import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
+import java.util.stream.Collectors;
import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.ClosedInputStream;
@@ -129,6 +131,49 @@ public class RecursiveParserWrapperTest extends TikaTest {
List<Metadata> list = handler.getMetadataList();
assertEquals(12, list.size());
}
+
+ /**
+ * Recursively parses test-documents.tgz and checks, for every metadata
+ * object in the order returned, the INTERNAL_PATH and the
+ * EMBEDDED_RESOURCE_PATH values. The first entry is expected to carry
+ * neither value (null); the last expected entry is test-documents.tar.
+ */
+ @Test
+ public void testTarball() throws Exception {
+ List<Metadata> list = getRecursiveMetadata("test-documents.tgz");
+ // INTERNAL_PATH of each metadata object, in list order
+ List<String> actualInternalPaths =
+ list.stream()
+ .map(m -> m.get(TikaCoreProperties.INTERNAL_PATH))
+ .collect(Collectors.toList());
+
+ List<String> expectedInternalPaths = Arrays.asList(null,
+ "test-documents/testEXCEL.xls",
+ "test-documents/testHTML.html",
+ "Thumbnails/thumbnail.png",
+ "Thumbnails/thumbnail.pdf",
+ "test-documents/testOpenOffice2.odt",
+ "test-documents/testPDF.pdf",
+ "test-documents/testPPT.ppt",
+ "test-documents/testRTF.rtf",
+ "test-documents/testTXT.txt",
+ "test-documents/testWORD.doc",
+ "test-documents/testXML.xml",
+ "test-documents.tar");
+ assertEquals(expectedInternalPaths, actualInternalPaths);
+
+ // EMBEDDED_RESOURCE_PATH of each metadata object, in the same order
+ List<String> actualEmbeddedPaths =
+ list.stream()
+ .map(m -> m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH))
+ .collect(Collectors.toList());
+ assertEquals(Arrays.asList(null,
+ "/test-documents.tar/testEXCEL.xls",
+ "/test-documents.tar/testHTML.html",
+ "/test-documents.tar/testOpenOffice2.odt/thumbnail.png",
+ "/test-documents.tar/testOpenOffice2.odt/thumbnail.pdf",
+ "/test-documents.tar/testOpenOffice2.odt",
+ "/test-documents.tar/testPDF.pdf",
+ "/test-documents.tar/testPPT.ppt",
+ "/test-documents.tar/testRTF.rtf",
+ "/test-documents.tar/testTXT.txt",
+ "/test-documents.tar/testWORD.doc",
+ "/test-documents.tar/testXML.xml",
+ "/test-documents.tar"), actualEmbeddedPaths);
+ }
+
@Test
public void testCharLimitNoThrowOnWriteLimit() throws Exception {
ParseContext context = new ParseContext();
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 57feed96d6..baf0498c3c 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -364,7 +364,7 @@ public class PDFParserTest extends TikaTest {
assertEquals("91", metadatas.get(1).get("height"));
assertEquals("352", metadatas.get(1).get("width"));
- assertNull(metadatas.get(0).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+ assertEquals("testPDF_JBIG2.pdf",
metadatas.get(0).get(TikaCoreProperties.RESOURCE_NAME_KEY));
assertEquals("image0.jb2",
metadatas.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
assertEquals(MediaType.image("x-jbig2").toString(),
metadatas.get(1).get(Metadata.CONTENT_TYPE));