This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4630-on-main
in repository https://gitbox.apache.org/repos/asf/tika.git

commit d7781ecf333f6ed9eba3327a2a60275bf28d3de9
Author: Tim Allison <[email protected]>
AuthorDate: Fri Jan 23 14:55:21 2026 -0500

    TIKA-4630 -- improve tracking of internal paths (#2548)
---
 .../apache/tika/metadata/TikaCoreProperties.java   |  9 ++++-
 .../src/test/java/org/apache/tika/TikaTest.java    |  4 +-
 .../apache/tika/parser/odf/OpenDocumentParser.java |  2 +
 .../apache/tika/parser/pkg/CompressorParser.java   | 46 ++++++++++++++++------
 .../org/apache/tika/parser/pkg/PackageParser.java  |  3 +-
 .../tika/parser/RecursiveParserWrapperTest.java    | 45 +++++++++++++++++++++
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |  2 +-
 7 files changed, 94 insertions(+), 17 deletions(-)

diff --git 
a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java 
b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index b89323fc11..cc712543b5 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -204,12 +204,19 @@ public interface TikaCoreProperties {
             Property.internalTextBag(TIKA_META_PREFIX + "origResourceName");
     /**
      * This should be used to store the path (relative or full)
-     * of the source file, including the file name,
+     * of the source/container file, including the file name,
      * e.g. doc/path/to/my_pdf.pdf
      * <p>
      * This can also be used for a primary key within a database.
      */
     Property SOURCE_PATH = Property.internalText(TIKA_META_PREFIX + 
"sourcePath");
+
+    /**
+     * This records the path of an embedded file, including the file name,
+     * as it is stored within the container file. For example, a zip file
+     * may include an msg with this path: /my-emails/important/this.msg
+     */
+    Property INTERNAL_PATH = Property.internalText(TIKA_META_PREFIX + 
"internalPath");
+
     /**
      * This is currently used to identify Content-Type that may be
      * included within a document, such as in html documents
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java 
b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index 592447e2e0..a8e37a85b2 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -496,8 +496,10 @@ public abstract class TikaTest {
 
         RecursiveParserWrapperHandler handler = new 
RecursiveParserWrapperHandler(
                 new 
BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1));
+        Metadata metadata = new Metadata();
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, 
FilenameUtils.getName(filePath));
         try (TikaInputStream tis = getResourceAsStream("/test-documents/" + 
filePath)) {
-            wrapper.parse(tis, handler, new Metadata(), context);
+            wrapper.parse(tis, handler, metadata, context);
         }
         return handler.getMetadataList();
     }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
index e6f192ee0f..ed1ca792c8 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java
@@ -266,6 +266,7 @@ public class OpenDocumentParser implements Parser {
             if (embeddedName.contains("Thumbnails/") || 
embeddedName.contains("Pictures/")) {
 
                 Metadata embeddedMetadata = new Metadata();
+                embeddedMetadata.set(TikaCoreProperties.INTERNAL_PATH, 
embeddedName);
 
                     embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, 
entry.getName());
                     if (embeddedName.startsWith("Thumbnails/")) {
@@ -310,6 +311,7 @@ public class OpenDocumentParser implements Parser {
         Metadata embeddedMetadata = new Metadata();
         embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                 TikaCoreProperties.EmbeddedResourceType.MACRO.toString());
+        embeddedMetadata.set(TikaCoreProperties.INTERNAL_PATH, embeddedName);
         handler = new OpenDocumentMacroHandler(handler, context);
         try {
             tisZip.setCloseShield();
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index 03be853bb5..e36be1c397 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -47,6 +47,7 @@ import 
org.apache.commons.compress.compressors.CompressorStreamFactory;
 import 
org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream;
 import 
org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream;
 import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipParameters;
 import org.apache.commons.compress.compressors.gzip.GzipUtils;
 import org.apache.commons.compress.compressors.lzma.LZMACompressorInputStream;
 import 
org.apache.commons.compress.compressors.pack200.Pack200CompressorInputStream;
@@ -71,6 +72,7 @@ import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.XHTMLContentHandler;
+import org.apache.tika.utils.StringUtils;
 
 /**
  * Parser for various compression formats.
@@ -236,21 +238,12 @@ public class CompressorParser implements Parser {
 
         XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
         xhtml.startDocument();
-
         try {
             Metadata entrydata = new Metadata();
-            String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
-            if (name != null) {
-                if (name.endsWith(".tbz") || name.endsWith(".tbz2")) {
-                    name = name.substring(0, name.lastIndexOf(".")) + ".tar";
-                } else if (name.endsWith(".bz") || name.endsWith(".bz2") || 
name.endsWith(".xz") ||
-                        name.endsWith(".zlib") || name.endsWith(".pack") || 
name.endsWith(".br")) {
-                    name = name.substring(0, name.lastIndexOf("."));
-                } else if (name.length() > 0) {
-                    name = GzipUtils.getUncompressedFileName(name);
-                }
-                entrydata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
+            if (cis instanceof GzipCompressorInputStream) {
+                extractGzipMetadata((GzipCompressorInputStream) cis, 
entrydata);
             }
+            setName(metadata, entrydata);
 
             // Use the delegate parser to parse the compressed document
             EmbeddedDocumentExtractor extractor =
@@ -268,6 +261,35 @@ public class CompressorParser implements Parser {
         xhtml.endDocument();
     }
 
+    /**
+     * Reads the {@link GzipParameters} exposed by the gzip stream and, when they
+     * carry a non-blank file name, stores that name as INTERNAL_PATH.
+     *
+     * @param gzcis gzip stream whose parameters are queried
+     * @param metadata metadata object that receives INTERNAL_PATH
+     */
+    private void extractGzipMetadata(GzipCompressorInputStream gzcis, Metadata metadata) {
+        GzipParameters header = gzcis.getMetaData();
+        if (header != null) {
+            String storedName = header.getFileName();
+            if (!StringUtils.isBlank(storedName)) {
+                metadata.set(TikaCoreProperties.INTERNAL_PATH, storedName);
+            }
+        }
+        //TODO: modification, OS, comment
+    }
+
+    /**
+     * Derives the embedded entry's resource name from the parent's
+     * RESOURCE_NAME_KEY by removing or replacing the compression suffix.
+     * Does nothing if the parent's name is blank.
+     *
+     * @param parentMetadata metadata of the compressed container; only read
+     * @param metadata metadata of the decompressed entry; receives the derived name
+     */
+    private void setName(Metadata parentMetadata, Metadata metadata) {
+        String name = parentMetadata.get(TikaCoreProperties.RESOURCE_NAME_KEY);
+        //if parent's name is blank stop now
+        if (StringUtils.isBlank(name)) {
+            return;
+        }
+        if (name.endsWith(".tgz") || name.endsWith(".tbz") || name.endsWith(".tbz2")) {
+            //compressed tarball shorthand: swap the suffix for .tar
+            name = name.substring(0, name.lastIndexOf(".")) + ".tar";
+        } else if (name.endsWith(".bz") || name.endsWith(".gz") || name.endsWith(".bz2") ||
+                name.endsWith(".xz") || name.endsWith(".zlib") || name.endsWith(".pack") ||
+                name.endsWith(".br")) {
+            //every suffix here includes the dot, so lastIndexOf(".") is never -1;
+            //a bare "gz" check also matched names such as "foo-gz" and made
+            //substring throw
+            name = name.substring(0, name.lastIndexOf("."));
+        } else {
+            //name is known to be non-blank here; delegate any other suffix to GzipUtils
+            name = GzipUtils.getUncompressedFileName(name);
+        }
+        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
+    }
+
     /**
      * @param metadata
      * @return CompressorStream name based on the content-type value
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
index 5b8aecbc0f..26970002be 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java
@@ -217,13 +217,12 @@ public class PackageParser extends 
AbstractEncodingDetectorParser {
         if (name != null && name.length() > 0) {
             name = name.replace("\\", "/");
             entrydata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name);
+            entrydata.set(TikaCoreProperties.INTERNAL_PATH, name);
             AttributesImpl attributes = new AttributesImpl();
             attributes.addAttribute("", "class", "class", "CDATA", "embedded");
             attributes.addAttribute("", "id", "id", "CDATA", name);
             xhtml.startElement("div", attributes);
             xhtml.endElement("div");
-
-            entrydata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, name);
         }
         return entrydata;
     }
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 5fada5528d..30dab1331e 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -23,9 +23,11 @@ import static org.junit.jupiter.api.Assertions.assertTrue;
 
 import java.io.IOException;
 import java.io.InputStream;
+import java.util.Arrays;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
+import java.util.stream.Collectors;
 
 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.ClosedInputStream;
@@ -127,6 +129,49 @@ public class RecursiveParserWrapperTest extends TikaTest {
         List<Metadata> list = handler.getMetadataList();
         assertEquals(12, list.size());
     }
+
+    /**
+     * Recursively parses test-documents.tgz and checks, in order, the
+     * INTERNAL_PATH and EMBEDDED_RESOURCE_PATH reported for each metadata object.
+     */
+    @Test
+    public void testTarball() throws Exception {
+        // first entry is the container itself: neither path is set on it
+        List<String> expectedInternalPaths = Arrays.asList(null,
+                "test-documents/testEXCEL.xls",
+                "test-documents/testHTML.html",
+                "Thumbnails/thumbnail.png",
+                "Thumbnails/thumbnail.pdf",
+                "test-documents/testOpenOffice2.odt",
+                "test-documents/testPDF.pdf",
+                "test-documents/testPPT.ppt",
+                "test-documents/testRTF.rtf",
+                "test-documents/testTXT.txt",
+                "test-documents/testWORD.doc",
+                "test-documents/testXML.xml",
+                "test-documents.tar");
+        List<String> expectedEmbeddedPaths = Arrays.asList(null,
+                "/test-documents.tar/testEXCEL.xls",
+                "/test-documents.tar/testHTML.html",
+                "/test-documents.tar/testOpenOffice2.odt/thumbnail.png",
+                "/test-documents.tar/testOpenOffice2.odt/thumbnail.pdf",
+                "/test-documents.tar/testOpenOffice2.odt",
+                "/test-documents.tar/testPDF.pdf",
+                "/test-documents.tar/testPPT.ppt",
+                "/test-documents.tar/testRTF.rtf",
+                "/test-documents.tar/testTXT.txt",
+                "/test-documents.tar/testWORD.doc",
+                "/test-documents.tar/testXML.xml",
+                "/test-documents.tar");
+
+        List<Metadata> metadataList = getRecursiveMetadata("test-documents.tgz");
+
+        List<String> internalPaths = metadataList.stream()
+                .map(md -> md.get(TikaCoreProperties.INTERNAL_PATH))
+                .collect(Collectors.toList());
+        assertEquals(expectedInternalPaths, internalPaths);
+
+        List<String> embeddedPaths = metadataList.stream()
+                .map(md -> md.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH))
+                .collect(Collectors.toList());
+        assertEquals(expectedEmbeddedPaths, embeddedPaths);
+    }
+
     @Test
     public void testCharLimitNoThrowOnWriteLimit() throws Exception {
         ParseContext context = new ParseContext();
diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index cda95cc56b..11797a499e 100644
--- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -368,7 +368,7 @@ public class PDFParserTest extends TikaTest {
         assertEquals("91", 
metadatas.get(1).get(ImageMetadataExtractor.UNKNOWN_IMG_NS + "height"));
         assertEquals("352", 
metadatas.get(1).get(ImageMetadataExtractor.UNKNOWN_IMG_NS + "width"));
 
-        assertNull(metadatas.get(0).get(TikaCoreProperties.RESOURCE_NAME_KEY));
+        assertEquals("testPDF_JBIG2.pdf", 
metadatas.get(0).get(TikaCoreProperties.RESOURCE_NAME_KEY));
         assertEquals("image0.jb2", 
metadatas.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY));
         assertEquals(MediaType.image("x-jbig2").toString(),
                 metadatas.get(1).get(Metadata.CONTENT_TYPE));

Reply via email to