This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_3x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 259043ced9347eff7f19901d70f39e643393eb49
Author: iachimoe <[email protected]>
AuthorDate: Wed Apr 8 10:03:03 2026 -0400

    TIKA-4705 -- resourceName of nested tarball should not contain the parent
    directories of its parent gzip file, plus fixing typo where '.' was missing
    from gz extension (#2750)
    
    (cherry picked from commit cff5a735d849d3f05f8a411f9502b36b372361f7)
---
 .../apache/tika/parser/pkg/CompressorParser.java   |   7 ++++-
 .../test-documents/test-nested-tarball.tar         | Bin 0 -> 3072 bytes
 .../tika/parser/RecursiveParserWrapperTest.java    |  35 +++++++++++++++++++++
 3 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index 955880d8a6..c19de9722a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -66,6 +66,8 @@ import org.apache.tika.exception.TikaException;
 import org.apache.tika.exception.TikaMemoryLimitException;
 import org.apache.tika.extractor.EmbeddedDocumentExtractor;
 import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.FilenameUtils;
+import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
@@ -254,9 +256,12 @@ public class CompressorParser implements Parser {
         if (StringUtils.isBlank(name)) {
             return;
         }
+
+        name = FilenameUtils.getName(name);
+
         if (name.endsWith(".tgz") || name.endsWith(".tbz") || 
name.endsWith(".tbz2")) {
             name = name.substring(0, name.lastIndexOf(".")) + ".tar";
-        } else if (name.endsWith(".bz") || name.endsWith("gz") || 
name.endsWith(".bz2") || name.endsWith(".xz") || name.endsWith(".zlib") || 
name.endsWith(".pack") ||
+        } else if (name.endsWith(".bz") || name.endsWith(".gz") || 
name.endsWith(".bz2") || name.endsWith(".xz") || name.endsWith(".zlib") || 
name.endsWith(".pack") ||
                 name.endsWith(".br")) {
             name = name.substring(0, name.lastIndexOf("."));
         } else if (!name.isEmpty()) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/test-nested-tarball.tar b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/test-nested-tarball.tar
new file mode 100644
index 0000000000..d5c6a88746
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/test-nested-tarball.tar differ
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 69a0eacccb..4261db2990 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -174,6 +174,41 @@ public class RecursiveParserWrapperTest extends TikaTest {
                 "/test-documents.tar"), actualEmbeddedPaths);
     }
 
+    @Test
+    public void testNestedTarball() throws Exception {
+        List<Metadata> list = getRecursiveMetadata("test-nested-tarball.tar");
+        List<String> actualResourceNames =
+            list.stream()
+                .map(m -> m.get(TikaCoreProperties.RESOURCE_NAME_KEY))
+                .collect(Collectors.toList());
+
+        List<String> expectedResourceNames = 
Arrays.asList("test-nested-tarball.tar",
+            "folderWithinTgz/testTXT.txt",
+            "nested.tar",
+            "folderContainingTgz/inner/nested.tgz");
+        assertEquals(expectedResourceNames, actualResourceNames);
+
+        List<String> actualInternalPaths =
+            list.stream()
+                .map(m -> m.get(TikaCoreProperties.INTERNAL_PATH))
+                .collect(Collectors.toList());
+
+        List<String> expectedInternalPaths = Arrays.asList(null,
+            "folderWithinTgz/testTXT.txt",
+            null, // tar file within a gz doesn't have an internal path
+            "folderContainingTgz/inner/nested.tgz");
+        assertEquals(expectedInternalPaths, actualInternalPaths);
+
+        List<String> actualEmbeddedPaths =
+            list.stream()
+                .map(m -> m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH))
+                .collect(Collectors.toList());
+        assertEquals(Arrays.asList(null,
+            "/nested.tgz/nested.tar/testTXT.txt",
+            "/nested.tgz/nested.tar",
+            "/nested.tgz"), actualEmbeddedPaths);
+    }
+
     @Test
     public void testCharLimitNoThrowOnWriteLimit() throws Exception {
         ParseContext context = new ParseContext();

Reply via email to