This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch branch_3x in repository https://gitbox.apache.org/repos/asf/tika.git
commit 259043ced9347eff7f19901d70f39e643393eb49 Author: iachimoe <[email protected]> AuthorDate: Wed Apr 8 10:03:03 2026 -0400 TIKA-4705 -- resourceName of nested tarball should not contain the parent directories of its parent gzip file, plus fixing typo where '.' was missing from gz extension (#2750) (cherry picked from commit cff5a735d849d3f05f8a411f9502b36b372361f7) --- .../apache/tika/parser/pkg/CompressorParser.java | 7 ++++- .../test-documents/test-nested-tarball.tar | Bin 0 -> 3072 bytes .../tika/parser/RecursiveParserWrapperTest.java | 35 +++++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java index 955880d8a6..c19de9722a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java @@ -66,6 +66,8 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.exception.TikaMemoryLimitException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.FilenameUtils; +import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; @@ -254,9 +256,12 @@ public class CompressorParser implements Parser { if (StringUtils.isBlank(name)) { return; } + + name = FilenameUtils.getName(name); + if (name.endsWith(".tgz") || name.endsWith(".tbz") || name.endsWith(".tbz2")) { name = name.substring(0, name.lastIndexOf(".")) + ".tar"; - } else if (name.endsWith(".bz") || name.endsWith("gz") || name.endsWith(".bz2") || name.endsWith(".xz") || name.endsWith(".zlib") || name.endsWith(".pack") || + } else if (name.endsWith(".bz") || name.endsWith(".gz") || name.endsWith(".bz2") || name.endsWith(".xz") || name.endsWith(".zlib") || name.endsWith(".pack") || name.endsWith(".br")) { name = name.substring(0, name.lastIndexOf(".")); } else if (!name.isEmpty()) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/test-nested-tarball.tar b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/test-nested-tarball.tar new file mode 100644 index 0000000000..d5c6a88746 Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/test-nested-tarball.tar differ diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java index 69a0eacccb..4261db2990 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java @@ -174,6 +174,41 @@ public class RecursiveParserWrapperTest extends TikaTest { "/test-documents.tar"), actualEmbeddedPaths); } + @Test + public void testNestedTarball() throws Exception { + List<Metadata> list = getRecursiveMetadata("test-nested-tarball.tar"); + List<String> actualResourceNames = + list.stream() + .map(m -> m.get(TikaCoreProperties.RESOURCE_NAME_KEY)) + .collect(Collectors.toList()); + + List<String> expectedResourceNames = Arrays.asList("test-nested-tarball.tar", + "folderWithinTgz/testTXT.txt", + "nested.tar", + "folderContainingTgz/inner/nested.tgz"); + assertEquals(expectedResourceNames, actualResourceNames); + + List<String> actualInternalPaths = + list.stream() + .map(m -> m.get(TikaCoreProperties.INTERNAL_PATH)) + .collect(Collectors.toList()); + + List<String> expectedInternalPaths = Arrays.asList(null, + "folderWithinTgz/testTXT.txt", + null, // tar file within a gz doesn't have an internal path + "folderContainingTgz/inner/nested.tgz"); + assertEquals(expectedInternalPaths, actualInternalPaths); + + List<String> actualEmbeddedPaths = + list.stream() + .map(m -> m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)) + .collect(Collectors.toList()); + assertEquals(Arrays.asList(null, + "/nested.tgz/nested.tar/testTXT.txt", + "/nested.tgz/nested.tar", + "/nested.tgz"), actualEmbeddedPaths); + } + @Test public void testCharLimitNoThrowOnWriteLimit() throws Exception { ParseContext context = new ParseContext();
