This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new cff5a735d8 TIKA-4705 -- resourceName of nested tarball should not
contain the parent directories of its parent gzip file, plus fixing typo where
'.' was missing from gz extension (#2750)
cff5a735d8 is described below
commit cff5a735d849d3f05f8a411f9502b36b372361f7
Author: iachimoe <[email protected]>
AuthorDate: Wed Apr 8 15:03:03 2026 +0100
TIKA-4705 -- resourceName of nested tarball should not contain the parent
directories of its parent gzip file, plus fixing typo where '.' was missing
from gz extension (#2750)
---
.../apache/tika/parser/pkg/CompressorParser.java | 6 +++-
.../test-documents/test-nested-tarball.tar | Bin 0 -> 3072 bytes
.../tika/parser/RecursiveParserWrapperTest.java | 37 ++++++++++++++++++++-
3 files changed, 41 insertions(+), 2 deletions(-)
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
index 5b9edc1c70..8535d304b8 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java
@@ -65,6 +65,7 @@ import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.TikaMemoryLimitException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
+import org.apache.tika.io.FilenameUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
@@ -299,9 +300,12 @@ public class CompressorParser implements Parser {
if (StringUtils.isBlank(name)) {
return;
}
+
+ name = FilenameUtils.getName(name);
+
if (name.endsWith(".tgz") || name.endsWith(".tbz") || name.endsWith(".tbz2")) {
name = name.substring(0, name.lastIndexOf(".")) + ".tar";
- } else if (name.endsWith(".bz") || name.endsWith("gz") || name.endsWith(".bz2") || name.endsWith(".xz") || name.endsWith(".zlib") || name.endsWith(".pack") ||
+ } else if (name.endsWith(".bz") || name.endsWith(".gz") || name.endsWith(".bz2") || name.endsWith(".xz") || name.endsWith(".zlib") || name.endsWith(".pack") ||
name.endsWith(".br")) {
name = name.substring(0, name.lastIndexOf("."));
} else if (!name.isEmpty()) {
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/test-nested-tarball.tar
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/test-nested-tarball.tar
new file mode 100644
index 0000000000..d5c6a88746
Binary files /dev/null and
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/resources/test-documents/test-nested-tarball.tar
differ
diff --git
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
index 4d4489f871..62c55d617e 100644
---
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
+++
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java
@@ -174,6 +174,41 @@ public class RecursiveParserWrapperTest extends TikaTest {
"/test-documents.tar"), actualEmbeddedPaths);
}
+ @Test
+ public void testNestedTarball() throws Exception {
+ List<Metadata> list = getRecursiveMetadata("test-nested-tarball.tar");
+ List<String> actualResourceNames =
+ list.stream()
+ .map(m -> m.get(TikaCoreProperties.RESOURCE_NAME_KEY))
+ .collect(Collectors.toList());
+
+ List<String> expectedResourceNames = Arrays.asList("test-nested-tarball.tar",
+ "folderWithinTgz/testTXT.txt",
+ "nested.tar",
+ "folderContainingTgz/inner/nested.tgz");
+ assertEquals(expectedResourceNames, actualResourceNames);
+
+ List<String> actualInternalPaths =
+ list.stream()
+ .map(m -> m.get(TikaCoreProperties.INTERNAL_PATH))
+ .collect(Collectors.toList());
+
+ List<String> expectedInternalPaths = Arrays.asList(null,
+ "folderWithinTgz/testTXT.txt",
+ null, // tar file within a gz doesn't have an internal path
+ "folderContainingTgz/inner/nested.tgz");
+ assertEquals(expectedInternalPaths, actualInternalPaths);
+
+ List<String> actualEmbeddedPaths =
+ list.stream()
+ .map(m -> m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH))
+ .collect(Collectors.toList());
+ assertEquals(Arrays.asList(null,
+ "/nested.tgz/nested.tar/testTXT.txt",
+ "/nested.tgz/nested.tar",
+ "/nested.tgz"), actualEmbeddedPaths);
+ }
+
@Test
public void testCharLimitNoThrowOnWriteLimit() throws Exception {
ParseContext context = new ParseContext();
@@ -445,7 +480,7 @@ public class RecursiveParserWrapperTest extends TikaTest {
}
}
-
+
private List<Metadata> getMetadata(Metadata metadata,
ContentHandlerFactory
contentHandlerFactory,
boolean catchEmbeddedExceptions,