This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4630-on-main in repository https://gitbox.apache.org/repos/asf/tika.git
commit d7781ecf333f6ed9eba3327a2a60275bf28d3de9 Author: Tim Allison <[email protected]> AuthorDate: Fri Jan 23 14:55:21 2026 -0500 TIKA-4630 -- improve tracking of internal paths (#2548) --- .../apache/tika/metadata/TikaCoreProperties.java | 9 ++++- .../src/test/java/org/apache/tika/TikaTest.java | 4 +- .../apache/tika/parser/odf/OpenDocumentParser.java | 2 + .../apache/tika/parser/pkg/CompressorParser.java | 46 ++++++++++++++++------ .../org/apache/tika/parser/pkg/PackageParser.java | 3 +- .../tika/parser/RecursiveParserWrapperTest.java | 45 +++++++++++++++++++++ .../org/apache/tika/parser/pdf/PDFParserTest.java | 2 +- 7 files changed, 94 insertions(+), 17 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java index b89323fc11..cc712543b5 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java @@ -204,12 +204,19 @@ public interface TikaCoreProperties { Property.internalTextBag(TIKA_META_PREFIX + "origResourceName"); /** * This should be used to store the path (relative or full) - * of the source file, including the file name, + * of the source/container file, including the file name, * e.g. doc/path/to/my_pdf.pdf * <p> * This can also be used for a primary key within a database. */ Property SOURCE_PATH = Property.internalText(TIKA_META_PREFIX + "sourcePath"); + + /** + * This records the metadata as stored within a file for an embedded file's path + * including the file name. For example a zip file may include an msg with this path: /my-emails/important/this.msg + */ + Property INTERNAL_PATH = Property.internalText(TIKA_META_PREFIX + "internalPath"); + /** * This is currently used to identify Content-Type that may be * included within a document, such as in html documents diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java index 592447e2e0..a8e37a85b2 100644 --- a/tika-core/src/test/java/org/apache/tika/TikaTest.java +++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java @@ -496,8 +496,10 @@ public abstract class TikaTest { RecursiveParserWrapperHandler handler = new RecursiveParserWrapperHandler( new BasicContentHandlerFactory(BasicContentHandlerFactory.HANDLER_TYPE.XML, -1)); + Metadata metadata = new Metadata(); + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, FilenameUtils.getName(filePath)); try (TikaInputStream tis = getResourceAsStream("/test-documents/" + filePath)) { - wrapper.parse(tis, handler, new Metadata(), context); + wrapper.parse(tis, handler, metadata, context); } return handler.getMetadataList(); } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java index e6f192ee0f..ed1ca792c8 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/odf/OpenDocumentParser.java @@ -266,6 +266,7 @@ public class OpenDocumentParser implements Parser { if (embeddedName.contains("Thumbnails/") || embeddedName.contains("Pictures/")) { Metadata embeddedMetadata = new Metadata(); + embeddedMetadata.set(TikaCoreProperties.INTERNAL_PATH, embeddedName); embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, entry.getName()); if (embeddedName.startsWith("Thumbnails/")) { @@ -310,6 +311,7 @@ public class OpenDocumentParser implements Parser { Metadata embeddedMetadata = new Metadata(); embeddedMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.MACRO.toString()); + embeddedMetadata.set(TikaCoreProperties.INTERNAL_PATH, embeddedName); handler = new OpenDocumentMacroHandler(handler, context); try { tisZip.setCloseShield(); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java index 03be853bb5..e36be1c397 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/CompressorParser.java @@ -47,6 +47,7 @@ import org.apache.commons.compress.compressors.CompressorStreamFactory; import org.apache.commons.compress.compressors.bzip2.BZip2CompressorInputStream; import org.apache.commons.compress.compressors.deflate.DeflateCompressorInputStream; import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; +import org.apache.commons.compress.compressors.gzip.GzipParameters; import org.apache.commons.compress.compressors.gzip.GzipUtils; import org.apache.commons.compress.compressors.lzma.LZMACompressorInputStream; import org.apache.commons.compress.compressors.pack200.Pack200CompressorInputStream; @@ -71,6 +72,7 @@ import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.StringUtils; /** * Parser for various compression formats. @@ -236,21 +238,12 @@ public class CompressorParser implements Parser { XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata); xhtml.startDocument(); - try { Metadata entrydata = new Metadata(); - String name = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); - if (name != null) { - if (name.endsWith(".tbz") || name.endsWith(".tbz2")) { - name = name.substring(0, name.lastIndexOf(".")) + ".tar"; - } else if (name.endsWith(".bz") || name.endsWith(".bz2") || name.endsWith(".xz") || - name.endsWith(".zlib") || name.endsWith(".pack") || name.endsWith(".br")) { - name = name.substring(0, name.lastIndexOf(".")); - } else if (name.length() > 0) { - name = GzipUtils.getUncompressedFileName(name); - } - entrydata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name); + if (cis instanceof GzipCompressorInputStream) { + extractGzipMetadata((GzipCompressorInputStream) cis, entrydata); } + setName(metadata, entrydata); // Use the delegate parser to parse the compressed document EmbeddedDocumentExtractor extractor = @@ -268,6 +261,35 @@ public class CompressorParser implements Parser { xhtml.endDocument(); } + private void extractGzipMetadata(GzipCompressorInputStream gzcis, Metadata metadata) { + GzipParameters gzipParameters = gzcis.getMetaData(); + if (gzipParameters == null) { + return; + } + String name = gzipParameters.getFileName(); + if (!StringUtils.isBlank(name)) { + metadata.set(TikaCoreProperties.INTERNAL_PATH, name); + } + //TODO: modification, OS, comment + } + + private void setName(Metadata parentMetadata, Metadata metadata) { + String name = parentMetadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); + //if parent's name is blank stop now + if (StringUtils.isBlank(name)) { + return; + } + if (name.endsWith(".tgz") || name.endsWith(".tbz") || name.endsWith(".tbz2")) { + name = name.substring(0, name.lastIndexOf(".")) + ".tar"; + } else if (name.endsWith(".bz") || name.endsWith("gz") || name.endsWith(".bz2") || name.endsWith(".xz") || name.endsWith(".zlib") || name.endsWith(".pack") || + name.endsWith(".br")) { + name = name.substring(0, name.lastIndexOf(".")); + } else if (!name.isEmpty()) { + name = GzipUtils.getUncompressedFileName(name); + } + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name); + } + /** * @param metadata * @return CompressorStream name based on the content-type value diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java index 5b8aecbc0f..26970002be 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/main/java/org/apache/tika/parser/pkg/PackageParser.java @@ -217,13 +217,12 @@ public class PackageParser extends AbstractEncodingDetectorParser { if (name != null && name.length() > 0) { name = name.replace("\\", "/"); entrydata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name); + entrydata.set(TikaCoreProperties.INTERNAL_PATH, name); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute("", "class", "class", "CDATA", "embedded"); attributes.addAttribute("", "id", "id", "CDATA", name); xhtml.startElement("div", attributes); xhtml.endElement("div"); - - entrydata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, name); } return entrydata; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java index 5fada5528d..30dab1331e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/RecursiveParserWrapperTest.java @@ -23,9 +23,11 @@ import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.IOException; import java.io.InputStream; +import java.util.Arrays; import java.util.HashSet; import java.util.List; import java.util.Set; +import java.util.stream.Collectors; import org.apache.commons.io.IOUtils; import org.apache.commons.io.input.ClosedInputStream; @@ -127,6 +129,49 @@ public class RecursiveParserWrapperTest extends TikaTest { List<Metadata> list = handler.getMetadataList(); assertEquals(12, list.size()); } + + @Test + public void testTarball() throws Exception { + List<Metadata> list = getRecursiveMetadata("test-documents.tgz"); + List<String> actualInternalPaths = + list.stream() + .map(m -> m.get(TikaCoreProperties.INTERNAL_PATH)) + .collect(Collectors.toList()); + + List<String> expectedInternalPaths = Arrays.asList(null, + "test-documents/testEXCEL.xls", + "test-documents/testHTML.html", + "Thumbnails/thumbnail.png", + "Thumbnails/thumbnail.pdf", + "test-documents/testOpenOffice2.odt", + "test-documents/testPDF.pdf", + "test-documents/testPPT.ppt", + "test-documents/testRTF.rtf", + "test-documents/testTXT.txt", + "test-documents/testWORD.doc", + "test-documents/testXML.xml", + "test-documents.tar"); + assertEquals(expectedInternalPaths, actualInternalPaths); + + List<String> actualEmbeddedPaths = + list.stream() + .map(m -> m.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH)) + .collect(Collectors.toList()); + assertEquals(Arrays.asList(null, + "/test-documents.tar/testEXCEL.xls", + "/test-documents.tar/testHTML.html", + "/test-documents.tar/testOpenOffice2.odt/thumbnail.png", + "/test-documents.tar/testOpenOffice2.odt/thumbnail.pdf", + "/test-documents.tar/testOpenOffice2.odt", + "/test-documents.tar/testPDF.pdf", + "/test-documents.tar/testPPT.ppt", + "/test-documents.tar/testRTF.rtf", + "/test-documents.tar/testTXT.txt", + "/test-documents.tar/testWORD.doc", + "/test-documents.tar/testXML.xml", + "/test-documents.tar"), actualEmbeddedPaths); + } + @Test public void testCharLimitNoThrowOnWriteLimit() throws Exception { ParseContext context = new ParseContext(); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index cda95cc56b..11797a499e 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -368,7 +368,7 @@ public class PDFParserTest extends TikaTest { assertEquals("91", metadatas.get(1).get(ImageMetadataExtractor.UNKNOWN_IMG_NS + "height")); assertEquals("352", metadatas.get(1).get(ImageMetadataExtractor.UNKNOWN_IMG_NS + "width")); - assertNull(metadatas.get(0).get(TikaCoreProperties.RESOURCE_NAME_KEY)); + assertEquals("testPDF_JBIG2.pdf", metadatas.get(0).get(TikaCoreProperties.RESOURCE_NAME_KEY)); assertEquals("image0.jb2", metadatas.get(1).get(TikaCoreProperties.RESOURCE_NAME_KEY)); assertEquals(MediaType.image("x-jbig2").toString(), metadatas.get(1).get(Metadata.CONTENT_TYPE));
