This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4630-on-main in repository https://gitbox.apache.org/repos/asf/tika.git
commit 6709c4a660284f18ec46c5bafe12f1ad84bee808 Author: tallison <[email protected]> AuthorDate: Fri Jan 23 15:27:27 2026 -0500 TIKA-4630 checkpoint - add INTERNAL_PATH across parsers and documentation - Add INTERNAL_PATH as fallback in RecursiveParserWrapper and FilenameUtils - Update AbstractOOXMLExtractor, OutlookPSTParser, EmailVisitor, WACZParser - Remove PST_FOLDER_PATH (migrated to INTERNAL_PATH) - Add embedded document metadata documentation - Update tests for new INTERNAL_PATH assertions Co-Authored-By: Claude Opus 4.5 <[email protected]> --- docs/src/main/asciidoc/advanced/index.adoc | 7 +- .../advanced/metadata/embedded-documents.adoc | 215 +++++++++++++++++++++ .../asciidoc/advanced/{ => metadata}/index.adoc | 14 +- docs/src/main/asciidoc/using-tika/index.adoc | 6 + .../main/asciidoc/using-tika/java-api/index.adoc | 7 +- .../java/org/apache/tika/io/FilenameUtils.java | 14 +- .../main/java/org/apache/tika/metadata/PST.java | 1 - .../apache/tika/parser/RecursiveParserWrapper.java | 2 + .../tika/parser/microsoft/libpst/EmailVisitor.java | 9 +- .../microsoft/ooxml/AbstractOOXMLExtractor.java | 7 +- .../parser/microsoft/pst/OutlookPSTParser.java | 7 +- .../parser/microsoft/libpst/TestLibPstParser.java | 11 +- .../parser/microsoft/ooxml/OOXMLParserTest.java | 4 + .../parser/microsoft/pst/OutlookPSTParserTest.java | 4 +- .../org/apache/tika/parser/wacz/WACZParser.java | 1 + 15 files changed, 265 insertions(+), 44 deletions(-) diff --git a/docs/src/main/asciidoc/advanced/index.adoc b/docs/src/main/asciidoc/advanced/index.adoc index f8350c86b8..6fd0125c1a 100644 --- a/docs/src/main/asciidoc/advanced/index.adoc +++ b/docs/src/main/asciidoc/advanced/index.adoc @@ -23,9 +23,4 @@ This section covers advanced usage and internals of Apache Tika. 
* xref:robustness.adoc[Robustness] - Process isolation and fault tolerance when parsing untrusted content * xref:spooling.adoc[TikaInputStream and Spooling] - Understanding how TikaInputStream handles buffering, caching, and spooling to disk - -// Add links to specific topics as they are created -// * link:custom-parsers.html[Writing Custom Parsers] -// * link:custom-detectors.html[Writing Custom Detectors] -// * link:configuration.html[Advanced Configuration] -// * link:performance.html[Performance Tuning] +* xref:metadata/index.adoc[Metadata Reference] - Documentation for Tika's metadata fields diff --git a/docs/src/main/asciidoc/advanced/metadata/embedded-documents.adoc b/docs/src/main/asciidoc/advanced/metadata/embedded-documents.adoc new file mode 100644 index 0000000000..ce57a47de1 --- /dev/null +++ b/docs/src/main/asciidoc/advanced/metadata/embedded-documents.adoc @@ -0,0 +1,215 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += Embedded Document Metadata + +When Tika parses container files (such as ZIP archives, emails, PDFs with attachments, or +Microsoft Office documents), it extracts embedded documents recursively. 
Tika provides +several metadata fields to help you understand and track the structure of these embedded +resources. + +== Overview + +Embedded document metadata falls into two categories: + +* *Tika-Generated Metadata* - Fields that Tika calculates during parsing to help you + understand the document structure +* *Internal File Metadata* - Fields that come directly from the container file's own + metadata storage + +== Tika-Generated Metadata + +These fields are generated by Tika during parsing and reflect the structure of embedded +resources as Tika encounters them. All fields below are defined in `TikaCoreProperties`. + +=== Structure Tracking + +`TikaCoreProperties.EMBEDDED_ID` (`X-TIKA:embedded_id`):: +A 1-indexed integer assigned by Tika to each embedded document during parsing. These IDs +are assigned in the order documents are encountered by the `RecursiveParserWrapper`. + +`TikaCoreProperties.EMBEDDED_ID_PATH` (`X-TIKA:embedded_id_path`):: +A path-like representation of the embedded document's position in the hierarchy, built +from `EMBEDDED_ID` values. For example, `/1/3` indicates that the file with `EMBEDDED_ID=3` +was an attachment within the file with `EMBEDDED_ID=1`. This is the most robust path for +tracking document structure. + +=== Path Synthesis + +`TikaCoreProperties.EMBEDDED_RESOURCE_PATH` (`X-TIKA:embedded_resource_path`):: +A synthetic path generated by concatenating file names (from `RESOURCE_NAME_KEY`) for each +level of embedding. This provides a human-readable path through the document hierarchy. ++ +WARNING: Do not use this field for creating directory structures to write out attachments. +There may be path collisions, illegal characters, or zip slip vulnerabilities. Use `EMBEDDED_ID_PATH` +for reliable path tracking. + +`TikaCoreProperties.FINAL_EMBEDDED_RESOURCE_PATH` (`X-TIKA:final_embedded_resource_path`):: +Similar to `EMBEDDED_RESOURCE_PATH`, but calculated at the end of the full parse rather +than during parsing. 
For some parsers, an embedded file's name isn't known until after its +child files have been parsed. This field may have fewer "unknown" file names than +`EMBEDDED_RESOURCE_PATH`, but the synthetic names (e.g., `embedded-1`) are not correlated +between the two fields. + +=== Resource Naming + +`TikaCoreProperties.RESOURCE_NAME_KEY` (`X-TIKA:resourceName`):: +The "name" of the resource. Tika makes a best effort to determine a meaningful name for +each embedded resource. When a name cannot be determined from the container file's +metadata, Tika falls back to synthetic names such as `embedded-1.jpeg`. ++ +NOTE: In Tika 3.x, this field may or may not include path information depending on the +parser. In 4.x, use `INTERNAL_PATH` for the full path as stored in the container. + +== Internal File Metadata + +These fields contain metadata that is stored within the container file itself, not +generated by Tika. All fields below are defined in `TikaCoreProperties`. + +=== Path and Location + +`TikaCoreProperties.INTERNAL_PATH` (`X-TIKA:internalPath`):: +The path including file name as literally stored within the container file (e.g., in a +TAR, ZIP, or PST file). This is distinct from `EMBEDDED_RESOURCE_PATH` in two ways: ++ +1. It is the actual metadata from the container, not synthetically generated +2. It may include folder/directory information that the container preserves + +`TikaCoreProperties.ORIGINAL_RESOURCE_NAME` (`X-TIKA:origResourceName`):: +For some file formats, this contains the original path where the file was stored on the +creator's system before being embedded. For example, older `.doc` files and `.xlsx` files +may store the original file system path from the author's computer. + +== Microsoft-Specific Metadata + +Microsoft Office formats use additional identifiers for embedded objects. 
+ +`TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID` (`X-TIKA:embeddedRelationshipId`):: +A Microsoft-specific identifier used internally to reference embedded objects within +Office documents. This is the relationship ID from the Office Open XML or OLE structure. + +`Office.EMBEDDED_STORAGE_CLASS_ID` (`msoffice:embeddedStorageClassId`):: +A UUID that identifies the class of embedded object in Microsoft formats. While not +exactly a MIME type, it provides similar information about what type of object is +embedded. Defined in the `Office` metadata class. + +== Quick Reference + +[cols="2,2,1"] +|=== +|Property |Metadata Key |Source + +|`EMBEDDED_ID` +|`X-TIKA:embedded_id` +|Tika-generated + +|`EMBEDDED_ID_PATH` +|`X-TIKA:embedded_id_path` +|Tika-generated + +|`EMBEDDED_RESOURCE_PATH` +|`X-TIKA:embedded_resource_path` +|Tika-generated + +|`FINAL_EMBEDDED_RESOURCE_PATH` +|`X-TIKA:final_embedded_resource_path` +|Tika-generated + +|`RESOURCE_NAME_KEY` +|`X-TIKA:resourceName` +|Tika-generated + +|`INTERNAL_PATH` +|`X-TIKA:internalPath` +|From container + +|`ORIGINAL_RESOURCE_NAME` +|`X-TIKA:origResourceName` +|From container + +|`EMBEDDED_RELATIONSHIP_ID` +|`X-TIKA:embeddedRelationshipId` +|From container (MS) + +|`Office.EMBEDDED_STORAGE_CLASS_ID` +|`msoffice:embeddedStorageClassId` +|From container (MS) +|=== + +== Example: Understanding the Difference + +Consider a ZIP file `archive.zip` containing `reports/Q1/sales.xlsx`, where the spreadsheet +itself contains an embedded image: + +[cols="1,2,2"] +|=== +|Document |Field |Value + +.5+|Container (`archive.zip`) +|`EMBEDDED_ID` +|_(not set - this is the root document)_ + +|`EMBEDDED_ID_PATH` +|_(not set)_ + +|`INTERNAL_PATH` +|_(not set)_ + +|`RESOURCE_NAME_KEY` +|`archive.zip` + +|`EMBEDDED_RESOURCE_PATH` +|_(not set)_ + +.5+|Spreadsheet (`sales.xlsx`) +|`EMBEDDED_ID` +|`1` + +|`EMBEDDED_ID_PATH` +|`/1` + +|`INTERNAL_PATH` +|`reports/Q1/sales.xlsx` (from ZIP entry) + +|`RESOURCE_NAME_KEY` +|`sales.xlsx` + 
+|`EMBEDDED_RESOURCE_PATH` +|`/sales.xlsx` + +.5+|Embedded image in spreadsheet +|`EMBEDDED_ID` +|`2` + +|`EMBEDDED_ID_PATH` +|`/1/2` (embedded in file with ID=1) + +|`INTERNAL_PATH` +|`/xl/media/image1.png` (from XLSX structure) + +|`RESOURCE_NAME_KEY` +|`image1.png` + +|`EMBEDDED_RESOURCE_PATH` +|`/sales.xlsx/image1.png` +|=== + +Key observations: + +* `INTERNAL_PATH` preserves the full directory structure as it is stored in each container +* `EMBEDDED_RESOURCE_PATH` is built only from file names at each level +* `EMBEDDED_ID_PATH` `/1/2` shows that the image (ID=2) was found inside the spreadsheet (ID=1) diff --git a/docs/src/main/asciidoc/advanced/index.adoc b/docs/src/main/asciidoc/advanced/metadata/index.adoc similarity index 58% copy from docs/src/main/asciidoc/advanced/index.adoc copy to docs/src/main/asciidoc/advanced/metadata/index.adoc index f8350c86b8..f99bb69a63 100644 --- a/docs/src/main/asciidoc/advanced/index.adoc +++ b/docs/src/main/asciidoc/advanced/metadata/index.adoc @@ -15,17 +15,11 @@ // limitations under the License. // -= Advanced Topics += Metadata Reference -This section covers advanced usage and internals of Apache Tika. +This section documents the metadata fields that Tika produces during parsing. 
== Topics -* xref:robustness.adoc[Robustness] - Process isolation and fault tolerance when parsing untrusted content -* xref:spooling.adoc[TikaInputStream and Spooling] - Understanding how TikaInputStream handles buffering, caching, and spooling to disk - -// Add links to specific topics as they are created -// * link:custom-parsers.html[Writing Custom Parsers] -// * link:custom-detectors.html[Writing Custom Detectors] -// * link:configuration.html[Advanced Configuration] -// * link:performance.html[Performance Tuning] +* xref:embedded-documents.adoc[Embedded Document Metadata] - Understanding how Tika tracks + attachments and embedded resources within container files diff --git a/docs/src/main/asciidoc/using-tika/index.adoc b/docs/src/main/asciidoc/using-tika/index.adoc index ada34abc4c..2f13102e82 100644 --- a/docs/src/main/asciidoc/using-tika/index.adoc +++ b/docs/src/main/asciidoc/using-tika/index.adoc @@ -63,3 +63,9 @@ Use Tika via gRPC protocol. Best for high-performance, cross-language communicat For processing large volumes of documents, see xref:../pipes/index.adoc[Tika Pipes], which provides fault-tolerant, scalable document processing and works with all of the above integration methods. + +== Understanding the Output + +xref:../advanced/metadata/embedded-documents.adoc[Embedded Document Metadata]:: +Learn how Tika tracks and reports metadata for embedded documents (attachments, images, +and other resources contained within files). 
diff --git a/docs/src/main/asciidoc/using-tika/java-api/index.adoc b/docs/src/main/asciidoc/using-tika/java-api/index.adoc index 703a2cf2c2..8ab2b22291 100644 --- a/docs/src/main/asciidoc/using-tika/java-api/index.adoc +++ b/docs/src/main/asciidoc/using-tika/java-api/index.adoc @@ -31,8 +31,5 @@ xref:getting-started.adoc[Getting Started] for guidance on choosing the right ap == Topics * xref:getting-started.adoc[Getting Started] - Recommendations and PipesForkParser usage - -// Add links to specific topics as they are created -// * link:parsing.html[Parsing Documents] -// * link:detection.html[Content Detection] -// * link:configuration.html[Configuration] +* xref:../../advanced/metadata/embedded-documents.adoc[Embedded Document Metadata] - + Understanding attachment and embedded resource tracking diff --git a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java index b06c843c48..b2e2f5d878 100644 --- a/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java +++ b/tika-core/src/main/java/org/apache/tika/io/FilenameUtils.java @@ -292,12 +292,14 @@ public class FilenameUtils { //may return null private static String getEmbeddedPath(Metadata metadata) { - //potentially look for other values in embedded path or original file name, etc... - //maybe different fallback order? String path = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH); if (! StringUtils.isBlank(path)) { return path; } + path = metadata.get(TikaCoreProperties.INTERNAL_PATH); + if (! StringUtils.isBlank(path)) { + return path; + } path = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); if (! StringUtils.isBlank(path)) { return path; @@ -311,22 +313,22 @@ public class FilenameUtils { //this tries for resource name first, and then backs off to path private static String getEmbeddedName(Metadata metadata) { - //potentially look for other values in embedded path or original file name, etc... 
- //maybe different fallback order? String path = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); if (! StringUtils.isBlank(path)) { return path; } + path = metadata.get(TikaCoreProperties.INTERNAL_PATH); + if (! StringUtils.isBlank(path)) { + return path; + } path = metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID); if (! StringUtils.isBlank(path)) { return path; } - path = metadata.get(TikaCoreProperties.EMBEDDED_RESOURCE_PATH); if (! StringUtils.isBlank(path)) { return path; } - return metadata.get(TikaCoreProperties.ORIGINAL_RESOURCE_NAME); } diff --git a/tika-core/src/main/java/org/apache/tika/metadata/PST.java b/tika-core/src/main/java/org/apache/tika/metadata/PST.java index 7847e7f5c2..860e3fea77 100644 --- a/tika-core/src/main/java/org/apache/tika/metadata/PST.java +++ b/tika-core/src/main/java/org/apache/tika/metadata/PST.java @@ -19,7 +19,6 @@ package org.apache.tika.metadata; public interface PST { String PST_PREFIX = "pst:"; - Property PST_FOLDER_PATH = Property.internalText(PST_PREFIX + "folderPath"); Property DESCRIPTOR_NODE_ID = Property.internalText(PST_PREFIX + "discriptorNodeId"); Property IS_VALID = Property.internalBoolean(PST_PREFIX + "isValid"); } diff --git a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java index 07159dba01..3fcab929ec 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java +++ b/tika-core/src/main/java/org/apache/tika/parser/RecursiveParserWrapper.java @@ -185,6 +185,8 @@ public class RecursiveParserWrapper extends ParserDecorator { String objectName = ""; if (metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY) != null) { objectName = metadata.get(TikaCoreProperties.RESOURCE_NAME_KEY); + } else if (metadata.get(TikaCoreProperties.INTERNAL_PATH) != null) { + objectName = FilenameUtils.getName(metadata.get(TikaCoreProperties.INTERNAL_PATH)); } else if 
(metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID) != null) { objectName = metadata.get(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID); } else if (metadata.get(TikaCoreProperties.VERSION_NUMBER) != null) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java index 30070bc23e..6d5f8c06d2 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/libpst/EmailVisitor.java @@ -28,7 +28,7 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.PST; +import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.ParseContext; import org.apache.tika.sax.XHTMLContentHandler; @@ -73,10 +73,11 @@ public class EmailVisitor implements FileVisitor<Path> { private void process(Path file) throws IOException { Metadata emailMetadata = new Metadata(); - String pstPath = root - .relativize(file.getParent()) + String internalPath = root + .relativize(file) .toString(); - emailMetadata.set(PST.PST_FOLDER_PATH, pstPath); + emailMetadata.set(TikaCoreProperties.INTERNAL_PATH, internalPath); + emailMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, file.getFileName().toString()); try (TikaInputStream tis = TikaInputStream.get(file)) { try { embeddedDocumentExtractor.parseEmbedded(tis, xhtml, emailMetadata, new ParseContext(), true); diff 
--git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java index e357388f17..e1e8a24503 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/ooxml/AbstractOOXMLExtractor.java @@ -55,6 +55,7 @@ import org.apache.tika.exception.TikaException; import org.apache.tika.exception.WriteLimitReachedException; import org.apache.tika.extractor.EmbeddedDocumentExtractor; import org.apache.tika.extractor.EmbeddedDocumentUtil; +import org.apache.tika.io.FilenameUtils; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Office; @@ -183,7 +184,9 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { try (InputStream tStream = tPart.getInputStream()) { Metadata thumbnailMetadata = new Metadata(); String thumbName = tPart.getPartName().getName(); - thumbnailMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, thumbName); + thumbnailMetadata.set(TikaCoreProperties.INTERNAL_PATH, thumbName); + thumbnailMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, + FilenameUtils.getName(thumbName)); AttributesImpl attributes = new AttributesImpl(); attributes.addAttribute(XHTML, "class", "class", "CDATA", "embedded"); @@ -348,6 +351,7 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name()); 
metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, rel); + metadata.set(TikaCoreProperties.INTERNAL_PATH, part.getPartName().getName()); DirectoryNode root = fs.getRoot(); POIFSDocumentType type = POIFSDocumentType.detectType(root); @@ -454,6 +458,7 @@ public abstract class AbstractOOXMLExtractor implements OOXMLExtractor { metadata.set(TikaCoreProperties.EMBEDDED_RELATIONSHIP_ID, rel); metadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, embeddedResourceType.name()); + metadata.set(TikaCoreProperties.INTERNAL_PATH, part.getPartName().getName()); // Get the name updateResourceName(part, embeddedPartMetadata, metadata); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java index 44d470cd32..485c96eeef 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java @@ -116,8 +116,11 @@ public class OutlookPSTParser implements Parser { while (pstMail != null) { Metadata metadata = new Metadata(); metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, PSTMailItemParser.PST_MAIL_ITEM_STRING); - metadata.set(PST.PST_FOLDER_PATH, folderPath); - metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, pstMail.getSubject() + ".msg"); + String resourceName = pstMail.getSubject() + ".msg"; + String internalPath = folderPath.endsWith("/") ? 
+ folderPath + resourceName : folderPath + "/" + resourceName; + metadata.set(TikaCoreProperties.INTERNAL_PATH, internalPath); + metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, resourceName); long length = estimateSize(pstMail); try (TikaInputStream tis = TikaInputStream.getFromContainer(pstMail, length, metadata)) { embeddedExtractor.parseEmbedded(tis, handler, metadata, new ParseContext(), true); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java index cc75a774d4..8bb674786f 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/libpst/TestLibPstParser.java @@ -29,7 +29,6 @@ import org.apache.tika.config.loader.TikaLoader; import org.apache.tika.metadata.MAPI; import org.apache.tika.metadata.Message; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.PST; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.Parser; @@ -61,9 +60,8 @@ public class TestLibPstParser extends TikaTest { for (int i = 1; i < metadataList.size(); i++) { String path = metadataList .get(i) - .get(PST.PST_FOLDER_PATH); - if (path != null) { - assertEquals("hong-thai.nguyen", path); + .get(TikaCoreProperties.INTERNAL_PATH); + if (path != null && path.startsWith("hong-thai.nguyen/")) { validPaths++; } } @@ -100,9 +98,8 @@ public class TestLibPstParser extends TikaTest { for (int i = 1; i < metadataList.size(); i++) { String path = metadataList .get(i) - 
.get(PST.PST_FOLDER_PATH); - if (path != null) { - assertEquals("hong-thai.nguyen", path); + .get(TikaCoreProperties.INTERNAL_PATH); + if (path != null && path.startsWith("hong-thai.nguyen/")) { validPaths++; } } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java index 1b7b08db50..2538f3b7b2 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/ooxml/OOXMLParserTest.java @@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft.ooxml; import static java.nio.charset.StandardCharsets.UTF_8; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertFalse; +import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertThrows; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assertions.fail; @@ -1704,6 +1705,9 @@ public class OOXMLParserTest extends MultiThreadedTikaTest { assertEquals("audio/mpeg", metadataList.get(1).get(Metadata.CONTENT_TYPE)); assertEquals("image/png", metadataList.get(2).get(Metadata.CONTENT_TYPE)); assertEquals("image/jpeg", metadataList.get(3).get(Metadata.CONTENT_TYPE)); + // Verify INTERNAL_PATH is set for embedded media + assertNotNull(metadataList.get(1).get(TikaCoreProperties.INTERNAL_PATH)); + assertTrue(metadataList.get(1).get(TikaCoreProperties.INTERNAL_PATH).contains("/ppt/media/")); } @Test diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java index e73c6c9fef..3adb47ac82 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java @@ -29,7 +29,6 @@ import org.apache.tika.TikaTest; import org.apache.tika.metadata.MAPI; import org.apache.tika.metadata.Message; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.PST; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.Parser; @@ -77,7 +76,8 @@ public class OutlookPSTParserTest extends TikaTest { assertEquals("Jörn Kottmann", m1.get(MAPI.FROM_REPRESENTING_NAME)); assertEquals("[email protected]", m1.get(MAPI.FROM_REPRESENTING_EMAIL)); assertEquals("NOTE", m1.get(MAPI.MESSAGE_CLASS)); - assertEquals("/Début du fichier de données Outlook", m1.get(PST.PST_FOLDER_PATH)); + assertEquals("/Début du fichier de données Outlook/Re: Feature Generators.msg", + m1.get(TikaCoreProperties.INTERNAL_PATH)); //test that subject is making it into the xhtml assertContains("<meta name=\"dc:subject\" content=\"Re: Feature Generators\"", m1.get(TikaCoreProperties.TIKA_CONTENT)); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java 
b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java index 838ce3f5bf..57306fa5a6 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-webarchive-module/src/main/java/org/apache/tika/parser/wacz/WACZParser.java @@ -113,6 +113,7 @@ public class WACZParser implements Parser { String name, XHTMLContentHandler xhtml, Metadata parentMetadata, EmbeddedDocumentExtractor ex) throws IOException, SAXException { Metadata metadata = new Metadata(); + metadata.set(TikaCoreProperties.INTERNAL_PATH, zae.getName()); metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, name); metadata.set(Metadata.CONTENT_LENGTH, Long.toString(zae.getSize())); try (TikaInputStream tis = TikaInputStream.get(getMaybeGzipInputStream(TikaInputStream.get(zais)))) {
