This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4639 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 6aed8956a44b8ebcdc05cce5a0f28e52ab7d095c Author: tallison <[email protected]> AuthorDate: Thu Jan 29 08:04:46 2026 -0500 TIKA-4639 --- .../src/main/java/org/apache/tika/parser/AutoDetectParser.java | 8 +------- .../src/main/java/org/apache/tika/parser/html/HtmlHandler.java | 1 + .../org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java | 1 + .../org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java | 1 + .../apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java | 3 +++ 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java index 64067cbad4..70fe1ad08f 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java @@ -31,7 +31,6 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory; import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; -import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.mime.MediaTypeRegistry; import org.apache.tika.sax.SecureContentHandler; @@ -152,12 +151,7 @@ public class AutoDetectParser extends CompositeParser { // Automatically detect the MIME type of the document MediaType type = detector.detect(tis, metadata, context); - //update CONTENT_TYPE as long as it wasn't set by parser override - if (metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE) == null || - !metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE) - .equals(type.toString())) { - metadata.set(Metadata.CONTENT_TYPE, type.toString()); - } + metadata.set(Metadata.CONTENT_TYPE, type.toString()); //check for zero-byte inputstream if (tis.getOpenContainer() == null) { if (autoDetectParserConfig.getThrowOnZeroBytes()) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java index 4a3c3a31e0..1e11efdb95 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java @@ -335,6 +335,7 @@ class HtmlHandler extends TextContentHandler { Metadata m = Metadata.newInstance(context); m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.INLINE.toString()); + m.set(Metadata.CONTENT_TYPE, "text/html"); m.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, "text/html"); //TODO add metadata about iframe content? EmbeddedDocumentExtractor embeddedDocumentExtractor = diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java index 15a271552b..e3d2669a0a 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java @@ -115,6 +115,7 @@ public class OutlookPSTParser implements Parser { PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild(); while (pstMail != null) { Metadata metadata = Metadata.newInstance(context); + metadata.set(Metadata.CONTENT_TYPE, PSTMailItemParser.PST_MAIL_ITEM_STRING); metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, PSTMailItemParser.PST_MAIL_ITEM_STRING); String resourceName = pstMail.getSubject() + ".msg"; String internalPath = folderPath.endsWith("/") ? diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java index d216f232c0..0d91251cb4 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java @@ -227,6 +227,7 @@ public class PSTMailItemParser implements Parser { long sz = OutlookPSTParser.estimateSize(attachedEmail); try (TikaInputStream tis = TikaInputStream.getFromContainer(attachedEmail, sz, metadata)) { Metadata attachMetadata = Metadata.newInstance(context); + attachMetadata.set(Metadata.CONTENT_TYPE, PSTMailItemParser.PST_MAIL_ITEM_STRING); attachMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, PSTMailItemParser.PST_MAIL_ITEM_STRING); attachMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, attachedEmail.getSubject() + ".msg"); attachMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name()); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java index 3adb47ac82..4d12e31688 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java @@ -66,6 +66,9 @@ public class OutlookPSTParserTest extends TikaTest { assertEquals(10, metadataList.size()); Metadata m1 = metadataList.get(1); + assertEquals("application/x-tika-pst-mail-item", m1.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE)); + assertEquals("application/vnd.ms-outlook", m1.get(TikaCoreProperties.CONTENT_TYPE_MAGIC_DETECTED)); + assertEquals("application/x-tika-pst-mail-item", m1.get(Metadata.CONTENT_TYPE)); assertEquals("Jörn Kottmann", m1.get(Message.MESSAGE_FROM_NAME)); assertEquals("Jörn Kottmann", m1.get(TikaCoreProperties.CREATOR)); assertEquals("Re: Feature Generators", m1.get(TikaCoreProperties.TITLE));
