This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4639
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 6aed8956a44b8ebcdc05cce5a0f28e52ab7d095c
Author: tallison <[email protected]>
AuthorDate: Thu Jan 29 08:04:46 2026 -0500

    TIKA-4639
---
 .../src/main/java/org/apache/tika/parser/AutoDetectParser.java    | 8 +-------
 .../src/main/java/org/apache/tika/parser/html/HtmlHandler.java    | 1 +
 .../org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java    | 1 +
 .../org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java   | 1 +
 .../apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java    | 3 +++
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index 64067cbad4..70fe1ad08f 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -31,7 +31,6 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
 import org.apache.tika.extractor.ParsingEmbeddedDocumentExtractorFactory;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.mime.MediaTypeRegistry;
 import org.apache.tika.sax.SecureContentHandler;
@@ -152,12 +151,7 @@ public class AutoDetectParser extends CompositeParser {
 
         // Automatically detect the MIME type of the document
         MediaType type = detector.detect(tis, metadata, context);
-        //update CONTENT_TYPE as long as it wasn't set by parser override
-        if (metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE) == null ||
-                !metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE)
-                        .equals(type.toString())) {
-            metadata.set(Metadata.CONTENT_TYPE, type.toString());
-        }
+        metadata.set(Metadata.CONTENT_TYPE, type.toString());
         //check for zero-byte inputstream
         if (tis.getOpenContainer() == null) {
             if (autoDetectParserConfig.getThrowOnZeroBytes()) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
index 4a3c3a31e0..1e11efdb95 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
@@ -335,6 +335,7 @@ class HtmlHandler extends TextContentHandler {
         Metadata m = Metadata.newInstance(context);
         m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
                 TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+        m.set(Metadata.CONTENT_TYPE, "text/html");
         m.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, "text/html");
         //TODO add metadata about iframe content?
         EmbeddedDocumentExtractor embeddedDocumentExtractor =
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
index 15a271552b..e3d2669a0a 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
@@ -115,6 +115,7 @@ public class OutlookPSTParser implements Parser {
             PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild();
             while (pstMail != null) {
                 Metadata metadata = Metadata.newInstance(context);
+                metadata.set(Metadata.CONTENT_TYPE, PSTMailItemParser.PST_MAIL_ITEM_STRING);
                 metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, PSTMailItemParser.PST_MAIL_ITEM_STRING);
                 String resourceName = pstMail.getSubject() + ".msg";
                 String internalPath = folderPath.endsWith("/") ?
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
index d216f232c0..0d91251cb4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
@@ -227,6 +227,7 @@ public class PSTMailItemParser implements Parser {
             long sz = OutlookPSTParser.estimateSize(attachedEmail);
             try (TikaInputStream tis = TikaInputStream.getFromContainer(attachedEmail, sz, metadata)) {
                 Metadata attachMetadata = Metadata.newInstance(context);
+                attachMetadata.set(Metadata.CONTENT_TYPE, PSTMailItemParser.PST_MAIL_ITEM_STRING);
                 attachMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, PSTMailItemParser.PST_MAIL_ITEM_STRING);
                 attachMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, attachedEmail.getSubject() + ".msg");
                 attachMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE, TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name());
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
index 3adb47ac82..4d12e31688 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
@@ -66,6 +66,9 @@ public class OutlookPSTParserTest extends TikaTest {
         assertEquals(10, metadataList.size());
 
         Metadata m1 = metadataList.get(1);
+        assertEquals("application/x-tika-pst-mail-item", m1.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE));
+        assertEquals("application/vnd.ms-outlook", m1.get(TikaCoreProperties.CONTENT_TYPE_MAGIC_DETECTED));
+        assertEquals("application/x-tika-pst-mail-item", m1.get(Metadata.CONTENT_TYPE));
         assertEquals("Jörn Kottmann", m1.get(Message.MESSAGE_FROM_NAME));
         assertEquals("Jörn Kottmann", m1.get(TikaCoreProperties.CREATOR));
         assertEquals("Re: Feature Generators", m1.get(TikaCoreProperties.TITLE));

Reply via email to