This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new f73ea2f16a TIKA-4639 (#2559)
f73ea2f16a is described below
commit f73ea2f16ab52520e92d111b05ac8d3901df81be
Author: Tim Allison <[email protected]>
AuthorDate: Thu Jan 29 12:58:34 2026 -0500
TIKA-4639 (#2559)
---
.../src/main/java/org/apache/tika/parser/AutoDetectParser.java | 8 +-------
.../src/main/java/org/apache/tika/parser/html/HtmlHandler.java | 1 +
.../org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java | 1 +
.../org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java | 1 +
.../apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java | 3 +++
5 files changed, 7 insertions(+), 7 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index ae9a33e170..752c0c2e35 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -31,7 +31,6 @@ import org.apache.tika.extractor.EmbeddedDocumentExtractorFactory;
import org.apache.tika.extractor.StandardExtractorFactory;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
-import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.mime.MediaTypeRegistry;
import org.apache.tika.sax.SecureContentHandler;
@@ -152,12 +151,7 @@ public class AutoDetectParser extends CompositeParser {
// Automatically detect the MIME type of the document
MediaType type = detector.detect(tis, metadata, context);
- //update CONTENT_TYPE as long as it wasn't set by parser override
- if (metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE) == null ||
- !metadata.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE)
- .equals(type.toString())) {
- metadata.set(Metadata.CONTENT_TYPE, type.toString());
- }
+ metadata.set(Metadata.CONTENT_TYPE, type.toString());
//check for zero-byte inputstream
if (tis.getOpenContainer() == null) {
if (autoDetectParserConfig.getThrowOnZeroBytes()) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
index 55a967b5d4..94a4593a95 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-html-module/src/main/java/org/apache/tika/parser/html/HtmlHandler.java
@@ -335,6 +335,7 @@ class HtmlHandler extends TextContentHandler {
Metadata m = Metadata.newInstance(context);
m.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.INLINE.toString());
+ m.set(Metadata.CONTENT_TYPE, "text/html");
m.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE, "text/html");
//TODO add metadata about iframe content?
EmbeddedDocumentExtractor embeddedDocumentExtractor =
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
index 62eb4d89b2..0128ceabc0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParser.java
@@ -115,6 +115,7 @@ public class OutlookPSTParser implements Parser {
PSTMessage pstMail = (PSTMessage) pstFolder.getNextChild();
while (pstMail != null) {
Metadata metadata = Metadata.newInstance(context);
+ metadata.set(Metadata.CONTENT_TYPE, PSTMailItemParser.PST_MAIL_ITEM_STRING);
metadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
PSTMailItemParser.PST_MAIL_ITEM_STRING);
String resourceName = pstMail.getSubject() + ".msg";
String internalPath = folderPath.endsWith("/") ?
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
index ebe1abd3cb..dd1bddb386 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/pst/PSTMailItemParser.java
@@ -227,6 +227,7 @@ public class PSTMailItemParser implements Parser {
long sz = OutlookPSTParser.estimateSize(attachedEmail);
try (TikaInputStream tis =
TikaInputStream.getFromContainer(attachedEmail, sz, metadata)) {
Metadata attachMetadata = Metadata.newInstance(context);
+ attachMetadata.set(Metadata.CONTENT_TYPE, PSTMailItemParser.PST_MAIL_ITEM_STRING);
attachMetadata.set(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE,
PSTMailItemParser.PST_MAIL_ITEM_STRING);
attachMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
attachedEmail.getSubject() + ".msg");
attachMetadata.set(TikaCoreProperties.EMBEDDED_RESOURCE_TYPE,
TikaCoreProperties.EmbeddedResourceType.ATTACHMENT.name());
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
index 3adb47ac82..4d12e31688 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/test/java/org/apache/tika/parser/microsoft/pst/OutlookPSTParserTest.java
@@ -66,6 +66,9 @@ public class OutlookPSTParserTest extends TikaTest {
assertEquals(10, metadataList.size());
Metadata m1 = metadataList.get(1);
+ assertEquals("application/x-tika-pst-mail-item", m1.get(TikaCoreProperties.CONTENT_TYPE_PARSER_OVERRIDE));
+ assertEquals("application/vnd.ms-outlook", m1.get(TikaCoreProperties.CONTENT_TYPE_MAGIC_DETECTED));
+ assertEquals("application/x-tika-pst-mail-item", m1.get(Metadata.CONTENT_TYPE));
assertEquals("Jörn Kottmann", m1.get(Message.MESSAGE_FROM_NAME));
assertEquals("Jörn Kottmann", m1.get(TikaCoreProperties.CREATOR));
assertEquals("Re: Feature Generators",
m1.get(TikaCoreProperties.TITLE));