This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 19eb31632f TIKA-4631 -- add a detect/no-parse option to pipes (#2549)
19eb31632f is described below
commit 19eb31632fd52943f8ce7b72c287c9d535364f3b
Author: Tim Allison <[email protected]>
AuthorDate: Fri Jan 23 10:29:28 2026 -0500
TIKA-4631 -- add a detect/no-parse option to pipes (#2549)
---
.../java/org/apache/tika/pipes/api/ParseMode.java | 13 +++-
.../tika/pipes/core/server/ParseHandler.java | 13 +++-
.../tika/pipes/fork/PipesForkParserTest.java | 76 ++++++++++++++++++++++
3 files changed, 99 insertions(+), 3 deletions(-)
diff --git a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java
index edd82729da..4a6887ca2c 100644
--- a/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java
+++ b/tika-pipes/tika-pipes-api/src/main/java/org/apache/tika/pipes/api/ParseMode.java
@@ -42,7 +42,16 @@ public enum ParseMode {
* in tika-server. The result is a single metadata object with concatenated
* content from all documents.
*/
- CONCATENATE;
+ CONCATENATE,
+
+ /**
+ * Performs digest (if configured) and content type detection only.
+ * <p>
+ * No parsing occurs - embedded documents are not extracted and no content
+ * is returned. Use this mode when you only need file identification
+ * (mime type, hash) without text extraction.
+ */
+ NO_PARSE;
/**
* Parses a string to a ParseMode enum value.
@@ -61,7 +70,7 @@ public enum ParseMode {
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(
"Invalid parse mode: '" + modeString + "'. " +
- "Must be one of: RMETA, CONCATENATE");
+ "Must be one of: RMETA, CONCATENATE, NO_PARSE");
}
}
}
diff --git a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
index a28b2c15dc..4330bedd99 100644
--- a/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
+++ b/tika-pipes/tika-pipes-core/src/main/java/org/apache/tika/pipes/core/server/ParseHandler.java
@@ -87,7 +87,9 @@ class ParseHandler {
//this adds the EmbeddedDocumentByteStore to the parsecontext
ParseMode parseMode = getParseMode(parseContext);
ContentHandlerFactory contentHandlerFactory = getContentHandlerFactory(parseContext);
- if (parseMode == ParseMode.RMETA) {
+ if (parseMode == ParseMode.NO_PARSE) {
+ metadataList = detectOnly(fetchEmitTuple, stream, metadata, parseContext);
+ } else if (parseMode == ParseMode.RMETA) {
metadataList =
parseRecursive(fetchEmitTuple, contentHandlerFactory,
stream, metadata, parseContext);
} else {
@@ -157,6 +159,15 @@ class ParseHandler {
return metadata;
}
+ /**
+ * Performs digest (if configured) and content type detection only, without parsing.
+ */
+ private List<Metadata> detectOnly(FetchEmitTuple fetchEmitTuple, TikaInputStream stream,
+ Metadata metadata, ParseContext parseContext) {
+ _preParse(fetchEmitTuple, stream, metadata, parseContext);
+ return Collections.singletonList(metadata);
+ }
+
public List<Metadata> parseRecursive(FetchEmitTuple fetchEmitTuple,
ContentHandlerFactory contentHandlerFactory, TikaInputStream stream,
Metadata metadata, ParseContext parseContext) throws InterruptedException {
diff --git a/tika-pipes/tika-pipes-fork-parser/src/test/java/org/apache/tika/pipes/fork/PipesForkParserTest.java b/tika-pipes/tika-pipes-fork-parser/src/test/java/org/apache/tika/pipes/fork/PipesForkParserTest.java
index 34e56552b3..c9e49f0a4c 100644
--- a/tika-pipes/tika-pipes-fork-parser/src/test/java/org/apache/tika/pipes/fork/PipesForkParserTest.java
+++ b/tika-pipes/tika-pipes-fork-parser/src/test/java/org/apache/tika/pipes/fork/PipesForkParserTest.java
@@ -195,6 +195,82 @@ public class PipesForkParserTest {
}
}
+ @Test
+ public void testNoParseMode() throws Exception {
+ // Create a simple test file
+ Path testFile = tempDir.resolve("test_no_parse.txt");
+ String content = "This content should NOT be extracted in NO_PARSE mode.";
+ Files.writeString(testFile, content);
+
+ PipesForkParserConfig config = new PipesForkParserConfig()
+ .setPluginsDir(PLUGINS_DIR)
+ .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
+ .setParseMode(ParseMode.NO_PARSE)
+ .setTimeoutMillis(60000);
+
+ try (PipesForkParser parser = new PipesForkParser(config);
+ TikaInputStream tis = TikaInputStream.get(testFile)) {
+ PipesForkResult result = parser.parse(tis);
+
+ assertTrue(result.isSuccess(), "Parse should succeed. Status: " + result.getStatus()
+ + ", message: " + result.getMessage());
+
+ // In NO_PARSE mode, there should be exactly one metadata object
+ List<Metadata> metadataList = result.getMetadataList();
+ assertEquals(1, metadataList.size(), "NO_PARSE mode should return single metadata");
+
+ // Content type should be detected
+ Metadata metadata = metadataList.get(0);
+ String contentType = metadata.get(Metadata.CONTENT_TYPE);
+ assertNotNull(contentType, "Content type should be detected");
+ assertTrue(contentType.contains("text/plain"),
+ "Content type should be text/plain, got: " + contentType);
+
+ // No content should be extracted
+ String extractedContent = result.getContent();
+ assertTrue(extractedContent == null || extractedContent.isBlank(),
+ "NO_PARSE mode should not extract content, got: " + extractedContent);
+ }
+ }
+
+ @Test
+ public void testNoParseModeWithZip() throws Exception {
+ // Test NO_PARSE mode with a zip file - should NOT extract embedded files
+ Path testZip = createZipWithEmbeddedFiles("test_no_parse.zip",
+ "embedded1.txt", "Content from first embedded file",
+ "embedded2.txt", "Content from second embedded file");
+
+ PipesForkParserConfig config = new PipesForkParserConfig()
+ .setPluginsDir(PLUGINS_DIR)
+ .setHandlerType(BasicContentHandlerFactory.HANDLER_TYPE.TEXT)
+ .setParseMode(ParseMode.NO_PARSE)
+ .setTimeoutMillis(60000);
+
+ try (PipesForkParser parser = new PipesForkParser(config);
+ TikaInputStream tis = TikaInputStream.get(testZip)) {
+ PipesForkResult result = parser.parse(tis);
+
+ assertTrue(result.isSuccess(), "Parse should succeed");
+
+ // Should have exactly one metadata object (no embedded file extraction)
+ List<Metadata> metadataList = result.getMetadataList();
+ assertEquals(1, metadataList.size(),
+ "NO_PARSE mode should return only container metadata, not embedded files");
+
+ // Content type should be detected as zip
+ Metadata metadata = metadataList.get(0);
+ String contentType = metadata.get(Metadata.CONTENT_TYPE);
+ assertNotNull(contentType, "Content type should be detected");
+ assertTrue(contentType.contains("zip"),
+ "Content type should be zip, got: " + contentType);
+
+ // No content should be extracted
+ String extractedContent = result.getContent();
+ assertTrue(extractedContent == null || extractedContent.isBlank(),
+ "NO_PARSE mode should not extract content");
+ }
+ }
+
@Test
public void testRmetaModeWithEmbedded() throws Exception {
Path testZip = createZipWithEmbeddedFiles("test_rmeta_embedded.zip",