This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-4716
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 6bb95951d7de0c9750781df389fc16b3070bd785
Author: tallison <[email protected]>
AuthorDate: Thu Apr 9 08:30:05 2026 -0400

    TIKA-4716 -- refactor external parser for the three output options: stderr, stdout, file
---
 .../configuration/parsers/external-parser.adoc     |  54 +++++--
 .../tika/parser/external/ExternalParser.java       | 173 +++++++++++++--------
 .../tika/parser/external/ExternalParserConfig.java |  48 ++++--
 .../tika/parser/external/ExternalParserTest.java   |  50 +++---
 .../configs/TIKA-3557-exiftool-example.json        |   3 +-
 .../src/test/resources/configs/TIKA-3557.json      |   2 +-
 .../configs/external-parser-exiftool.json          |   3 +-
 .../resources/configs/external-parser-ffmpeg.json  |   3 +-
 .../resources/configs/external-parser-multi.json   |   6 +-
 .../resources/configs/external-parser-sox.json     |   3 +-
 10 files changed, 217 insertions(+), 128 deletions(-)

diff --git a/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc 
b/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc
index 25af4049d7..281bd6d12d 100644
--- a/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc
+++ b/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc
@@ -30,17 +30,30 @@ Each external parser can declare a `checkCommandLine` that 
verifies the tool
 is installed. The check runs lazily on first use (not at startup), and if the
 tool is not found, the parser silently disables itself.
 
-=== Output Parser vs Stderr Parser
+=== Stream Handlers
 
-External tools write useful output to different streams:
+An external process produces up to three output streams. Each can have an
+independent handler (any Tika parser):
 
-* **`outputParser`** -- processes stdout (or the output file). Use this for 
tools
-  like `exiftool` that write structured output to stdout.
-* **`stderrParser`** -- processes stderr. Use this for tools like `ffmpeg` and
-  `sox` that write metadata to stderr.
+* **`stdoutHandler`** -- processes stdout
+* **`stderrHandler`** -- processes stderr
+* **`outputFileHandler`** -- processes the output file (when `${OUTPUT_FILE}` 
is used)
 
-Both accept any Tika parser; `regex-capture-parser` is the most common choice
-for extracting metadata via regex patterns.
+Handlers extract metadata, content, or both. `regex-capture-parser` is the
+most common choice for extracting metadata via regex patterns.
+
+=== Content Source
+
+The `contentSource` field controls which stream provides the XHTML text 
content:
+
+* `"stdout"` -- default when no `${OUTPUT_FILE}` in the command
+* `"outputFile"` -- default when `${OUTPUT_FILE}` is in the command
+* `"stderr"` -- use stderr as the content source
+* `"none"` -- metadata-only mode, no text content extracted
+
+When a handler is configured for the content source stream, its
+ContentHandler output becomes the XHTML content. When no handler is
+configured, the raw bytes are written as text.
 
 == Configuration Options
 
@@ -60,13 +73,21 @@ for extracting metadata via regex patterns.
 |`List<Integer>`
 |Exit codes that indicate the tool is not available. Default: `[127]`.
 
-|`outputParser`
+|`stdoutHandler`
 |Parser config
-|Optional. Parser to process stdout or the output file.
+|Optional. Parser to process stdout.
 
-|`stderrParser`
+|`stderrHandler`
 |Parser config
-|Optional. Parser to process stderr (for metadata extraction).
+|Optional. Parser to process stderr.
+
+|`outputFileHandler`
+|Parser config
+|Optional. Parser to process the output file.
+
+|`contentSource`
+|`String`
+|Which stream provides XHTML content: `"stdout"`, `"stderr"`, `"outputFile"`, 
or `"none"`. Default depends on command.
 
 |`returnStdout`
 |`boolean`
@@ -93,7 +114,7 @@ for extracting metadata via regex patterns.
 
 === Exiftool (metadata from stdout)
 
-Extracts metadata from media files using `exiftool`. The `outputParser` uses
+Extracts metadata from media files using `exiftool`. The `stdoutHandler` uses
 `regex-capture-parser` to extract key-value pairs from exiftool's stdout.
 
 [source,json]
@@ -105,7 +126,7 @@ icon:github[] 
https://github.com/apache/tika/blob/main/tika-serialization/src/te
 === FFmpeg (metadata from stderr)
 
 Extracts audio/video metadata from `ffmpeg -i` output. FFmpeg writes metadata
-to stderr, so this uses `stderrParser` instead of `outputParser`.
+to stderr, so this uses `stderrHandler`.
 
 [source,json]
 ----
@@ -146,8 +167,9 @@ In Tika 4.x:
 
 * External parsers must be explicitly configured in JSON -- no auto-discovery.
 * The `checkCommandLine` runs lazily on first use, not at startup.
-* The `stderrParser` field replaces the inline regex-on-stderr metadata 
extraction.
-* The `external2` package has been renamed back to `external`.
+* Three independent stream handlers (`stdoutHandler`, `stderrHandler`,
+  `outputFileHandler`) replace the old `outputParser`/`stderrParser` split.
+* The `contentSource` field explicitly controls which stream provides text 
content.
 * `CompositeExternalParser`, `ExternalParsersFactory`, and the XML config
   reader have been removed.
 
diff --git 
a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java 
b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
index d6519fedb2..b66b468868 100644
--- 
a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
+++ 
b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java
@@ -45,7 +45,6 @@ import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.ExternalProcess;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.EmptyParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
@@ -58,10 +57,15 @@ import org.apache.tika.utils.ProcessUtils;
  * to extract text content and metadata from a given document.
  * <p>
  * This parser relies on JSON configuration rather than classpath 
auto-discovery.
- * Users can specify a parser to handle the output of the external process
- * (via {@code outputParser}) and/or a parser to extract metadata from stderr
- * (via {@code stderrParser}). An optional {@code checkCommandLine} can be
- * configured so that the parser lazily verifies the external tool is 
available.
+ * Users can specify independent handlers for each process stream:
+ * <ul>
+ *   <li>{@code stdoutHandler} — processes stdout</li>
+ *   <li>{@code stderrHandler} — processes stderr</li>
+ *   <li>{@code outputFileHandler} — processes the output file</li>
+ * </ul>
+ * The {@code contentSource} field controls which stream provides the XHTML
+ * content output. An optional {@code checkCommandLine} lazily verifies the
+ * external tool is available.
  */
 @TikaComponent
 public class ExternalParser implements Parser {
@@ -72,18 +76,24 @@ public class ExternalParser implements Parser {
 
     public static final String OUTPUT_FILE_TOKEN = "${OUTPUT_FILE}";
 
-    private static Pattern INPUT_TOKEN_MATCHER = 
Pattern.compile("\\$\\{INPUT_FILE}");
-    private static Pattern OUTPUT_TOKEN_MATCHER = 
Pattern.compile("\\$\\{OUTPUT_FILE}");
+    private static final Pattern INPUT_TOKEN_MATCHER =
+            Pattern.compile("\\$\\{INPUT_FILE}");
+    private static final Pattern OUTPUT_TOKEN_MATCHER =
+            Pattern.compile("\\$\\{OUTPUT_FILE}");
 
     private static final Logger LOG = 
LoggerFactory.getLogger(ExternalParser.class);
 
+    private static final ContentHandler DISCARD_HANDLER =
+            new org.xml.sax.helpers.DefaultHandler();
+
     private final ExternalParserConfig config;
 
     // Cached values derived from config
     private final Set<MediaType> supportedTypes;
     private final List<String> commandLine;
-    private final Parser outputParser;
-    private final Parser stderrParser;
+    private final Parser stdoutHandler;
+    private final Parser stderrHandler;
+    private final Parser outputFileHandler;
 
     // Lazy check state
     private final String[] checkCmd;
@@ -107,14 +117,15 @@ public class ExternalParser implements Parser {
             this.supportedTypes.add(MediaType.parse(s));
         }
         this.commandLine = new ArrayList<>(config.getCommandLine());
-        this.outputParser = config.getOutputParser() != null ?
-                config.getOutputParser() : EmptyParser.INSTANCE;
-        this.stderrParser = config.getStderrParser();
+        this.stdoutHandler = config.getStdoutHandler();
+        this.stderrHandler = config.getStderrHandler();
+        this.outputFileHandler = config.getOutputFileHandler();
 
         // Set up lazy check
         if (config.getCheckCommandLine() != null && 
!config.getCheckCommandLine().isEmpty()) {
             this.checkCmd = config.getCheckCommandLine().toArray(new 
String[0]);
-            if (config.getCheckErrorCodes() != null && 
!config.getCheckErrorCodes().isEmpty()) {
+            if (config.getCheckErrorCodes() != null &&
+                    !config.getCheckErrorCodes().isEmpty()) {
                 this.checkErrorCodes = config.getCheckErrorCodes().stream()
                         .mapToInt(Integer::intValue).toArray();
             } else {
@@ -150,39 +161,39 @@ public class ExternalParser implements Parser {
     @Override
     public void parse(TikaInputStream tis, ContentHandler handler, Metadata 
metadata,
                       ParseContext context) throws IOException, SAXException, 
TikaException {
-        //this may remain null, depending on whether the external parser 
writes to a file
         Path outFile = null;
         try (TemporaryResources tmp = new TemporaryResources()) {
             Path p = tis.getPath();
             List<String> thisCommandLine = new ArrayList<>();
             Matcher inputMatcher = INPUT_TOKEN_MATCHER.matcher("");
             Matcher outputMatcher = OUTPUT_TOKEN_MATCHER.matcher("");
-            boolean outputFileInCommandline = false;
+            boolean hasOutputFile = false;
             for (String c : commandLine) {
                 if (inputMatcher.reset(c).find()) {
                     String updated = c.replace(INPUT_FILE_TOKEN,
-                            
ProcessUtils.escapeCommandLine(p.toAbsolutePath().toString()));
+                            ProcessUtils.escapeCommandLine(
+                                    p.toAbsolutePath().toString()));
                     thisCommandLine.add(updated);
                 } else if (outputMatcher.reset(c).find()) {
                     outFile = Files.createTempFile("tika-external-", "");
                     String updated = c.replace(OUTPUT_FILE_TOKEN,
-                            
ProcessUtils.escapeCommandLine(outFile.toAbsolutePath().toString()));
+                            ProcessUtils.escapeCommandLine(
+                                    outFile.toAbsolutePath().toString()));
                     thisCommandLine.add(updated);
-                    outputFileInCommandline = true;
+                    hasOutputFile = true;
                 } else {
                     thisCommandLine.add(c);
                 }
             }
-            FileProcessResult result = null;
-            long localTimeoutMillis = 
TimeoutLimits.getProcessTimeoutMillis(context, config.getTimeoutMs());
-            if (outputFileInCommandline) {
-                result = ProcessUtils.execute(new 
ProcessBuilder(thisCommandLine),
-                        localTimeoutMillis, config.getMaxStdOut(), 
config.getMaxStdErr());
-            } else {
-                outFile = Files.createTempFile("tika-external-", "");
-                result = ProcessUtils.execute(new 
ProcessBuilder(thisCommandLine),
-                        localTimeoutMillis, outFile, config.getMaxStdErr());
-            }
+
+            // Always capture both stdout and stderr in memory
+            long localTimeoutMillis = TimeoutLimits.getProcessTimeoutMillis(
+                    context, config.getTimeoutMs());
+            FileProcessResult result = ProcessUtils.execute(
+                    new ProcessBuilder(thisCommandLine),
+                    localTimeoutMillis, config.getMaxStdOut(), 
config.getMaxStdErr());
+
+            // Set process metadata
             metadata.set(ExternalProcess.IS_TIMEOUT, result.isTimeout());
             metadata.set(ExternalProcess.EXIT_VALUE, result.getExitValue());
             TikaProgressTracker.update(context);
@@ -199,17 +210,32 @@ public class ExternalParser implements Parser {
             if (config.isReturnStderr()) {
                 metadata.set(ExternalProcess.STD_ERR, result.getStderr());
             }
-            if (stderrParser != null && result.getStderr() != null
-                    && !result.getStderr().isEmpty()) {
-                try (TikaInputStream stderrStream = TikaInputStream.get(
-                        result.getStderr().getBytes(StandardCharsets.UTF_8))) {
-                    stderrParser.parse(stderrStream, new 
org.xml.sax.helpers.DefaultHandler(),
-                            metadata, context);
-                }
+
+            // Determine content source
+            String effectiveContentSource = config.getContentSource();
+            if (effectiveContentSource == null) {
+                effectiveContentSource = hasOutputFile ? "outputFile" : 
"stdout";
             }
-            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, 
metadata, context);
+
+            XHTMLContentHandler xhtml =
+                    new XHTMLContentHandler(handler, metadata, context);
             xhtml.startDocument();
-            handleOutput(result, outFile, xhtml, metadata, context);
+
+            // Process each stream through its handler
+            handleStream(result.getStdout(), stdoutHandler,
+                    "stdout".equals(effectiveContentSource),
+                    xhtml, metadata, context);
+
+            handleStream(result.getStderr(), stderrHandler,
+                    "stderr".equals(effectiveContentSource),
+                    xhtml, metadata, context);
+
+            if (hasOutputFile && outFile != null) {
+                handleOutputFile(outFile, outputFileHandler,
+                        "outputFile".equals(effectiveContentSource),
+                        xhtml, metadata, context);
+            }
+
             xhtml.endDocument();
         } finally {
             if (outFile != null) {
@@ -218,45 +244,54 @@ public class ExternalParser implements Parser {
         }
     }
 
-    private void handleOutput(FileProcessResult result, Path outFile,
+    private void handleStream(String content, Parser handler, boolean 
isContentSource,
                               XHTMLContentHandler xhtml, Metadata metadata,
-                              ParseContext parseContext) throws SAXException, 
TikaException,
-            IOException {
-        if (outputParser == EmptyParser.INSTANCE) {
-            if (outFile != null) {
-                try (BufferedReader reader = Files.newBufferedReader(outFile)) 
{
-                    String line = reader.readLine();
-                    while (line != null) {
-                        //do we want to wrap this in <p></p> elements?
-                        xhtml.characters(line);
-                        xhtml.newline();
-                        line = reader.readLine();
-                    }
-                }
-            } else {
-                //read this in line by line and wrap <p></p> elements?
-                xhtml.characters(result.getStdout());
+                              ParseContext context)
+            throws IOException, SAXException, TikaException {
+        if (content == null || content.isEmpty()) {
+            return;
+        }
+        if (handler != null) {
+            ContentHandler target = isContentSource ?
+                    new BodyContentHandler(xhtml) : DISCARD_HANDLER;
+            try (TikaInputStream tis = TikaInputStream.get(
+                    content.getBytes(StandardCharsets.UTF_8))) {
+                handler.parse(tis, target, metadata, context);
             }
-        } else {
-            if (outFile != null) {
-                try (TikaInputStream tis = TikaInputStream.get(outFile)) {
-                    outputParser.parse(tis, new BodyContentHandler(xhtml), 
metadata, parseContext);
-                }
-            } else {
-                try (TikaInputStream tis = TikaInputStream.get(
-                        result.getStdout().getBytes(StandardCharsets.UTF_8))) {
-                    outputParser.parse(tis, new BodyContentHandler(xhtml), 
metadata, parseContext);
+        } else if (isContentSource) {
+            // No handler — write raw content as XHTML text
+            String[] lines = content.split("\n", -1);
+            for (int i = 0; i < lines.length; i++) {
+                xhtml.characters(lines[i]);
+                if (i < lines.length - 1) {
+                    xhtml.newline();
                 }
             }
         }
-
     }
 
-    /**
-     * Returns the output parser used to parse the external process output.
-     */
-    public Parser getOutputParser() {
-        return outputParser;
+    private void handleOutputFile(Path outFile, Parser handler,
+                                  boolean isContentSource,
+                                  XHTMLContentHandler xhtml, Metadata metadata,
+                                  ParseContext context)
+            throws IOException, SAXException, TikaException {
+        if (handler != null) {
+            ContentHandler target = isContentSource ?
+                    new BodyContentHandler(xhtml) : DISCARD_HANDLER;
+            try (TikaInputStream tis = TikaInputStream.get(outFile)) {
+                handler.parse(tis, target, metadata, context);
+            }
+        } else if (isContentSource) {
+            // No handler — write raw file content as XHTML text
+            try (BufferedReader reader = Files.newBufferedReader(outFile)) {
+                String line = reader.readLine();
+                while (line != null) {
+                    xhtml.characters(line);
+                    xhtml.newline();
+                    line = reader.readLine();
+                }
+            }
+        }
     }
 
     /**
diff --git 
a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParserConfig.java
 
b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParserConfig.java
index b15c5307e0..7a91300701 100644
--- 
a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParserConfig.java
+++ 
b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParserConfig.java
@@ -30,12 +30,14 @@ import org.apache.tika.parser.Parser;
  */
 public class ExternalParserConfig implements Serializable {
 
-    private static final long serialVersionUID = 1L;
+    private static final long serialVersionUID = 2L;
 
     private List<String> supportedTypes = new ArrayList<>();
     private List<String> commandLine = new ArrayList<>();
-    private Parser outputParser;
-    private Parser stderrParser;
+    private Parser stdoutHandler;
+    private Parser stderrHandler;
+    private Parser outputFileHandler;
+    private String contentSource;
     private List<String> checkCommandLine;
     private List<Integer> checkErrorCodes;
     private boolean returnStdout = false;
@@ -63,20 +65,44 @@ public class ExternalParserConfig implements Serializable {
         this.commandLine = commandLine;
     }
 
-    public Parser getOutputParser() {
-        return outputParser;
+    public Parser getStdoutHandler() {
+        return stdoutHandler;
     }
 
-    public void setOutputParser(Parser outputParser) {
-        this.outputParser = outputParser;
+    public void setStdoutHandler(Parser stdoutHandler) {
+        this.stdoutHandler = stdoutHandler;
     }
 
-    public Parser getStderrParser() {
-        return stderrParser;
+    public Parser getStderrHandler() {
+        return stderrHandler;
     }
 
-    public void setStderrParser(Parser stderrParser) {
-        this.stderrParser = stderrParser;
+    public void setStderrHandler(Parser stderrHandler) {
+        this.stderrHandler = stderrHandler;
+    }
+
+    public Parser getOutputFileHandler() {
+        return outputFileHandler;
+    }
+
+    public void setOutputFileHandler(Parser outputFileHandler) {
+        this.outputFileHandler = outputFileHandler;
+    }
+
+    /**
+     * Which stream provides the XHTML content output.
+     * <p>
+     * Valid values: {@code "stdout"}, {@code "stderr"}, {@code "outputFile"}, 
{@code "none"}.
+     * <p>
+     * If {@code null}, defaults to {@code "stdout"} when no {@code 
${OUTPUT_FILE}} token
+     * is in the command, or {@code "outputFile"} when it is.
+     */
+    public String getContentSource() {
+        return contentSource;
+    }
+
+    public void setContentSource(String contentSource) {
+        this.contentSource = contentSource;
     }
 
     public List<String> getCheckCommandLine() {
diff --git 
a/tika-serialization/src/test/java/org/apache/tika/parser/external/ExternalParserTest.java
 
b/tika-serialization/src/test/java/org/apache/tika/parser/external/ExternalParserTest.java
index 7cdbca5586..e8c9293d57 100644
--- 
a/tika-serialization/src/test/java/org/apache/tika/parser/external/ExternalParserTest.java
+++ 
b/tika-serialization/src/test/java/org/apache/tika/parser/external/ExternalParserTest.java
@@ -18,6 +18,7 @@ package org.apache.tika.parser.external;
 
 import static org.junit.jupiter.api.Assertions.assertEquals;
 import static org.junit.jupiter.api.Assertions.assertNotNull;
+import static org.junit.jupiter.api.Assertions.assertNull;
 import static org.junit.jupiter.api.Assertions.assertTrue;
 import static org.junit.jupiter.api.Assumptions.assumeTrue;
 
@@ -37,6 +38,7 @@ import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.ParserDecorator;
 import org.apache.tika.parser.RegexCaptureParser;
+import org.apache.tika.parser.RegexCaptureParserConfig;
 
 public class ExternalParserTest extends TikaTest {
 
@@ -49,13 +51,12 @@ public class ExternalParserTest extends TikaTest {
         CompositeParser p = (CompositeParser) loader.get(Parser.class);
         assertEquals(1, p.getAllComponentParsers().size());
         Parser parser = p.getAllComponentParsers().get(0);
-        // When _mime-include is used, the parser is wrapped in a 
ParserDecorator
         ExternalParser externalParser = (parser instanceof ParserDecorator)
                 ? (ExternalParser) ((ParserDecorator) 
parser).getWrappedParser()
                 : (ExternalParser) parser;
 
-        Parser outputParser = externalParser.getOutputParser();
-        assertEquals(RegexCaptureParser.class, outputParser.getClass());
+        Parser stdoutHandler = externalParser.getConfig().getStdoutHandler();
+        assertEquals(RegexCaptureParser.class, stdoutHandler.getClass());
 
         Metadata m = new Metadata();
         ContentHandler contentHandler = new DefaultHandler();
@@ -64,34 +65,37 @@ public class ExternalParserTest extends TikaTest {
                 "Author: jumped over\n" +
                 "Created: 10/20/2024";
         try (TikaInputStream tis = 
TikaInputStream.get(output.getBytes(StandardCharsets.UTF_8))) {
-            outputParser.parse(tis, contentHandler, m, new ParseContext());
+            stdoutHandler.parse(tis, contentHandler, m, new ParseContext());
         }
         assertEquals("the quick brown fox", m.get("title"));
     }
 
     @Test
     public void testConfigBasic() throws Exception {
-        assumeTrue(org.apache.tika.utils.ProcessUtils.checkCommand(new 
String[]{"file", "--version"}));
-        TikaLoader loader = TikaLoader.load(getConfigPath(getClass(), 
"TIKA-3557-no-output-parser.json"));
+        assumeTrue(org.apache.tika.utils.ProcessUtils.checkCommand(
+                new String[]{"file", "--version"}));
+        TikaLoader loader = TikaLoader.load(
+                getConfigPath(getClass(), "TIKA-3557-no-output-parser.json"));
         CompositeParser p = (CompositeParser) loader.get(Parser.class);
         assertEquals(1, p.getAllComponentParsers().size());
         Parser parser = p.getAllComponentParsers().get(0);
-        // When _mime-include is used, the parser is wrapped in a 
ParserDecorator
         ExternalParser externalParser = (parser instanceof ParserDecorator)
                 ? (ExternalParser) ((ParserDecorator) 
parser).getWrappedParser()
                 : (ExternalParser) parser;
 
+        // No handler — raw stdout becomes content (default 
contentSource=stdout)
+        assertNull(externalParser.getConfig().getStdoutHandler());
         XMLResult xmlResult = getXML("example.xml", externalParser);
-        assertContains("<body>text/xml</body>", 
xmlResult.xml.replaceAll("[\r\n]", ""));
+        assertContains("text/xml", xmlResult.xml);
     }
 
     @Test
     public void testExifTool() throws Exception {
-        assumeTrue(org.apache.tika.utils.ProcessUtils.checkCommand(new 
String[]{"exiftool",
-                "-ver"}));
-        TikaLoader loader = TikaLoader.load(getConfigPath(getClass(), 
"TIKA-3557-exiftool-example.json"));
+        assumeTrue(org.apache.tika.utils.ProcessUtils.checkCommand(
+                new String[]{"exiftool", "-ver"}));
+        TikaLoader loader = TikaLoader.load(
+                getConfigPath(getClass(), "TIKA-3557-exiftool-example.json"));
         Parser p = loader.loadAutoDetectParser();
-        //this was the smallest pdf we had
         List<Metadata> metadataList = 
getRecursiveMetadata("testOverlappingText.pdf", p);
         assertEquals(1, metadataList.size());
         Metadata m = metadataList.get(0);
@@ -115,8 +119,9 @@ public class ExternalParserTest extends TikaTest {
         assertNotNull(config.getCheckCommandLine());
         assertEquals(List.of("ffmpeg", "-version"), 
config.getCheckCommandLine());
         assertEquals(List.of(126, 127), config.getCheckErrorCodes());
-        assertNotNull(config.getStderrParser());
-        assertEquals(RegexCaptureParser.class, 
config.getStderrParser().getClass());
+        assertNotNull(config.getStderrHandler());
+        assertEquals(RegexCaptureParser.class, 
config.getStderrHandler().getClass());
+        assertEquals("none", config.getContentSource());
         assertTrue(config.isReturnStderr());
     }
 
@@ -134,27 +139,22 @@ public class ExternalParserTest extends TikaTest {
         ExternalParserConfig config = externalParser.getConfig();
         assertNotNull(config.getCheckCommandLine());
         assertEquals(List.of("sox", "--version"), 
config.getCheckCommandLine());
-        assertNotNull(config.getStderrParser());
-        assertEquals(RegexCaptureParser.class, 
config.getStderrParser().getClass());
+        assertNotNull(config.getStderrHandler());
+        assertEquals(RegexCaptureParser.class, 
config.getStderrHandler().getClass());
     }
 
     @Test
-    public void testStderrParserExtractsMetadata() throws Exception {
-        // Simulate what would happen with ffmpeg stderr output
-        RegexCaptureParser stderrParser = new RegexCaptureParser();
-        // Build a config with a captureMap programmatically
+    public void testStderrHandlerExtractsMetadata() throws Exception {
         String ffmpegStderr = "  Duration: 00:02:30.50, start: 0.000000, 
bitrate: 706 kb/s\n" +
                 "    Stream #0:0: Video: h264 (High), yuv420p, 1280x720, 25 
fps\n" +
                 "    Stream #0:1: Audio: aac, 44100 Hz, 2 channels, fltp\n";
 
-        // Use the regex-capture-parser with the same patterns from the ffmpeg 
config
         java.util.Map<String, String> captureMap = new 
java.util.LinkedHashMap<>();
         captureMap.put("xmpDM:duration", "\\s*Duration:\\s*([0-9:\\.]+),.*");
         captureMap.put("xmpDM:audioSampleRate",
                 "\\s*Stream.*:.+Audio:.*,\\s+(\\d+)\\s+Hz,.*");
 
-        org.apache.tika.parser.RegexCaptureParserConfig regexConfig =
-                new org.apache.tika.parser.RegexCaptureParserConfig();
+        RegexCaptureParserConfig regexConfig = new RegexCaptureParserConfig();
         regexConfig.setCaptureMap(captureMap);
         RegexCaptureParser parser = new RegexCaptureParser(regexConfig);
 
@@ -179,13 +179,13 @@ public class ExternalParserTest extends TikaTest {
         CompositeParser composite = (CompositeParser) loader.get(Parser.class);
         List<Parser> allParsers = composite.getAllComponentParsers();
 
-        // Should have two separate external parser instances
         assertEquals(2, allParsers.size(), "Expected 2 external parsers but 
got " +
                 allParsers.size() + ": " + allParsers);
 
         // Test the exiftool parser on a PDF
         Parser autoDetect = loader.loadAutoDetectParser();
-        List<Metadata> metadataList = 
getRecursiveMetadata("testOverlappingText.pdf", autoDetect);
+        List<Metadata> metadataList = 
getRecursiveMetadata("testOverlappingText.pdf",
+                autoDetect);
         assertEquals(1, metadataList.size());
         Metadata m = metadataList.get(0);
         assertEquals("application/pdf", m.get("exiftool:MIMEType"));
diff --git 
a/tika-serialization/src/test/resources/configs/TIKA-3557-exiftool-example.json 
b/tika-serialization/src/test/resources/configs/TIKA-3557-exiftool-example.json
index 8e61008654..73ed0ba1ef 100644
--- 
a/tika-serialization/src/test/resources/configs/TIKA-3557-exiftool-example.json
+++ 
b/tika-serialization/src/test/resources/configs/TIKA-3557-exiftool-example.json
@@ -4,7 +4,8 @@
       "external-parser": {
         "_mime-include": ["application/octet-stream"],
         "commandLine": ["exiftool", "${INPUT_FILE}"],
-        "outputParser": {
+        "contentSource": "none",
+        "stdoutHandler": {
           "regex-capture-parser": {
             "captureMap": {
               "mime": "^MIME Type\\s+: ([^\\r\\n]+)",
diff --git a/tika-serialization/src/test/resources/configs/TIKA-3557.json 
b/tika-serialization/src/test/resources/configs/TIKA-3557.json
index cd3af89821..a5937bedf6 100644
--- a/tika-serialization/src/test/resources/configs/TIKA-3557.json
+++ b/tika-serialization/src/test/resources/configs/TIKA-3557.json
@@ -4,7 +4,7 @@
       "external-parser": {
         "_mime-include": ["application/xml"],
         "commandLine": ["file", "-b", "--mime-type", "${INPUT_FILE}"],
-        "outputParser": {
+        "stdoutHandler": {
           "regex-capture-parser": {
             "captureMap": {
               "title": "^Title: ([^\\r\\n]+)"
diff --git 
a/tika-serialization/src/test/resources/configs/external-parser-exiftool.json 
b/tika-serialization/src/test/resources/configs/external-parser-exiftool.json
index a0bffc78e9..fbbed46e68 100644
--- 
a/tika-serialization/src/test/resources/configs/external-parser-exiftool.json
+++ 
b/tika-serialization/src/test/resources/configs/external-parser-exiftool.json
@@ -11,7 +11,8 @@
         "commandLine": ["exiftool", "${INPUT_FILE}"],
         "checkCommandLine": ["exiftool", "-ver"],
         "checkErrorCodes": [126, 127],
-        "outputParser": {
+        "contentSource": "none",
+        "stdoutHandler": {
           "regex-capture-parser": {
             "captureMap": {
               "mime": "^MIME Type\\s+: ([^\\r\\n]+)",
diff --git 
a/tika-serialization/src/test/resources/configs/external-parser-ffmpeg.json 
b/tika-serialization/src/test/resources/configs/external-parser-ffmpeg.json
index 0aa55a5fbc..3d9dd70bba 100644
--- a/tika-serialization/src/test/resources/configs/external-parser-ffmpeg.json
+++ b/tika-serialization/src/test/resources/configs/external-parser-ffmpeg.json
@@ -10,9 +10,10 @@
         "commandLine": ["ffmpeg", "-i", "${INPUT_FILE}"],
         "checkCommandLine": ["ffmpeg", "-version"],
         "checkErrorCodes": [126, 127],
+        "contentSource": "none",
         "returnStderr": true,
         "maxStdErr": 20000,
-        "stderrParser": {
+        "stderrHandler": {
           "regex-capture-parser": {
             "captureMap": {
               "xmpDM:audioSampleRate": 
"\\s*Stream.*:.+Audio:.*,\\s+(\\d+)\\s+Hz,.*",
diff --git 
a/tika-serialization/src/test/resources/configs/external-parser-multi.json 
b/tika-serialization/src/test/resources/configs/external-parser-multi.json
index f1d5c5f59a..cce81a55fe 100644
--- a/tika-serialization/src/test/resources/configs/external-parser-multi.json
+++ b/tika-serialization/src/test/resources/configs/external-parser-multi.json
@@ -12,7 +12,8 @@
         "checkErrorCodes": [126, 127],
         "returnStderr": true,
         "maxStdErr": 20000,
-        "stderrParser": {
+        "contentSource": "none",
+        "stderrHandler": {
           "regex-capture-parser": {
             "captureMap": {
               "xmpDM:duration": "\\s*Duration:\\s*([0-9:\\.]+),.*",
@@ -30,7 +31,8 @@
         "commandLine": ["exiftool", "${INPUT_FILE}"],
         "checkCommandLine": ["exiftool", "-ver"],
         "checkErrorCodes": [126, 127],
-        "outputParser": {
+        "contentSource": "none",
+        "stdoutHandler": {
           "regex-capture-parser": {
             "captureMap": {
               "exiftool:MIMEType": "^MIME Type\\s+: ([^\\r\\n]+)",
diff --git 
a/tika-serialization/src/test/resources/configs/external-parser-sox.json 
b/tika-serialization/src/test/resources/configs/external-parser-sox.json
index b8ae108d71..39ac93c334 100644
--- a/tika-serialization/src/test/resources/configs/external-parser-sox.json
+++ b/tika-serialization/src/test/resources/configs/external-parser-sox.json
@@ -16,7 +16,8 @@
         "checkErrorCodes": [126, 127],
         "returnStderr": true,
         "maxStdErr": 10000,
-        "stderrParser": {
+        "contentSource": "none",
+        "stderrHandler": {
           "regex-capture-parser": {
             "captureMap": {
               "xmpDM:audioChannelType": "\\s*Channels.*:\\s+(\\d+)\\s*",

Reply via email to