This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4716 in repository https://gitbox.apache.org/repos/asf/tika.git
commit 6bb95951d7de0c9750781df389fc16b3070bd785 Author: tallison <[email protected]> AuthorDate: Thu Apr 9 08:30:05 2026 -0400 TIKA-4716 -- refactor external parser for the three output options: stderr, stdout, file --- .../configuration/parsers/external-parser.adoc | 54 +++++-- .../tika/parser/external/ExternalParser.java | 173 +++++++++++++-------- .../tika/parser/external/ExternalParserConfig.java | 48 ++++-- .../tika/parser/external/ExternalParserTest.java | 50 +++--- .../configs/TIKA-3557-exiftool-example.json | 3 +- .../src/test/resources/configs/TIKA-3557.json | 2 +- .../configs/external-parser-exiftool.json | 3 +- .../resources/configs/external-parser-ffmpeg.json | 3 +- .../resources/configs/external-parser-multi.json | 6 +- .../resources/configs/external-parser-sox.json | 3 +- 10 files changed, 217 insertions(+), 128 deletions(-) diff --git a/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc b/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc index 25af4049d7..281bd6d12d 100644 --- a/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc +++ b/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc @@ -30,17 +30,30 @@ Each external parser can declare a `checkCommandLine` that verifies the tool is installed. The check runs lazily on first use (not at startup), and if the tool is not found, the parser silently disables itself. -=== Output Parser vs Stderr Parser +=== Stream Handlers -External tools write useful output to different streams: +An external process produces up to three output streams. Each can have an +independent handler (any Tika parser): -* **`outputParser`** -- processes stdout (or the output file). Use this for tools - like `exiftool` that write structured output to stdout. -* **`stderrParser`** -- processes stderr. Use this for tools like `ffmpeg` and - `sox` that write metadata to stderr. +* **`stdoutHandler`** -- processes stdout +* **`stderrHandler`** -- processes stderr +* **`outputFileHandler`** -- processes the output file (when `${OUTPUT_FILE}` is used) -Both accept any Tika parser; `regex-capture-parser` is the most common choice -for extracting metadata via regex patterns. +Handlers extract metadata, content, or both. `regex-capture-parser` is the +most common choice for extracting metadata via regex patterns. + +=== Content Source + +The `contentSource` field controls which stream provides the XHTML text content: + +* `"stdout"` -- default when no `${OUTPUT_FILE}` in the command +* `"outputFile"` -- default when `${OUTPUT_FILE}` is in the command +* `"stderr"` -- use stderr as the content source +* `"none"` -- metadata-only mode, no text content extracted + +When a handler is configured for the content source stream, its +ContentHandler output becomes the XHTML content. When no handler is +configured, the raw bytes are written as text. == Configuration Options @@ -60,13 +73,21 @@ for extracting metadata via regex patterns. |`List<Integer>` |Exit codes that indicate the tool is not available. Default: `[127]`. -|`outputParser` +|`stdoutHandler` |Parser config -|Optional. Parser to process stdout or the output file. +|Optional. Parser to process stdout. -|`stderrParser` +|`stderrHandler` |Parser config -|Optional. Parser to process stderr (for metadata extraction). +|Optional. Parser to process stderr. + +|`outputFileHandler` +|Parser config +|Optional. Parser to process the output file. + +|`contentSource` +|`String` +|Which stream provides XHTML content: `"stdout"`, `"stderr"`, `"outputFile"`, or `"none"`. Default depends on command. |`returnStdout` |`boolean` @@ -93,7 +114,7 @@ for extracting metadata via regex patterns. === Exiftool (metadata from stdout) -Extracts metadata from media files using `exiftool`. The `outputParser` uses +Extracts metadata from media files using `exiftool`. The `stdoutHandler` uses `regex-capture-parser` to extract key-value pairs from exiftool's stdout. [source,json] @@ -105,7 +126,7 @@ icon:github[] https://github.com/apache/tika/blob/main/tika-serialization/src/te === FFmpeg (metadata from stderr) Extracts audio/video metadata from `ffmpeg -i` output. FFmpeg writes metadata -to stderr, so this uses `stderrParser` instead of `outputParser`. +to stderr, so this uses `stderrHandler`. [source,json] ---- @@ -146,8 +167,9 @@ In Tika 4.x: * External parsers must be explicitly configured in JSON -- no auto-discovery. * The `checkCommandLine` runs lazily on first use, not at startup. -* The `stderrParser` field replaces the inline regex-on-stderr metadata extraction. -* The `external2` package has been renamed back to `external`. +* Three independent stream handlers (`stdoutHandler`, `stderrHandler`, + `outputFileHandler`) replace the old `outputParser`/`stderrParser` split. +* The `contentSource` field explicitly controls which stream provides text content. * `CompositeExternalParser`, `ExternalParsersFactory`, and the XML config reader have been removed. diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java index d6519fedb2..b66b468868 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java @@ -45,7 +45,6 @@ import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.ExternalProcess; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.EmptyParser; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.sax.BodyContentHandler; @@ -58,10 +57,15 @@ import org.apache.tika.utils.ProcessUtils; * to extract text content and metadata from a given document. * <p> * This parser relies on JSON configuration rather than classpath auto-discovery. - * Users can specify a parser to handle the output of the external process - * (via {@code outputParser}) and/or a parser to extract metadata from stderr - * (via {@code stderrParser}). An optional {@code checkCommandLine} can be - * configured so that the parser lazily verifies the external tool is available. + * Users can specify independent handlers for each process stream: + * <ul> + * <li>{@code stdoutHandler} — processes stdout</li> + * <li>{@code stderrHandler} — processes stderr</li> + * <li>{@code outputFileHandler} — processes the output file</li> + * </ul> + * The {@code contentSource} field controls which stream provides the XHTML + * content output. An optional {@code checkCommandLine} lazily verifies the + * external tool is available. */ @TikaComponent public class ExternalParser implements Parser { @@ -72,18 +76,24 @@ public class ExternalParser implements Parser { public static final String OUTPUT_FILE_TOKEN = "${OUTPUT_FILE}"; - private static Pattern INPUT_TOKEN_MATCHER = Pattern.compile("\\$\\{INPUT_FILE}"); - private static Pattern OUTPUT_TOKEN_MATCHER = Pattern.compile("\\$\\{OUTPUT_FILE}"); + private static final Pattern INPUT_TOKEN_MATCHER = + Pattern.compile("\\$\\{INPUT_FILE}"); + private static final Pattern OUTPUT_TOKEN_MATCHER = + Pattern.compile("\\$\\{OUTPUT_FILE}"); private static final Logger LOG = LoggerFactory.getLogger(ExternalParser.class); + private static final ContentHandler DISCARD_HANDLER = + new org.xml.sax.helpers.DefaultHandler(); + private final ExternalParserConfig config; // Cached values derived from config private final Set<MediaType> supportedTypes; private final List<String> commandLine; - private final Parser outputParser; - private final Parser stderrParser; + private final Parser stdoutHandler; + private final Parser stderrHandler; + private final Parser outputFileHandler; // Lazy check state private final String[] checkCmd; @@ -107,14 +117,15 @@ public class ExternalParser implements Parser { this.supportedTypes.add(MediaType.parse(s)); } this.commandLine = new ArrayList<>(config.getCommandLine()); - this.outputParser = config.getOutputParser() != null ? - config.getOutputParser() : EmptyParser.INSTANCE; - this.stderrParser = config.getStderrParser(); + this.stdoutHandler = config.getStdoutHandler(); + this.stderrHandler = config.getStderrHandler(); + this.outputFileHandler = config.getOutputFileHandler(); // Set up lazy check if (config.getCheckCommandLine() != null && !config.getCheckCommandLine().isEmpty()) { this.checkCmd = config.getCheckCommandLine().toArray(new String[0]); - if (config.getCheckErrorCodes() != null && !config.getCheckErrorCodes().isEmpty()) { + if (config.getCheckErrorCodes() != null && + !config.getCheckErrorCodes().isEmpty()) { this.checkErrorCodes = config.getCheckErrorCodes().stream() .mapToInt(Integer::intValue).toArray(); } else { @@ -150,39 +161,39 @@ public class ExternalParser implements Parser { @Override public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { - //this may remain null, depending on whether the external parser writes to a file Path outFile = null; try (TemporaryResources tmp = new TemporaryResources()) { Path p = tis.getPath(); List<String> thisCommandLine = new ArrayList<>(); Matcher inputMatcher = INPUT_TOKEN_MATCHER.matcher(""); Matcher outputMatcher = OUTPUT_TOKEN_MATCHER.matcher(""); - boolean outputFileInCommandline = false; + boolean hasOutputFile = false; for (String c : commandLine) { if (inputMatcher.reset(c).find()) { String updated = c.replace(INPUT_FILE_TOKEN, - ProcessUtils.escapeCommandLine(p.toAbsolutePath().toString())); + ProcessUtils.escapeCommandLine( + p.toAbsolutePath().toString())); thisCommandLine.add(updated); } else if (outputMatcher.reset(c).find()) { outFile = Files.createTempFile("tika-external-", ""); String updated = c.replace(OUTPUT_FILE_TOKEN, - ProcessUtils.escapeCommandLine(outFile.toAbsolutePath().toString())); + ProcessUtils.escapeCommandLine( + outFile.toAbsolutePath().toString())); thisCommandLine.add(updated); - outputFileInCommandline = true; + hasOutputFile = true; } else { thisCommandLine.add(c); } } - FileProcessResult result = null; - long localTimeoutMillis = TimeoutLimits.getProcessTimeoutMillis(context, config.getTimeoutMs()); - if (outputFileInCommandline) { - result = ProcessUtils.execute(new ProcessBuilder(thisCommandLine), - localTimeoutMillis, config.getMaxStdOut(), config.getMaxStdErr()); - } else { - outFile = Files.createTempFile("tika-external-", ""); - result = ProcessUtils.execute(new ProcessBuilder(thisCommandLine), - localTimeoutMillis, outFile, config.getMaxStdErr()); - } + + // Always capture both stdout and stderr in memory + long localTimeoutMillis = TimeoutLimits.getProcessTimeoutMillis( + context, config.getTimeoutMs()); + FileProcessResult result = ProcessUtils.execute( + new ProcessBuilder(thisCommandLine), + localTimeoutMillis, config.getMaxStdOut(), config.getMaxStdErr()); + + // Set process metadata metadata.set(ExternalProcess.IS_TIMEOUT, result.isTimeout()); metadata.set(ExternalProcess.EXIT_VALUE, result.getExitValue()); TikaProgressTracker.update(context); @@ -199,17 +210,32 @@ public class ExternalParser implements Parser { if (config.isReturnStderr()) { metadata.set(ExternalProcess.STD_ERR, result.getStderr()); } - if (stderrParser != null && result.getStderr() != null - && !result.getStderr().isEmpty()) { - try (TikaInputStream stderrStream = TikaInputStream.get( - result.getStderr().getBytes(StandardCharsets.UTF_8))) { - stderrParser.parse(stderrStream, new org.xml.sax.helpers.DefaultHandler(), - metadata, context); - } + + // Determine content source + String effectiveContentSource = config.getContentSource(); + if (effectiveContentSource == null) { + effectiveContentSource = hasOutputFile ? "outputFile" : "stdout"; } - XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context); + + XHTMLContentHandler xhtml = + new XHTMLContentHandler(handler, metadata, context); xhtml.startDocument(); - handleOutput(result, outFile, xhtml, metadata, context); + + // Process each stream through its handler + handleStream(result.getStdout(), stdoutHandler, + "stdout".equals(effectiveContentSource), + xhtml, metadata, context); + + handleStream(result.getStderr(), stderrHandler, + "stderr".equals(effectiveContentSource), + xhtml, metadata, context); + + if (hasOutputFile && outFile != null) { + handleOutputFile(outFile, outputFileHandler, + "outputFile".equals(effectiveContentSource), + xhtml, metadata, context); + } + xhtml.endDocument(); } finally { if (outFile != null) { @@ -218,45 +244,54 @@ public class ExternalParser implements Parser { } } - private void handleOutput(FileProcessResult result, Path outFile, + private void handleStream(String content, Parser handler, boolean isContentSource, XHTMLContentHandler xhtml, Metadata metadata, - ParseContext parseContext) throws SAXException, TikaException, - IOException { - if (outputParser == EmptyParser.INSTANCE) { - if (outFile != null) { - try (BufferedReader reader = Files.newBufferedReader(outFile)) { - String line = reader.readLine(); - while (line != null) { - //do we want to wrap this in <p></p> elements? - xhtml.characters(line); - xhtml.newline(); - line = reader.readLine(); - } - } - } else { - //read this in line by line and wrap <p></p> elements? - xhtml.characters(result.getStdout()); + ParseContext context) + throws IOException, SAXException, TikaException { + if (content == null || content.isEmpty()) { + return; + } + if (handler != null) { + ContentHandler target = isContentSource ? + new BodyContentHandler(xhtml) : DISCARD_HANDLER; + try (TikaInputStream tis = TikaInputStream.get( + content.getBytes(StandardCharsets.UTF_8))) { + handler.parse(tis, target, metadata, context); } - } else { - if (outFile != null) { - try (TikaInputStream tis = TikaInputStream.get(outFile)) { - outputParser.parse(tis, new BodyContentHandler(xhtml), metadata, parseContext); - } - } else { - try (TikaInputStream tis = TikaInputStream.get( - result.getStdout().getBytes(StandardCharsets.UTF_8))) { - outputParser.parse(tis, new BodyContentHandler(xhtml), metadata, parseContext); + } else if (isContentSource) { + // No handler — write raw content as XHTML text + String[] lines = content.split("\n", -1); + for (int i = 0; i < lines.length; i++) { + xhtml.characters(lines[i]); + if (i < lines.length - 1) { + xhtml.newline(); } } } - } - /** - * Returns the output parser used to parse the external process output. - */ - public Parser getOutputParser() { - return outputParser; + private void handleOutputFile(Path outFile, Parser handler, + boolean isContentSource, + XHTMLContentHandler xhtml, Metadata metadata, + ParseContext context) + throws IOException, SAXException, TikaException { + if (handler != null) { + ContentHandler target = isContentSource ? + new BodyContentHandler(xhtml) : DISCARD_HANDLER; + try (TikaInputStream tis = TikaInputStream.get(outFile)) { + handler.parse(tis, target, metadata, context); + } + } else if (isContentSource) { + // No handler — write raw file content as XHTML text + try (BufferedReader reader = Files.newBufferedReader(outFile)) { + String line = reader.readLine(); + while (line != null) { + xhtml.characters(line); + xhtml.newline(); + line = reader.readLine(); + } + } + } } /** diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParserConfig.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParserConfig.java index b15c5307e0..7a91300701 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParserConfig.java +++ b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParserConfig.java @@ -30,12 +30,14 @@ import org.apache.tika.parser.Parser; */ public class ExternalParserConfig implements Serializable { - private static final long serialVersionUID = 1L; + private static final long serialVersionUID = 2L; private List<String> supportedTypes = new ArrayList<>(); private List<String> commandLine = new ArrayList<>(); - private Parser outputParser; - private Parser stderrParser; + private Parser stdoutHandler; + private Parser stderrHandler; + private Parser outputFileHandler; + private String contentSource; private List<String> checkCommandLine; private List<Integer> checkErrorCodes; private boolean returnStdout = false; @@ -63,20 +65,44 @@ public class ExternalParserConfig implements Serializable { this.commandLine = commandLine; } - public Parser getOutputParser() { - return outputParser; + public Parser getStdoutHandler() { + return stdoutHandler; } - public void setOutputParser(Parser outputParser) { - this.outputParser = outputParser; + public void setStdoutHandler(Parser stdoutHandler) { + this.stdoutHandler = stdoutHandler; } - public Parser getStderrParser() { - return stderrParser; + public Parser getStderrHandler() { + return stderrHandler; } - public void setStderrParser(Parser stderrParser) { - this.stderrParser = stderrParser; + public void setStderrHandler(Parser stderrHandler) { + this.stderrHandler = stderrHandler; + } + + public Parser getOutputFileHandler() { + return outputFileHandler; + } + + public void setOutputFileHandler(Parser outputFileHandler) { + this.outputFileHandler = outputFileHandler; + } + + /** + * Which stream provides the XHTML content output. + * <p> + * Valid values: {@code "stdout"}, {@code "stderr"}, {@code "outputFile"}, {@code "none"}. + * <p> + * If {@code null}, defaults to {@code "stdout"} when no {@code ${OUTPUT_FILE}} token + * is in the command, or {@code "outputFile"} when it is. + */ + public String getContentSource() { + return contentSource; + } + + public void setContentSource(String contentSource) { + this.contentSource = contentSource; } public List<String> getCheckCommandLine() { diff --git a/tika-serialization/src/test/java/org/apache/tika/parser/external/ExternalParserTest.java b/tika-serialization/src/test/java/org/apache/tika/parser/external/ExternalParserTest.java index 7cdbca5586..e8c9293d57 100644 --- a/tika-serialization/src/test/java/org/apache/tika/parser/external/ExternalParserTest.java +++ b/tika-serialization/src/test/java/org/apache/tika/parser/external/ExternalParserTest.java @@ -18,6 +18,7 @@ package org.apache.tika.parser.external; import static org.junit.jupiter.api.Assertions.assertEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertNull; import static org.junit.jupiter.api.Assertions.assertTrue; import static org.junit.jupiter.api.Assumptions.assumeTrue; @@ -37,6 +38,7 @@ import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.ParserDecorator; import org.apache.tika.parser.RegexCaptureParser; +import org.apache.tika.parser.RegexCaptureParserConfig; public class ExternalParserTest extends TikaTest { @@ -49,13 +51,12 @@ public class ExternalParserTest extends TikaTest { CompositeParser p = (CompositeParser) loader.get(Parser.class); assertEquals(1, p.getAllComponentParsers().size()); Parser parser = p.getAllComponentParsers().get(0); - // When _mime-include is used, the parser is wrapped in a ParserDecorator ExternalParser externalParser = (parser instanceof ParserDecorator) ? (ExternalParser) ((ParserDecorator) parser).getWrappedParser() : (ExternalParser) parser; - Parser outputParser = externalParser.getOutputParser(); - assertEquals(RegexCaptureParser.class, outputParser.getClass()); + Parser stdoutHandler = externalParser.getConfig().getStdoutHandler(); + assertEquals(RegexCaptureParser.class, stdoutHandler.getClass()); Metadata m = new Metadata(); ContentHandler contentHandler = new DefaultHandler(); @@ -64,34 +65,37 @@ public class ExternalParserTest extends TikaTest { "Author: jumped over\n" + "Created: 10/20/2024"; try (TikaInputStream tis = TikaInputStream.get(output.getBytes(StandardCharsets.UTF_8))) { - outputParser.parse(tis, contentHandler, m, new ParseContext()); + stdoutHandler.parse(tis, contentHandler, m, new ParseContext()); } assertEquals("the quick brown fox", m.get("title")); } @Test public void testConfigBasic() throws Exception { - assumeTrue(org.apache.tika.utils.ProcessUtils.checkCommand(new String[]{"file", "--version"})); - TikaLoader loader = TikaLoader.load(getConfigPath(getClass(), "TIKA-3557-no-output-parser.json")); + assumeTrue(org.apache.tika.utils.ProcessUtils.checkCommand( + new String[]{"file", "--version"})); + TikaLoader loader = TikaLoader.load( + getConfigPath(getClass(), "TIKA-3557-no-output-parser.json")); CompositeParser p = (CompositeParser) loader.get(Parser.class); assertEquals(1, p.getAllComponentParsers().size()); Parser parser = p.getAllComponentParsers().get(0); - // When _mime-include is used, the parser is wrapped in a ParserDecorator ExternalParser externalParser = (parser instanceof ParserDecorator) ? (ExternalParser) ((ParserDecorator) parser).getWrappedParser() : (ExternalParser) parser; + // No handler — raw stdout becomes content (default contentSource=stdout) + assertNull(externalParser.getConfig().getStdoutHandler()); XMLResult xmlResult = getXML("example.xml", externalParser); - assertContains("<body>text/xml</body>", xmlResult.xml.replaceAll("[\r\n]", "")); + assertContains("text/xml", xmlResult.xml); } @Test public void testExifTool() throws Exception { - assumeTrue(org.apache.tika.utils.ProcessUtils.checkCommand(new String[]{"exiftool", - "-ver"})); - TikaLoader loader = TikaLoader.load(getConfigPath(getClass(), "TIKA-3557-exiftool-example.json")); + assumeTrue(org.apache.tika.utils.ProcessUtils.checkCommand( + new String[]{"exiftool", "-ver"})); + TikaLoader loader = TikaLoader.load( + getConfigPath(getClass(), "TIKA-3557-exiftool-example.json")); Parser p = loader.loadAutoDetectParser(); - //this was the smallest pdf we had List<Metadata> metadataList = getRecursiveMetadata("testOverlappingText.pdf", p); assertEquals(1, metadataList.size()); Metadata m = metadataList.get(0); @@ -115,8 +119,9 @@ public class ExternalParserTest extends TikaTest { assertNotNull(config.getCheckCommandLine()); assertEquals(List.of("ffmpeg", "-version"), config.getCheckCommandLine()); assertEquals(List.of(126, 127), config.getCheckErrorCodes()); - assertNotNull(config.getStderrParser()); - assertEquals(RegexCaptureParser.class, config.getStderrParser().getClass()); + assertNotNull(config.getStderrHandler()); + assertEquals(RegexCaptureParser.class, config.getStderrHandler().getClass()); + assertEquals("none", config.getContentSource()); assertTrue(config.isReturnStderr()); } @@ -134,27 +139,22 @@ public class ExternalParserTest extends TikaTest { ExternalParserConfig config = externalParser.getConfig(); assertNotNull(config.getCheckCommandLine()); assertEquals(List.of("sox", "--version"), config.getCheckCommandLine()); - assertNotNull(config.getStderrParser()); - assertEquals(RegexCaptureParser.class, config.getStderrParser().getClass()); + assertNotNull(config.getStderrHandler()); + assertEquals(RegexCaptureParser.class, config.getStderrHandler().getClass()); } @Test - public void testStderrParserExtractsMetadata() throws Exception { - // Simulate what would happen with ffmpeg stderr output - RegexCaptureParser stderrParser = new RegexCaptureParser(); - // Build a config with a captureMap programmatically + public void testStderrHandlerExtractsMetadata() throws Exception { String ffmpegStderr = " Duration: 00:02:30.50, start: 0.000000, bitrate: 706 kb/s\n" + " Stream #0:0: Video: h264 (High), yuv420p, 1280x720, 25 fps\n" + " Stream #0:1: Audio: aac, 44100 Hz, 2 channels, fltp\n"; - // Use the regex-capture-parser with the same patterns from the ffmpeg config java.util.Map<String, String> captureMap = new java.util.LinkedHashMap<>(); captureMap.put("xmpDM:duration", "\\s*Duration:\\s*([0-9:\\.]+),.*"); captureMap.put("xmpDM:audioSampleRate", "\\s*Stream.*:.+Audio:.*,\\s+(\\d+)\\s+Hz,.*"); - org.apache.tika.parser.RegexCaptureParserConfig regexConfig = - new org.apache.tika.parser.RegexCaptureParserConfig(); + RegexCaptureParserConfig regexConfig = new RegexCaptureParserConfig(); regexConfig.setCaptureMap(captureMap); RegexCaptureParser parser = new RegexCaptureParser(regexConfig); @@ -179,13 +179,13 @@ public class ExternalParserTest extends TikaTest { CompositeParser composite = (CompositeParser) loader.get(Parser.class); List<Parser> allParsers = composite.getAllComponentParsers(); - // Should have two separate external parser instances assertEquals(2, allParsers.size(), "Expected 2 external parsers but got " + allParsers.size() + ": " + allParsers); // Test the exiftool parser on a PDF Parser autoDetect = loader.loadAutoDetectParser(); - List<Metadata> metadataList = getRecursiveMetadata("testOverlappingText.pdf", autoDetect); + List<Metadata> metadataList = getRecursiveMetadata("testOverlappingText.pdf", + autoDetect); assertEquals(1, metadataList.size()); Metadata m = metadataList.get(0); assertEquals("application/pdf", m.get("exiftool:MIMEType")); diff --git a/tika-serialization/src/test/resources/configs/TIKA-3557-exiftool-example.json b/tika-serialization/src/test/resources/configs/TIKA-3557-exiftool-example.json index 8e61008654..73ed0ba1ef 100644 --- a/tika-serialization/src/test/resources/configs/TIKA-3557-exiftool-example.json +++ b/tika-serialization/src/test/resources/configs/TIKA-3557-exiftool-example.json @@ -4,7 +4,8 @@ "external-parser": { "_mime-include": ["application/octet-stream"], "commandLine": ["exiftool", "${INPUT_FILE}"], - "outputParser": { + "contentSource": "none", + "stdoutHandler": { "regex-capture-parser": { "captureMap": { "mime": "^MIME Type\\s+: ([^\\r\\n]+)", diff --git a/tika-serialization/src/test/resources/configs/TIKA-3557.json b/tika-serialization/src/test/resources/configs/TIKA-3557.json index cd3af89821..a5937bedf6 100644 --- a/tika-serialization/src/test/resources/configs/TIKA-3557.json +++ b/tika-serialization/src/test/resources/configs/TIKA-3557.json @@ -4,7 +4,7 @@ "external-parser": { "_mime-include": ["application/xml"], "commandLine": ["file", "-b", "--mime-type", "${INPUT_FILE}"], - "outputParser": { + "stdoutHandler": { "regex-capture-parser": { "captureMap": { "title": "^Title: ([^\\r\\n]+)" diff --git a/tika-serialization/src/test/resources/configs/external-parser-exiftool.json b/tika-serialization/src/test/resources/configs/external-parser-exiftool.json index a0bffc78e9..fbbed46e68 100644 --- a/tika-serialization/src/test/resources/configs/external-parser-exiftool.json +++ b/tika-serialization/src/test/resources/configs/external-parser-exiftool.json @@ -11,7 +11,8 @@ "commandLine": ["exiftool", "${INPUT_FILE}"], "checkCommandLine": ["exiftool", "-ver"], "checkErrorCodes": [126, 127], - "outputParser": { + "contentSource": "none", + "stdoutHandler": { "regex-capture-parser": { "captureMap": { "mime": "^MIME Type\\s+: ([^\\r\\n]+)", diff --git a/tika-serialization/src/test/resources/configs/external-parser-ffmpeg.json b/tika-serialization/src/test/resources/configs/external-parser-ffmpeg.json index 0aa55a5fbc..3d9dd70bba 100644 --- a/tika-serialization/src/test/resources/configs/external-parser-ffmpeg.json +++ b/tika-serialization/src/test/resources/configs/external-parser-ffmpeg.json @@ -10,9 +10,10 @@ "commandLine": ["ffmpeg", "-i", "${INPUT_FILE}"], "checkCommandLine": ["ffmpeg", "-version"], "checkErrorCodes": [126, 127], + "contentSource": "none", "returnStderr": true, "maxStdErr": 20000, - "stderrParser": { + "stderrHandler": { "regex-capture-parser": { "captureMap": { "xmpDM:audioSampleRate": "\\s*Stream.*:.+Audio:.*,\\s+(\\d+)\\s+Hz,.*", diff --git a/tika-serialization/src/test/resources/configs/external-parser-multi.json b/tika-serialization/src/test/resources/configs/external-parser-multi.json index f1d5c5f59a..cce81a55fe 100644 --- a/tika-serialization/src/test/resources/configs/external-parser-multi.json +++ b/tika-serialization/src/test/resources/configs/external-parser-multi.json @@ -12,7 +12,8 @@ "checkErrorCodes": [126, 127], "returnStderr": true, "maxStdErr": 20000, - "stderrParser": { + "contentSource": "none", + "stderrHandler": { "regex-capture-parser": { "captureMap": { "xmpDM:duration": "\\s*Duration:\\s*([0-9:\\.]+),.*", @@ -30,7 +31,8 @@ "commandLine": ["exiftool", "${INPUT_FILE}"], "checkCommandLine": ["exiftool", "-ver"], "checkErrorCodes": [126, 127], - "outputParser": { + "contentSource": "none", + "stdoutHandler": { "regex-capture-parser": { "captureMap": { "exiftool:MIMEType": "^MIME Type\\s+: ([^\\r\\n]+)", diff --git a/tika-serialization/src/test/resources/configs/external-parser-sox.json b/tika-serialization/src/test/resources/configs/external-parser-sox.json index b8ae108d71..39ac93c334 100644 --- a/tika-serialization/src/test/resources/configs/external-parser-sox.json +++ b/tika-serialization/src/test/resources/configs/external-parser-sox.json @@ -16,7 +16,8 @@ "checkErrorCodes": [126, 127], "returnStderr": true, "maxStdErr": 10000, - "stderrParser": { + "contentSource": "none", + "stderrHandler": { "regex-capture-parser": { "captureMap": { "xmpDM:audioChannelType": "\\s*Channels.*:\\s+(\\d+)\\s*",
