This is an automated email from the ASF dual-hosted git repository. tallison pushed a commit to branch TIKA-4714-external-parsers in repository https://gitbox.apache.org/repos/asf/tika.git
commit 4f21319577fa917e131b8a6d3f0e6eecd6684844 Author: tallison <[email protected]> AuthorDate: Thu Apr 9 08:15:07 2026 -0400 TIKA-4717 - promote external parser v2 --- .github/workflows/main-jdk17-build.yml | 2 + CHANGES.txt | 3 + .../ROOT/examples/external-parser-exiftool.json | 1 + .../ROOT/examples/external-parser-ffmpeg.json | 1 + .../ROOT/examples/external-parser-multi.json | 1 + .../modules/ROOT/examples/external-parser-sox.json | 1 + docs/modules/ROOT/nav.adoc | 1 + .../configuration/parsers/external-parser.adoc | 154 ++++++ .../services/org.apache.tika.parser.Parser | 16 - .../apache/tika/detect/FileCommandDetector.java | 3 +- .../org/apache/tika/embedder/ExternalEmbedder.java | 13 +- .../parser/external/CompositeExternalParser.java | 44 -- .../tika/parser/external/ExternalParser.java | 560 +++++++-------------- .../ExternalParserConfig.java | 29 +- .../external/ExternalParsersConfigReader.java | 223 -------- .../ExternalParsersConfigReaderMetKeys.java | 43 -- .../parser/external/ExternalParsersFactory.java | 67 --- .../apache/tika/parser/external/package-info.java | 22 - .../tika/parser/external2/ExternalParser.java | 227 --------- .../java/org/apache/tika/utils/ProcessUtils.java | 74 +++ .../tika/parser/external/tika-external-parsers.xml | 117 ----- .../tika/detect/siegfried/SiegfriedDetector.java | 3 +- .../org/apache/tika/parser/gdal/GDALParser.java | 5 +- .../apache/tika/parser/gdal/TestGDALParser.java | 6 +- .../parser/scientific/integration/TestParsers.java | 11 - .../apache/tika/parser/dwg/DWGParserConfig.java | 4 +- .../org/apache/tika/parser/dwg/DWGParserTest.java | 4 +- .../apache/tika/parser/ocr/TesseractOCRParser.java | 6 +- .../renderer/pdf/poppler/PopplerRendererTest.java | 4 +- .../apache/tika/parser/pkg/UnrarParserTest.java | 4 +- .../apache/tika/parser/strings/StringsParser.java | 6 +- .../tika/parser/strings/StringsParserTest.java | 4 +- .../services/org.apache.tika.parser.Parser | 16 - .../apache/tika/parser/AutoDetectParserTest.java | 
6 - .../org/apache/tika/parser/pdf/PDFParserTest.java | 4 +- .../apache/tika/parser/pkg/UnrarParserTest.java | 4 +- .../tika/parser/external/ExternalParserTest.java | 195 +++++++ .../tika/parser/external2/ExternalParserTest.java | 100 ---- .../configs/TIKA-3557-exiftool-example.json | 3 +- ...-example.json => external-parser-exiftool.json} | 12 +- .../resources/configs/external-parser-ffmpeg.json | 34 ++ .../resources/configs/external-parser-multi.json | 45 ++ .../resources/configs/external-parser-sox.json | 36 ++ .../services/org.apache.tika.parser.Parser | 16 - .../tika/server/standard/TikaParsersTest.java | 4 +- 45 files changed, 798 insertions(+), 1336 deletions(-) diff --git a/.github/workflows/main-jdk17-build.yml b/.github/workflows/main-jdk17-build.yml index 75c923c10e..3f60fd98a6 100644 --- a/.github/workflows/main-jdk17-build.yml +++ b/.github/workflows/main-jdk17-build.yml @@ -43,6 +43,8 @@ jobs: distribution: 'temurin' java-version: ${{ matrix.java }} cache: 'maven' + - name: Install external tools + run: sudo apt-get update && sudo apt-get install -y ffmpeg libimage-exiftool-perl - name: Build with Maven run: mvn clean apache-rat:check test install javadoc:aggregate -Pci -B "-Dorg.slf4j.simpleLogger.log.org.apache.maven.cli.transfer.Slf4jMavenTransferListener=warn" diff --git a/CHANGES.txt b/CHANGES.txt index b135cb3894..ada77669ba 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -29,6 +29,9 @@ Release 4.0.0-BETA1 - ??? * Removed DigestingParser (TIKA-4607). + * Removed legacy ExternalParser; external parsers now require explicit + JSON configuration (TIKA-4707). 
+ OTHER CHANGES * Fix concurrency bug in TikaToXMP (TIKA-4393) diff --git a/docs/modules/ROOT/examples/external-parser-exiftool.json b/docs/modules/ROOT/examples/external-parser-exiftool.json new file mode 120000 index 0000000000..145dcb2515 --- /dev/null +++ b/docs/modules/ROOT/examples/external-parser-exiftool.json @@ -0,0 +1 @@ +../../../../tika-serialization/src/test/resources/configs/external-parser-exiftool.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/external-parser-ffmpeg.json b/docs/modules/ROOT/examples/external-parser-ffmpeg.json new file mode 120000 index 0000000000..024b6de0af --- /dev/null +++ b/docs/modules/ROOT/examples/external-parser-ffmpeg.json @@ -0,0 +1 @@ +../../../../tika-serialization/src/test/resources/configs/external-parser-ffmpeg.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/external-parser-multi.json b/docs/modules/ROOT/examples/external-parser-multi.json new file mode 120000 index 0000000000..9fd360037c --- /dev/null +++ b/docs/modules/ROOT/examples/external-parser-multi.json @@ -0,0 +1 @@ +../../../../tika-serialization/src/test/resources/configs/external-parser-multi.json \ No newline at end of file diff --git a/docs/modules/ROOT/examples/external-parser-sox.json b/docs/modules/ROOT/examples/external-parser-sox.json new file mode 120000 index 0000000000..1c996f4169 --- /dev/null +++ b/docs/modules/ROOT/examples/external-parser-sox.json @@ -0,0 +1 @@ +../../../../tika-serialization/src/test/resources/configs/external-parser-sox.json \ No newline at end of file diff --git a/docs/modules/ROOT/nav.adoc b/docs/modules/ROOT/nav.adoc index 1702591425..ea3e9726a9 100644 --- a/docs/modules/ROOT/nav.adoc +++ b/docs/modules/ROOT/nav.adoc @@ -28,6 +28,7 @@ ** xref:configuration/parsers/pdf-parser.adoc[PDF Parser] ** xref:configuration/parsers/tesseract-ocr-parser.adoc[Tesseract OCR] ** xref:configuration/parsers/vlm-parsers.adoc[VLM Parsers (Claude, Gemini, OpenAI)] +** 
xref:configuration/parsers/external-parser.adoc[External Parser (ffmpeg, exiftool, etc.)] ** xref:configuration/parsers/tess4j-parser.adoc[Tess4J OCR (In-Process)] * xref:migration-to-4x/index.adoc[Migration to 4.x] ** xref:migration-to-4x/migrating-to-4x.adoc[Migration Guide] diff --git a/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc b/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc new file mode 100644 index 0000000000..25af4049d7 --- /dev/null +++ b/docs/modules/ROOT/pages/configuration/parsers/external-parser.adoc @@ -0,0 +1,154 @@ +// +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + += External Parser Configuration + +The `ExternalParser` allows Tika to delegate parsing to external command-line +programs such as `ffmpeg`, `exiftool`, or `sox`. Each external parser is +configured via JSON and must be explicitly enabled -- Tika 4.x does not +auto-discover external tools at startup. + +== Key Concepts + +=== Lazy Check + +Each external parser can declare a `checkCommandLine` that verifies the tool +is installed. The check runs lazily on first use (not at startup), and if the +tool is not found, the parser silently disables itself. 
+ +=== Output Parser vs Stderr Parser + +External tools write useful output to different streams: + +* **`outputParser`** -- processes stdout (or the output file). Use this for tools + like `exiftool` that write structured output to stdout. +* **`stderrParser`** -- processes stderr. Use this for tools like `ffmpeg` and + `sox` that write metadata to stderr. + +Both accept any Tika parser; `regex-capture-parser` is the most common choice +for extracting metadata via regex patterns. + +== Configuration Options + +[cols="1,1,3"] +|=== +|Field |Type |Description + +|`commandLine` +|`List<String>` +|The command and arguments to run. Use `${INPUT_FILE}` and `${OUTPUT_FILE}` tokens for file paths. + +|`checkCommandLine` +|`List<String>` +|Optional. Command to verify the tool is installed (e.g., `["ffmpeg", "-version"]`). + +|`checkErrorCodes` +|`List<Integer>` +|Exit codes that indicate the tool is not available. Default: `[127]`. + +|`outputParser` +|Parser config +|Optional. Parser to process stdout or the output file. + +|`stderrParser` +|Parser config +|Optional. Parser to process stderr (for metadata extraction). + +|`returnStdout` +|`boolean` +|Store raw stdout in metadata. Default: `false`. + +|`returnStderr` +|`boolean` +|Store raw stderr in metadata. Default: `true`. + +|`timeoutMs` +|`long` +|Process timeout in milliseconds. Default: `60000`. + +|`maxStdOut` +|`int` +|Maximum stdout bytes to capture. Default: `10000`. + +|`maxStdErr` +|`int` +|Maximum stderr bytes to capture. Default: `10000`. +|=== + +== Examples + +=== Exiftool (metadata from stdout) + +Extracts metadata from media files using `exiftool`. The `outputParser` uses +`regex-capture-parser` to extract key-value pairs from exiftool's stdout. 
+ +[source,json] +---- +include::example$external-parser-exiftool.json[] +---- +icon:github[] https://github.com/apache/tika/blob/main/tika-serialization/src/test/resources/configs/external-parser-exiftool.json[View source on GitHub] + +=== FFmpeg (metadata from stderr) + +Extracts audio/video metadata from `ffmpeg -i` output. FFmpeg writes metadata +to stderr, so this uses `stderrParser` instead of `outputParser`. + +[source,json] +---- +include::example$external-parser-ffmpeg.json[] +---- +icon:github[] https://github.com/apache/tika/blob/main/tika-serialization/src/test/resources/configs/external-parser-ffmpeg.json[View source on GitHub] + +=== Sox (audio metadata from stderr) + +Extracts audio metadata using `sox --info`. Like FFmpeg, Sox writes to stderr. + +[source,json] +---- +include::example$external-parser-sox.json[] +---- +icon:github[] https://github.com/apache/tika/blob/main/tika-serialization/src/test/resources/configs/external-parser-sox.json[View source on GitHub] + +=== Multiple External Parsers + +You can configure multiple external parsers in a single config file. Each +handles different MIME types via `_mime-include`. Here FFmpeg handles video +files while exiftool handles PDFs: + +[source,json] +---- +include::example$external-parser-multi.json[] +---- +icon:github[] https://github.com/apache/tika/blob/main/tika-serialization/src/test/resources/configs/external-parser-multi.json[View source on GitHub] + +== Changes from 3.x + +In Tika 3.x, external parsers were configured via XML (`tika-external-parsers.xml`) +and auto-discovered at startup. The `CompositeExternalParser` would fork +a process for each configured tool on every Tika initialization to check +if the tool was available. + +In Tika 4.x: + +* External parsers must be explicitly configured in JSON -- no auto-discovery. +* The `checkCommandLine` runs lazily on first use, not at startup. +* The `stderrParser` field replaces the inline regex-on-stderr metadata extraction. 
+* The `external2` package has been renamed back to `external`. +* `CompositeExternalParser`, `ExternalParsersFactory`, and the XML config + reader have been removed. + +See xref:migration-to-4x/migrating-to-4x.adoc[Migrating to 4.x] for general migration guidance. diff --git a/tika-app/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-app/src/main/resources/META-INF/services/org.apache.tika.parser.Parser deleted file mode 100644 index 37f87a4595..0000000000 --- a/tika-app/src/main/resources/META-INF/services/org.apache.tika.parser.Parser +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -org.apache.tika.parser.external.CompositeExternalParser \ No newline at end of file diff --git a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java index 83182eafc4..1433bc73ef 100644 --- a/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java +++ b/tika-core/src/main/java/org/apache/tika/detect/FileCommandDetector.java @@ -29,7 +29,6 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.utils.FileProcessResult; import org.apache.tika.utils.ProcessUtils; import org.apache.tika.utils.StringUtils; @@ -74,7 +73,7 @@ public class FileCommandDetector implements Detector { public static boolean checkHasFile(String fileCommandPath) { String[] commandline = new String[]{fileCommandPath, "-v"}; - return ExternalParser.check(commandline); + return ProcessUtils.checkCommand(commandline); } /** diff --git a/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java b/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java index c58d57345f..4cdfbea850 100644 --- a/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java +++ b/tika-core/src/main/java/org/apache/tika/embedder/ExternalEmbedder.java @@ -40,7 +40,6 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.external.ExternalParser; /** * Embedder that uses an external program (like sed or exiftool) to embed text @@ -50,6 +49,8 @@ import org.apache.tika.parser.external.ExternalParser; */ public class ExternalEmbedder implements Embedder { + public static final String INPUT_FILE_TOKEN = "${INPUT}"; + public static final String 
OUTPUT_FILE_TOKEN = "${OUTPUT}"; /** * Token to be replaced with a String array of metadata assignment command * arguments @@ -78,7 +79,7 @@ public class ExternalEmbedder implements Embedder { */ private String[] command = new String[]{"sed", "-e", "$a\\\n" + METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN, - ExternalParser.INPUT_FILE_TOKEN}; + INPUT_FILE_TOKEN}; private String commandAssignmentOperator = "="; private String commandAssignmentDelimeter = ", "; private String commandAppendOperator = "="; @@ -355,15 +356,15 @@ public class ExternalEmbedder implements Embedder { String[] origCmd = command; List<String> cmd = new ArrayList<>(); for (String commandSegment : origCmd) { - if (commandSegment.contains(ExternalParser.INPUT_FILE_TOKEN)) { - commandSegment = commandSegment.replace(ExternalParser.INPUT_FILE_TOKEN, + if (commandSegment.contains(INPUT_FILE_TOKEN)) { + commandSegment = commandSegment.replace(INPUT_FILE_TOKEN, tikaInputStream.getFile().toString()); inputToStdIn = false; } - if (commandSegment.contains(ExternalParser.OUTPUT_FILE_TOKEN)) { + if (commandSegment.contains(OUTPUT_FILE_TOKEN)) { tempOutputFile = tmp.createTemporaryFile(); commandSegment = commandSegment - .replace(ExternalParser.OUTPUT_FILE_TOKEN, tempOutputFile.toString()); + .replace(OUTPUT_FILE_TOKEN, tempOutputFile.toString()); outputFromStdOut = false; } if (commandSegment.contains(METADATA_COMMAND_ARGUMENTS_SERIALIZED_TOKEN)) { diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java b/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java deleted file mode 100644 index 53cb7b7eac..0000000000 --- a/tika-core/src/main/java/org/apache/tika/parser/external/CompositeExternalParser.java +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. 
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.external; - -import java.io.IOException; -import java.util.List; - -import org.apache.tika.exception.TikaException; -import org.apache.tika.mime.MediaTypeRegistry; -import org.apache.tika.parser.CompositeParser; -import org.apache.tika.parser.Parser; - -/** - * A Composite Parser that wraps up all the available External Parsers, - * and provides an easy way to access them. - * Parser that uses an external program (like catdoc or pdf2txt) to extract - * text content and metadata from a given document. - */ -public class CompositeExternalParser extends CompositeParser { - private static final long serialVersionUID = 6962436916649024024L; - - public CompositeExternalParser() throws IOException, TikaException { - this(new MediaTypeRegistry()); - } - - @SuppressWarnings("unchecked") - public CompositeExternalParser(MediaTypeRegistry registry) throws IOException, TikaException { - super(registry, (List<Parser>) (List<? 
extends Parser>) ExternalParsersFactory.create()); - } -} diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java index 0e17384928..d6519fedb2 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java +++ b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParser.java @@ -16,461 +16,253 @@ */ package org.apache.tika.parser.external; -import static java.nio.charset.StandardCharsets.UTF_8; - import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; import java.io.IOException; -import java.io.InputStream; -import java.io.InputStreamReader; -import java.io.OutputStream; -import java.io.Reader; -import java.io.Serializable; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; import java.util.Collections; import java.util.HashSet; -import java.util.Map; +import java.util.List; import java.util.Set; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.TimeoutException; import java.util.regex.Matcher; import java.util.regex.Pattern; -import org.apache.commons.io.IOUtils; -import org.apache.commons.io.output.NullOutputStream; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; +import org.apache.tika.config.ConfigDeserializer; +import org.apache.tika.config.JsonConfig; +import org.apache.tika.config.TikaComponent; +import org.apache.tika.config.TikaProgressTracker; +import org.apache.tika.config.TimeoutLimits; import org.apache.tika.exception.TikaException; import org.apache.tika.io.TemporaryResources; import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.ExternalProcess; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.EmptyParser; import 
org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.FileProcessResult; +import org.apache.tika.utils.ProcessUtils; /** - * Parser that uses an external program (like catdoc or pdf2txt) to extract - * text content and metadata from a given document. - * - * @deprecated Use {@link org.apache.tika.parser.external2.ExternalParser} instead. - * This class will be removed in a future version of Tika. + * Parser that uses an external program (like ffmpeg, exiftool or sox) + * to extract text content and metadata from a given document. + * <p> + * This parser relies on JSON configuration rather than classpath auto-discovery. + * Users can specify a parser to handle the output of the external process + * (via {@code outputParser}) and/or a parser to extract metadata from stderr + * (via {@code stderrParser}). An optional {@code checkCommandLine} can be + * configured so that the parser lazily verifies the external tool is available. */ -@Deprecated +@TikaComponent public class ExternalParser implements Parser { + public static final long DEFAULT_TIMEOUT_MS = 60000; + + public static final String INPUT_FILE_TOKEN = "${INPUT_FILE}"; + + public static final String OUTPUT_FILE_TOKEN = "${OUTPUT_FILE}"; + + private static Pattern INPUT_TOKEN_MATCHER = Pattern.compile("\\$\\{INPUT_FILE}"); + private static Pattern OUTPUT_TOKEN_MATCHER = Pattern.compile("\\$\\{OUTPUT_FILE}"); + private static final Logger LOG = LoggerFactory.getLogger(ExternalParser.class); - /** - * The token, which if present in the Command string, will - * be replaced with the input filename. - * Alternately, the input data can be streamed over STDIN. - */ - public static final String INPUT_FILE_TOKEN = "${INPUT}"; - /** - * The token, which if present in the Command string, will - * be replaced with the output filename. 
- * Alternately, the output data can be collected on STDOUT. - */ - public static final String OUTPUT_FILE_TOKEN = "${OUTPUT}"; - private static final long serialVersionUID = -1079128990650687037L; - //make this parameterizable - private final long timeoutMs = 60000; - /** - * Media types supported by the external program. - */ - private Set<MediaType> supportedTypes = Collections.emptySet(); + private final ExternalParserConfig config; - /** - * Regular Expressions to run over STDOUT to - * extract Metadata. - */ - private Map<Pattern, String> metadataPatterns = null; - /** - * The external command to invoke. - * - * @see Runtime#exec(String[]) - */ - private String[] command = new String[]{"cat"}; - /** - * A consumer for ignored Lines - */ - private LineConsumer ignoredLineConsumer = LineConsumer.NULL; + // Cached values derived from config + private final Set<MediaType> supportedTypes; + private final List<String> commandLine; + private final Parser outputParser; + private final Parser stderrParser; + + // Lazy check state + private final String[] checkCmd; + private final int[] checkErrorCodes; + private volatile Boolean checkResult; /** - * Starts a thread that reads and discards the contents of the - * standard stream of the given process. Potential exceptions - * are ignored, and the stream is closed once fully processed. - * Note: calling this starts a new thread and blocks the current(caller) - * thread until the new thread dies - * - * @param stream stream to be ignored + * Default constructor - not typically useful since ExternalParser requires configuration. */ - private static void ignoreStream(final InputStream stream) { - ignoreStream(stream, true); + public ExternalParser() { + this(new ExternalParserConfig()); } /** - * Starts a thread that reads and discards the contents of the - * standard stream of the given process. Potential exceptions - * are ignored, and the stream is closed once fully processed. 
- * - * @param stream stream to sent to black hole (a k a null) - * @param waitForDeath when {@code true} the caller thread will be - * blocked till the death of new thread. - * @return The thread that is created and started + * Programmatic constructor with typed config. */ - private static Thread ignoreStream(final InputStream stream, boolean waitForDeath) { - Thread t = new Thread(() -> { - try { - IOUtils.copy(stream, NullOutputStream.INSTANCE); - } catch (IOException e) { - //swallow - } finally { - IOUtils.closeQuietly(stream); - } - }); - t.start(); - if (waitForDeath) { - try { - t.join(); - } catch (InterruptedException ignore) { + public ExternalParser(ExternalParserConfig config) { + this.config = config; + this.supportedTypes = new HashSet<>(); + for (String s : config.getSupportedTypes()) { + this.supportedTypes.add(MediaType.parse(s)); + } + this.commandLine = new ArrayList<>(config.getCommandLine()); + this.outputParser = config.getOutputParser() != null ? + config.getOutputParser() : EmptyParser.INSTANCE; + this.stderrParser = config.getStderrParser(); + + // Set up lazy check + if (config.getCheckCommandLine() != null && !config.getCheckCommandLine().isEmpty()) { + this.checkCmd = config.getCheckCommandLine().toArray(new String[0]); + if (config.getCheckErrorCodes() != null && !config.getCheckErrorCodes().isEmpty()) { + this.checkErrorCodes = config.getCheckErrorCodes().stream() + .mapToInt(Integer::intValue).toArray(); + } else { + this.checkErrorCodes = new int[]{127}; } + this.checkResult = null; // will be lazily evaluated + } else { + this.checkCmd = null; + this.checkErrorCodes = null; + this.checkResult = Boolean.TRUE; // no check configured, always available } - return t; } /** - * Checks to see if the command can be run. Typically used with - * something like "myapp --version" to check to see if "myapp" - * is installed and on the path. 
- * - * @param checkCmd The check command to run - * @param errorValue What is considered an error value? + * JSON config constructor - used for deserialization. */ - public static boolean check(String checkCmd, int... errorValue) { - return check(new String[]{checkCmd}, errorValue); + public ExternalParser(JsonConfig jsonConfig) { + this(ConfigDeserializer.buildConfig(jsonConfig, ExternalParserConfig.class)); } - public static boolean check(String[] checkCmd, int... errorValue) { - if (errorValue.length == 0) { - errorValue = new int[]{127}; - } - - Process process = null; - try { - process = Runtime.getRuntime().exec(checkCmd); - Thread stdErrSuckerThread = ignoreStream(process.getErrorStream(), false); - Thread stdOutSuckerThread = ignoreStream(process.getInputStream(), false); - stdErrSuckerThread.join(); - stdOutSuckerThread.join(); - //make the timeout parameterizable - boolean finished = process.waitFor(60000, TimeUnit.MILLISECONDS); - if (!finished) { - throw new TimeoutException(); - } - int result = process.exitValue(); - LOG.debug("exit value for {}: {}", checkCmd[0], result); - for (int err : errorValue) { - if (result == err) { - return false; + @Override + public Set<MediaType> getSupportedTypes(ParseContext context) { + if (checkResult == null) { + synchronized (this) { + if (checkResult == null) { + checkResult = ProcessUtils.checkCommand(checkCmd, checkErrorCodes); } } - return true; - } catch (IOException | InterruptedException | TimeoutException e) { - LOG.debug("exception trying to run " + checkCmd[0], e); - // Some problem, command is there or is broken - return false; - } catch (SecurityException se) { - // External process execution is banned by the security manager - throw se; - } catch (Error err) { - if (err.getMessage() != null && (err.getMessage().contains("posix_spawn") || - err.getMessage().contains("UNIXProcess"))) { - LOG.debug("(TIKA-1526): exception trying to run: " + checkCmd[0], err); - //"Error forking command due to JVM locale 
bug - //(see TIKA-1526 and SOLR-6387)" - return false; - } - //throw if a different kind of error - throw err; - } finally { - if (process != null) { - process.destroyForcibly(); - } } + return checkResult ? supportedTypes : Collections.emptySet(); } - public Set<MediaType> getSupportedTypes(ParseContext context) { - return getSupportedTypes(); - } - - public Set<MediaType> getSupportedTypes() { - return supportedTypes; - } - - public void setSupportedTypes(Set<MediaType> supportedTypes) { - this.supportedTypes = Collections.unmodifiableSet(new HashSet<>(supportedTypes)); - } - - public String[] getCommand() { - return command; - } - - /** - * Sets the command to be run. This can include either of - * {@link #INPUT_FILE_TOKEN} or {@link #OUTPUT_FILE_TOKEN} - * if the command needs filenames. - * - * @see Runtime#exec(String[]) - */ - public void setCommand(String... command) { - this.command = command; - } - - /** - * Gets lines consumer - * - * @return consumer instance - */ - public LineConsumer getIgnoredLineConsumer() { - return ignoredLineConsumer; - } - - /** - * Set a consumer for the lines ignored by the parse functions - * - * @param ignoredLineConsumer consumer instance - */ - public void setIgnoredLineConsumer(LineConsumer ignoredLineConsumer) { - this.ignoredLineConsumer = ignoredLineConsumer; - } - - public Map<Pattern, String> getMetadataExtractionPatterns() { - return metadataPatterns; - } - - /** - * Sets the map of regular expression patterns and Metadata - * keys. Any matching patterns will have the matching - * metadata entries set. - * Set this to null to disable Metadata extraction. - */ - public void setMetadataExtractionPatterns(Map<Pattern, String> patterns) { - this.metadataPatterns = patterns; - } - - /** - * Executes the configured external command and passes the given document - * stream as a simple XHTML document to the given SAX content handler. 
- * Metadata is only extracted if {@link #setMetadataExtractionPatterns(Map)} - * has been called to set patterns. - */ + @Override public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { - XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context); - - TemporaryResources tmp = new TemporaryResources(); - try { - parse(tis, xhtml, metadata, tmp); - } finally { - tmp.dispose(); - } - } - - private void parse(TikaInputStream tis, XHTMLContentHandler xhtml, Metadata metadata, - TemporaryResources tmp) throws IOException, SAXException, TikaException { - boolean inputToStdIn = true; - boolean outputFromStdOut = true; - boolean hasPatterns = (metadataPatterns != null && !metadataPatterns.isEmpty()); - - File output = null; - - // Build our command - String[] cmd; - if (command.length == 1) { - cmd = command[0].split(" "); - } else { - cmd = new String[command.length]; - System.arraycopy(command, 0, cmd, 0, command.length); - } - for (int i = 0; i < cmd.length; i++) { - if (cmd[i].contains(INPUT_FILE_TOKEN)) { - cmd[i] = cmd[i].replace(INPUT_FILE_TOKEN, tis.getFile().getPath()); - inputToStdIn = false; - } - if (cmd[i].contains(OUTPUT_FILE_TOKEN)) { - output = tmp.createTemporaryFile(); - outputFromStdOut = false; - cmd[i] = cmd[i].replace(OUTPUT_FILE_TOKEN, output.getPath()); - } - } - - // Execute - Process process = null; - try { - if (cmd.length == 1) { - process = Runtime.getRuntime().exec(cmd[0]); - } else { - process = Runtime.getRuntime().exec(cmd); - } - } catch (Exception e) { - LOG.warn("problem with process exec", e); - } - - try { - if (inputToStdIn) { - sendInput(process, tis); - } else { - process.getOutputStream().close(); - } - - InputStream out = process.getInputStream(); - InputStream err = process.getErrorStream(); - - if (hasPatterns) { - extractMetadata(err, metadata); - - if (outputFromStdOut) { - extractOutput(out, xhtml); + //this 
may remain null, depending on whether the external parser writes to a file + Path outFile = null; + try (TemporaryResources tmp = new TemporaryResources()) { + Path p = tis.getPath(); + List<String> thisCommandLine = new ArrayList<>(); + Matcher inputMatcher = INPUT_TOKEN_MATCHER.matcher(""); + Matcher outputMatcher = OUTPUT_TOKEN_MATCHER.matcher(""); + boolean outputFileInCommandline = false; + for (String c : commandLine) { + if (inputMatcher.reset(c).find()) { + String updated = c.replace(INPUT_FILE_TOKEN, + ProcessUtils.escapeCommandLine(p.toAbsolutePath().toString())); + thisCommandLine.add(updated); + } else if (outputMatcher.reset(c).find()) { + outFile = Files.createTempFile("tika-external-", ""); + String updated = c.replace(OUTPUT_FILE_TOKEN, + ProcessUtils.escapeCommandLine(outFile.toAbsolutePath().toString())); + thisCommandLine.add(updated); + outputFileInCommandline = true; } else { - extractMetadata(out, metadata); + thisCommandLine.add(c); } + } + FileProcessResult result = null; + long localTimeoutMillis = TimeoutLimits.getProcessTimeoutMillis(context, config.getTimeoutMs()); + if (outputFileInCommandline) { + result = ProcessUtils.execute(new ProcessBuilder(thisCommandLine), + localTimeoutMillis, config.getMaxStdOut(), config.getMaxStdErr()); } else { - ignoreStream(err); - - if (outputFromStdOut) { - extractOutput(out, xhtml); - } else { - ignoreStream(out); - } + outFile = Files.createTempFile("tika-external-", ""); + result = ProcessUtils.execute(new ProcessBuilder(thisCommandLine), + localTimeoutMillis, outFile, config.getMaxStdErr()); } - } finally { - try { - process.waitFor(); - } catch (InterruptedException ignore) { + metadata.set(ExternalProcess.IS_TIMEOUT, result.isTimeout()); + metadata.set(ExternalProcess.EXIT_VALUE, result.getExitValue()); + TikaProgressTracker.update(context); + metadata.set(ExternalProcess.STD_OUT_LENGTH, result.getStdoutLength()); + metadata.set(ExternalProcess.STD_OUT_IS_TRUNCATED, + result.isStdoutTruncated()); 
+ metadata.set(ExternalProcess.STD_ERR_LENGTH, result.getStderrLength()); + metadata.set(ExternalProcess.STD_ERR_IS_TRUNCATED, + result.isStderrTruncated()); + + if (config.isReturnStdout()) { + metadata.set(ExternalProcess.STD_OUT, result.getStdout()); } - } - - // Grab the output if we haven't already - if (!outputFromStdOut) { - try (FileInputStream fileInputStream = new FileInputStream(output)) { - extractOutput(fileInputStream, xhtml); + if (config.isReturnStderr()) { + metadata.set(ExternalProcess.STD_ERR, result.getStderr()); } - } - } - - /** - * Starts a thread that extracts the contents of the standard output - * stream of the given process to the given XHTML content handler. - * The standard output stream is closed once fully processed. - * - * @param stream - * @param xhtml XHTML content handler - * @throws SAXException if the XHTML SAX events could not be handled - * @throws IOException if an input error occurred - */ - private void extractOutput(InputStream stream, XHTMLContentHandler xhtml) - throws SAXException, IOException { - try (Reader reader = new InputStreamReader(stream, UTF_8)) { - xhtml.startDocument(); - xhtml.startElement("p"); - char[] buffer = new char[1024]; - for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) { - xhtml.characters(buffer, 0, n); + if (stderrParser != null && result.getStderr() != null + && !result.getStderr().isEmpty()) { + try (TikaInputStream stderrStream = TikaInputStream.get( + result.getStderr().getBytes(StandardCharsets.UTF_8))) { + stderrParser.parse(stderrStream, new org.xml.sax.helpers.DefaultHandler(), + metadata, context); + } } - xhtml.endElement("p"); + XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context); + xhtml.startDocument(); + handleOutput(result, outFile, xhtml, metadata, context); xhtml.endDocument(); - } - } - - /** - * Starts a thread that sends the contents of the given input stream - * to the standard input stream of the given process. 
Potential - * exceptions are ignored, and the standard input stream is closed - * once fully processed. Note that the given input stream is <em>not</em> - * closed by this method. - * - * @param process process - * @param stream input stream - */ - private void sendInput(final Process process, final InputStream stream) { - Thread t = new Thread(() -> { - OutputStream stdin = process.getOutputStream(); - try { - IOUtils.copy(stream, stdin); - } catch (IOException e) { - //swallow + } finally { + if (outFile != null) { + Files.delete(outFile); } - }); - t.start(); - try { - t.join(); - } catch (InterruptedException ignore) { } } - private void extractMetadata(final InputStream stream, final Metadata metadata) { - Thread t = new Thread(() -> { - BufferedReader reader; - reader = new BufferedReader(new InputStreamReader(stream, UTF_8)); - try { - String line; - while ((line = reader.readLine()) != null) { - boolean consumed = false; - for (Map.Entry<Pattern, String> entry : metadataPatterns.entrySet()) { - Matcher m = entry.getKey().matcher(line); - if (m.find()) { - consumed = true; - if (entry.getValue() != null && - !entry.getValue().equals("")) { - metadata.add(entry.getValue(), m.group(1)); - } else { - metadata.add(m.group(1), m.group(2)); - } - } - } - if (!consumed) { - ignoredLineConsumer.consume(line); + private void handleOutput(FileProcessResult result, Path outFile, + XHTMLContentHandler xhtml, Metadata metadata, + ParseContext parseContext) throws SAXException, TikaException, + IOException { + if (outputParser == EmptyParser.INSTANCE) { + if (outFile != null) { + try (BufferedReader reader = Files.newBufferedReader(outFile)) { + String line = reader.readLine(); + while (line != null) { + //do we want to wrap this in <p></p> elements? 
+ xhtml.characters(line); + xhtml.newline(); + line = reader.readLine(); } } - } catch (IOException e) { - // Ignore - } finally { - IOUtils.closeQuietly(reader); - IOUtils.closeQuietly(stream); + } else { + //read this in line by line and wrap <p></p> elements? + xhtml.characters(result.getStdout()); + } + } else { + if (outFile != null) { + try (TikaInputStream tis = TikaInputStream.get(outFile)) { + outputParser.parse(tis, new BodyContentHandler(xhtml), metadata, parseContext); + } + } else { + try (TikaInputStream tis = TikaInputStream.get( + result.getStdout().getBytes(StandardCharsets.UTF_8))) { + outputParser.parse(tis, new BodyContentHandler(xhtml), metadata, parseContext); + } } - }); - t.start(); - try { - t.join(); - } catch (InterruptedException ignore) { } + } /** - * Consumer contract - * - * @since Apache Tika 1.14 + * Returns the output parser used to parse the external process output. */ - public interface LineConsumer extends Serializable { - /** - * A null consumer - */ - LineConsumer NULL = line -> { - // ignores - }; - - /** - * Consume a line - * - * @param line a line of string - */ - void consume(String line); + public Parser getOutputParser() { + return outputParser; } - + /** + * Returns the configuration for this parser. 
+ */ + public ExternalParserConfig getConfig() { + return config; + } } diff --git a/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParserConfig.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParserConfig.java similarity index 80% rename from tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParserConfig.java rename to tika-core/src/main/java/org/apache/tika/parser/external/ExternalParserConfig.java index 913565af80..b15c5307e0 100644 --- a/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParserConfig.java +++ b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParserConfig.java @@ -14,7 +14,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -package org.apache.tika.parser.external2; +package org.apache.tika.parser.external; import java.io.Serializable; import java.util.ArrayList; @@ -35,6 +35,9 @@ public class ExternalParserConfig implements Serializable { private List<String> supportedTypes = new ArrayList<>(); private List<String> commandLine = new ArrayList<>(); private Parser outputParser; + private Parser stderrParser; + private List<String> checkCommandLine; + private List<Integer> checkErrorCodes; private boolean returnStdout = false; private boolean returnStderr = true; private long timeoutMs = ExternalParser.DEFAULT_TIMEOUT_MS; @@ -68,6 +71,30 @@ public class ExternalParserConfig implements Serializable { this.outputParser = outputParser; } + public Parser getStderrParser() { + return stderrParser; + } + + public void setStderrParser(Parser stderrParser) { + this.stderrParser = stderrParser; + } + + public List<String> getCheckCommandLine() { + return checkCommandLine; + } + + public void setCheckCommandLine(List<String> checkCommandLine) { + this.checkCommandLine = checkCommandLine; + } + + public List<Integer> getCheckErrorCodes() { + return checkErrorCodes; + } + + public void 
setCheckErrorCodes(List<Integer> checkErrorCodes) { + this.checkErrorCodes = checkErrorCodes; + } + public boolean isReturnStdout() { return returnStdout; } diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java deleted file mode 100644 index 754bcf4454..0000000000 --- a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReader.java +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ -package org.apache.tika.parser.external; - -import java.io.IOException; -import java.io.InputStream; -import java.util.ArrayList; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.StringTokenizer; -import java.util.regex.Pattern; -import javax.xml.parsers.DocumentBuilder; - -import org.w3c.dom.Document; -import org.w3c.dom.Element; -import org.w3c.dom.Node; -import org.w3c.dom.NodeList; -import org.xml.sax.InputSource; -import org.xml.sax.SAXException; - -import org.apache.tika.exception.TikaException; -import org.apache.tika.mime.MediaType; -import org.apache.tika.mime.MimeTypeException; -import org.apache.tika.utils.XMLReaderUtils; - -/** - * Builds up ExternalParser instances based on XML file(s) - * which define what to run, for what, and how to process - * any output metadata. - * Typically used to configure up a series of external programs - * (like catdoc or pdf2txt) to extract text content from documents. 
- * - * <pre> - * TODO XML DTD Here - * </pre> - */ -public final class ExternalParsersConfigReader implements ExternalParsersConfigReaderMetKeys { - - public static List<ExternalParser> read(InputStream stream) throws TikaException, IOException { - try { - DocumentBuilder builder = XMLReaderUtils.getDocumentBuilder(); - Document document = builder.parse(new InputSource(stream)); - return read(document); - } catch (SAXException e) { - throw new TikaException("Invalid parser configuration", e); - } - } - - public static List<ExternalParser> read(Document document) throws TikaException, IOException { - return read(document.getDocumentElement()); - } - - public static List<ExternalParser> read(Element element) throws TikaException, IOException { - List<ExternalParser> parsers = new ArrayList<>(); - - if (element != null && element.getTagName().equals(EXTERNAL_PARSERS_TAG)) { - NodeList nodes = element.getChildNodes(); - for (int i = 0; i < nodes.getLength(); i++) { - Node node = nodes.item(i); - if (node.getNodeType() == Node.ELEMENT_NODE) { - Element child = (Element) node; - if (child.getTagName().equals(PARSER_TAG)) { - ExternalParser p = readParser(child); - if (p != null) { - parsers.add(p); - } - } - } - } - } else { - throw new MimeTypeException( - "Not a <" + EXTERNAL_PARSERS_TAG + "/> configuration document: " + - (element != null ? element.getTagName() : "n/a")); - } - - return parsers; - } - - /** - * Builds and Returns an ExternalParser, or null if a check - * command was given that didn't match. 
- */ - private static ExternalParser readParser(Element parserDef) throws TikaException { - ExternalParser parser = new ExternalParser(); - - NodeList children = parserDef.getChildNodes(); - for (int i = 0; i < children.getLength(); i++) { - Node node = children.item(i); - if (node.getNodeType() == Node.ELEMENT_NODE) { - Element child = (Element) node; - switch (child.getTagName()) { - case CHECK_TAG: - boolean present = readCheckTagAndCheck(child); - if (!present) { - return null; - } - break; - case COMMAND_TAG: - parser.setCommand(getString(child)); - break; - case MIMETYPES_TAG: - parser.setSupportedTypes(readMimeTypes(child)); - break; - case METADATA_TAG: - parser.setMetadataExtractionPatterns(readMetadataPatterns(child)); - break; - default: - throw new IllegalArgumentException("reaction not defined for " + child.getTagName()); - } - } - } - - return parser; - } - - private static Set<MediaType> readMimeTypes(Element mimeTypes) { - Set<MediaType> types = new HashSet<>(); - - NodeList children = mimeTypes.getChildNodes(); - for (int i = 0; i < children.getLength(); i++) { - Node node = children.item(i); - if (node.getNodeType() == Node.ELEMENT_NODE) { - Element child = (Element) node; - if (child.getTagName().equals(MIMETYPE_TAG)) { - types.add(MediaType.parse(getString(child))); - } - } - } - - return types; - } - - private static Map<Pattern, String> readMetadataPatterns(Element metadataDef) { - Map<Pattern, String> metadata = new HashMap<>(); - - NodeList children = metadataDef.getChildNodes(); - for (int i = 0; i < children.getLength(); i++) { - Node node = children.item(i); - if (node.getNodeType() == Node.ELEMENT_NODE) { - Element child = (Element) node; - if (child.getTagName().equals(METADATA_MATCH_TAG)) { - String metadataKey = child.getAttribute(METADATA_KEY_ATTR); - Pattern pattern = Pattern.compile(getString(child)); - metadata.put(pattern, metadataKey); - } - } - } - - return metadata; - } - - private static boolean readCheckTagAndCheck(Element 
checkDef) { - String command = null; - List<Integer> errorVals = new ArrayList<>(); - - NodeList children = checkDef.getChildNodes(); - for (int i = 0; i < children.getLength(); i++) { - Node node = children.item(i); - if (node.getNodeType() == Node.ELEMENT_NODE) { - Element child = (Element) node; - if (child.getTagName().equals(COMMAND_TAG)) { - command = getString(child); - } - if (child.getTagName().equals(ERROR_CODES_TAG)) { - String errs = getString(child); - StringTokenizer st = new StringTokenizer(errs, ","); - while (st.hasMoreElements()) { - try { - String s = st.nextToken(); - errorVals.add(Integer.parseInt(s)); - } catch (NumberFormatException e) { - //swallow - } - } - } - } - } - - if (command != null) { - String[] theCommand = command.split(" "); - int[] errVals = new int[errorVals.size()]; - for (int i = 0; i < errVals.length; i++) { - errVals[i] = errorVals.get(i); - } - - return ExternalParser.check(theCommand, errVals); - } - - // No check command, so assume it's there - return true; - } - - private static String getString(Element element) { - StringBuilder s = new StringBuilder(); - - NodeList children = element.getChildNodes(); - for (int i = 0; i < children.getLength(); i++) { - Node node = children.item(i); - if (node.getNodeType() == Node.TEXT_NODE) { - s.append(node.getNodeValue()); - } - } - - return s.toString(); - } -} diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java deleted file mode 100644 index 86369c6cd7..0000000000 --- a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersConfigReaderMetKeys.java +++ /dev/null @@ -1,43 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. 
- * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.external; - -/** - * Met Keys used by the {@link ExternalParsersConfigReader}. - */ -public interface ExternalParsersConfigReaderMetKeys { - - String EXTERNAL_PARSERS_TAG = "external-parsers"; - - String PARSER_TAG = "parser"; - - String COMMAND_TAG = "command"; - - String CHECK_TAG = "check"; - - String ERROR_CODES_TAG = "error-codes"; - - String MIMETYPES_TAG = "mime-types"; - - String MIMETYPE_TAG = "mime-type"; - - String METADATA_TAG = "metadata"; - - String METADATA_MATCH_TAG = "match"; - - String METADATA_KEY_ATTR = "key"; -} diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java b/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java deleted file mode 100644 index 4822a79c08..0000000000 --- a/tika-core/src/main/java/org/apache/tika/parser/external/ExternalParsersFactory.java +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. 
You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.external; - -import java.io.IOException; -import java.io.InputStream; -import java.net.URL; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Enumeration; -import java.util.List; - -import org.apache.tika.config.ServiceLoader; -import org.apache.tika.exception.TikaException; - -/** - * Creates instances of ExternalParser based on XML - * configuration files. - * - * @see ExternalParsersConfigReader - */ -public class ExternalParsersFactory { - - public static List<ExternalParser> create() throws IOException, TikaException { - return create(new ServiceLoader()); - } - - public static List<ExternalParser> create(ServiceLoader loader) - throws IOException, TikaException { - return create("tika-external-parsers.xml", loader); - } - - public static List<ExternalParser> create(String filename, ServiceLoader loader) - throws IOException, TikaException { - String filepath = - ExternalParsersFactory.class.getPackage().getName().replace('.', '/') + "/" + - filename; - Enumeration<URL> files = loader.findServiceResources(filepath); - ArrayList<URL> list = Collections.list(files); - URL[] urls = list.toArray(new URL[0]); - return create(urls); - } - - public static List<ExternalParser> create(URL... 
urls) throws IOException, TikaException { - List<ExternalParser> parsers = new ArrayList<>(); - for (URL url : urls) { - try (InputStream stream = url.openStream()) { - parsers.addAll(ExternalParsersConfigReader.read(stream)); - } - } - return parsers; - } -} diff --git a/tika-core/src/main/java/org/apache/tika/parser/external/package-info.java b/tika-core/src/main/java/org/apache/tika/parser/external/package-info.java deleted file mode 100644 index 4ee27b9d65..0000000000 --- a/tika-core/src/main/java/org/apache/tika/parser/external/package-info.java +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/** - * External parser process. - */ -@aQute.bnd.annotation.Version("1.0.0") -package org.apache.tika.parser.external; diff --git a/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java b/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java deleted file mode 100644 index 1c87ecaf99..0000000000 --- a/tika-core/src/main/java/org/apache/tika/parser/external2/ExternalParser.java +++ /dev/null @@ -1,227 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements.
See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.external2; - -import java.io.BufferedReader; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.ArrayList; -import java.util.HashSet; -import java.util.List; -import java.util.Set; -import java.util.regex.Matcher; -import java.util.regex.Pattern; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.xml.sax.ContentHandler; -import org.xml.sax.SAXException; - -import org.apache.tika.config.ConfigDeserializer; -import org.apache.tika.config.JsonConfig; -import org.apache.tika.config.TikaComponent; -import org.apache.tika.config.TikaProgressTracker; -import org.apache.tika.config.TimeoutLimits; -import org.apache.tika.exception.TikaException; -import org.apache.tika.io.TemporaryResources; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.ExternalProcess; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.EmptyParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.sax.BodyContentHandler; -import org.apache.tika.sax.XHTMLContentHandler; -import org.apache.tika.utils.FileProcessResult; 
-import org.apache.tika.utils.ProcessUtils; - -/** - * This is a next generation external parser that uses some of the more - * recent additions to Tika. This is an experimental alternative to the - * {@link org.apache.tika.parser.external.ExternalParser}. - * Specifically, it relies more on configuration than the SPI model. - * Further, users can specify a parser to handle the output - * of the external process. - */ -@TikaComponent -public class ExternalParser implements Parser { - - public static final long DEFAULT_TIMEOUT_MS = 60000; - - public static final String INPUT_FILE_TOKEN = "${INPUT_FILE}"; - - public static final String OUTPUT_FILE_TOKEN = "${OUTPUT_FILE}"; - - private static Pattern INPUT_TOKEN_MATCHER = Pattern.compile("\\$\\{INPUT_FILE}"); - private static Pattern OUTPUT_TOKEN_MATCHER = Pattern.compile("\\$\\{OUTPUT_FILE}"); - - private static final Logger LOG = LoggerFactory.getLogger(ExternalParser.class); - - private final ExternalParserConfig config; - - // Cached values derived from config - private final Set<MediaType> supportedTypes; - private final List<String> commandLine; - private final Parser outputParser; - - /** - * Default constructor - not typically useful since ExternalParser requires configuration. - */ - public ExternalParser() { - this(new ExternalParserConfig()); - } - - /** - * Programmatic constructor with typed config. - */ - public ExternalParser(ExternalParserConfig config) { - this.config = config; - this.supportedTypes = new HashSet<>(); - for (String s : config.getSupportedTypes()) { - this.supportedTypes.add(MediaType.parse(s)); - } - this.commandLine = new ArrayList<>(config.getCommandLine()); - this.outputParser = config.getOutputParser() != null ? - config.getOutputParser() : EmptyParser.INSTANCE; - } - - /** - * JSON config constructor - used for deserialization. 
- */ - public ExternalParser(JsonConfig jsonConfig) { - this(ConfigDeserializer.buildConfig(jsonConfig, ExternalParserConfig.class)); - } - - @Override - public Set<MediaType> getSupportedTypes(ParseContext context) { - return supportedTypes; - } - - @Override - public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata, - ParseContext context) throws IOException, SAXException, TikaException { - //this may remain null, depending on whether the external parser writes to a file - Path outFile = null; - try (TemporaryResources tmp = new TemporaryResources()) { - Path p = tis.getPath(); - List<String> thisCommandLine = new ArrayList<>(); - Matcher inputMatcher = INPUT_TOKEN_MATCHER.matcher(""); - Matcher outputMatcher = OUTPUT_TOKEN_MATCHER.matcher(""); - boolean outputFileInCommandline = false; - for (String c : commandLine) { - if (inputMatcher.reset(c).find()) { - String updated = c.replace(INPUT_FILE_TOKEN, - ProcessUtils.escapeCommandLine(p.toAbsolutePath().toString())); - thisCommandLine.add(updated); - } else if (outputMatcher.reset(c).find()) { - outFile = Files.createTempFile("tika-external2-", ""); - String updated = c.replace(OUTPUT_FILE_TOKEN, - ProcessUtils.escapeCommandLine(outFile.toAbsolutePath().toString())); - thisCommandLine.add(updated); - outputFileInCommandline = true; - } else { - thisCommandLine.add(c); - } - } - FileProcessResult result = null; - long localTimeoutMillis = TimeoutLimits.getProcessTimeoutMillis(context, config.getTimeoutMs()); - if (outputFileInCommandline) { - result = ProcessUtils.execute(new ProcessBuilder(thisCommandLine), - localTimeoutMillis, config.getMaxStdOut(), config.getMaxStdErr()); - } else { - outFile = Files.createTempFile("tika-external2-", ""); - result = ProcessUtils.execute(new ProcessBuilder(thisCommandLine), - localTimeoutMillis, outFile, config.getMaxStdErr()); - } - metadata.set(ExternalProcess.IS_TIMEOUT, result.isTimeout()); - metadata.set(ExternalProcess.EXIT_VALUE, 
result.getExitValue()); - TikaProgressTracker.update(context); - metadata.set(ExternalProcess.STD_OUT_LENGTH, result.getStdoutLength()); - metadata.set(ExternalProcess.STD_OUT_IS_TRUNCATED, - result.isStdoutTruncated()); - metadata.set(ExternalProcess.STD_ERR_LENGTH, result.getStderrLength()); - metadata.set(ExternalProcess.STD_ERR_IS_TRUNCATED, - result.isStderrTruncated()); - - if (config.isReturnStdout()) { - metadata.set(ExternalProcess.STD_OUT, result.getStdout()); - } - if (config.isReturnStderr()) { - metadata.set(ExternalProcess.STD_ERR, result.getStderr()); - } - XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata, context); - xhtml.startDocument(); - handleOutput(result, outFile, xhtml, metadata, context); - xhtml.endDocument(); - } finally { - if (outFile != null) { - Files.delete(outFile); - } - } - } - - private void handleOutput(FileProcessResult result, Path outFile, - XHTMLContentHandler xhtml, Metadata metadata, - ParseContext parseContext) throws SAXException, TikaException, - IOException { - if (outputParser == EmptyParser.INSTANCE) { - if (outFile != null) { - try (BufferedReader reader = Files.newBufferedReader(outFile)) { - String line = reader.readLine(); - while (line != null) { - //do we want to wrap this in <p></p> elements? - xhtml.characters(line); - xhtml.newline(); - line = reader.readLine(); - } - } - } else { - //read this in line by line and wrap <p></p> elements? - xhtml.characters(result.getStdout()); - } - } else { - if (outFile != null) { - try (TikaInputStream tis = TikaInputStream.get(outFile)) { - outputParser.parse(tis, new BodyContentHandler(xhtml), metadata, parseContext); - } - } else { - try (TikaInputStream tis = TikaInputStream.get( - result.getStdout().getBytes(StandardCharsets.UTF_8))) { - outputParser.parse(tis, new BodyContentHandler(xhtml), metadata, parseContext); - } - } - } - - } - - /** - * Returns the output parser used to parse the external process output. 
- */ - public Parser getOutputParser() { - return outputParser; - } - - /** - * Returns the configuration for this parser. - */ - public ExternalParserConfig getConfig() { - return config; - } -} diff --git a/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java b/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java index 5ee5865fe1..eb983ec7de 100644 --- a/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java +++ b/tika-core/src/main/java/org/apache/tika/utils/ProcessUtils.java @@ -23,9 +23,14 @@ import java.nio.file.Path; import java.util.UUID; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; public class ProcessUtils { + private static final Logger LOG = LoggerFactory.getLogger(ProcessUtils.class); private static final ConcurrentHashMap<String, Process> PROCESS_MAP = new ConcurrentHashMap<>(); @@ -219,4 +224,73 @@ public class ProcessUtils { } + /** + * Checks to see if the command can be run. Typically used with + * something like "myapp --version" to check to see if "myapp" + * is installed and on the path. + * + * @param checkCmd The check command to run + * @param errorValue What is considered an error value? Default is 127 (command not found). + * @return true if the command ran successfully (exit code not in errorValue list) + */ + public static boolean checkCommand(String checkCmd, int... errorValue) { + return checkCommand(new String[]{checkCmd}, errorValue); + } + + /** + * Checks to see if the command can be run. Typically used with + * something like {@code new String[]{"myapp", "--version"}} to check to see if "myapp" + * is installed and on the path. + * + * @param checkCmd The check command to run + * @param errorValue What is considered an error value? Default is 127 (command not found). 
+ * @return true if the command ran successfully (exit code not in errorValue list) + */ + public static boolean checkCommand(String[] checkCmd, int... errorValue) { + if (errorValue.length == 0) { + errorValue = new int[]{127}; + } + + Process process = null; + try { + process = Runtime.getRuntime().exec(checkCmd); + StreamGobbler outGobbler = new StreamGobbler(process.getInputStream(), 0); + StreamGobbler errGobbler = new StreamGobbler(process.getErrorStream(), 0); + Thread outThread = new Thread(outGobbler); + Thread errThread = new Thread(errGobbler); + outThread.start(); + errThread.start(); + boolean finished = process.waitFor(60000, TimeUnit.MILLISECONDS); + if (!finished) { + throw new TimeoutException(); + } + outThread.join(1000); + errThread.join(1000); + int result = process.exitValue(); + LOG.debug("exit value for {}: {}", checkCmd[0], result); + for (int err : errorValue) { + if (result == err) { + return false; + } + } + return true; + } catch (IOException | InterruptedException | TimeoutException e) { + LOG.debug("exception trying to run " + checkCmd[0], e); + return false; + } catch (SecurityException se) { + throw se; + } catch (Error err) { + if (err.getMessage() != null && (err.getMessage().contains("posix_spawn") || + err.getMessage().contains("UNIXProcess"))) { + LOG.debug("(TIKA-1526): exception trying to run: " + checkCmd[0], err); + return false; + } + throw err; + } finally { + if (process != null) { + process.destroyForcibly(); + } + } + } + } diff --git a/tika-core/src/main/resources/org/apache/tika/parser/external/tika-external-parsers.xml b/tika-core/src/main/resources/org/apache/tika/parser/external/tika-external-parsers.xml deleted file mode 100644 index 9a1f356834..0000000000 --- a/tika-core/src/main/resources/org/apache/tika/parser/external/tika-external-parsers.xml +++ /dev/null @@ -1,117 +0,0 @@ -<?xml version="1.0" encoding="UTF-8"?> -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license 
agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> -<!-- - Description: This xml file defines external commands to be run by Tika - as parsers. ---> -<external-parsers> - <!-- This example uses ffmpeg for video metadata extraction --> - <parser> - <check> - <command>ffmpeg -version</command> - <error-codes>126,127</error-codes> - </check> - <command>ffmpeg -i ${INPUT}</command> - <mime-types> - <mime-type>video/avi</mime-type> - <mime-type>video/mpeg</mime-type> - <mime-type>video/x-msvideo</mime-type> - </mime-types> - <metadata> - <match key="xmpDM:audioSampleRate">\s*Stream.*:.+Audio:.*,\s+(\d+)\s+Hz,.*</match> - <match key="xmpDM:audioChannelType">\s*Stream.*:.+Audio:.*\d+\s+Hz,\s+(\d{1,2})\s+channels.*</match> - <match key="xmpDM:audioCompressor">\s*Stream.*:.+Audio:\s+([A-Za-z0-9_\(\)/\[\] ]+),.*</match> - <match key="xmpDM:duration">\s*Duration:\s*([0-9:\.]+),.*</match> - <match key="xmpDM:fileDataRate">\s*Duration:.*,\s*bitrate:\s+([0-9A-Za-z/ ]+).*</match> - <match key="xmpDM:videoColorSpace">\s*Stream.*:\s+Video:\s+[A-Za-z0-9\(\)/ ]+,\s+([A-Za-z0-9\(\) ,]+),\s+[0-9x]+,.*</match> - <match key="xmpDM:videoCompressor">\s*Stream.*:\s+Video:\s+([A-Za-z0-9\(\)/ ]+),.*</match> - <match key="xmpDM:videoFrameRate">\s*Stream.*:\s+Video:.*,\s+([0-9]+)\s+fps,.*</match> - <match key="encoder">\s*encoder\s*\:\s*(\w+).*</match> - <match 
key="videoResolution">\s*Stream.*:\s+Video:.*,\s+([0-9x]+),.*</match> - </metadata> - </parser> - <parser> - <check> - <command>exiftool -ver</command> - <error-codes>126,127</error-codes> - </check> - <command>env FOO=${OUTPUT} exiftool ${INPUT}</command> - <mime-types> - <mime-type>video/avi</mime-type> - <mime-type>video/mpeg</mime-type> - <mime-type>video/x-msvideo</mime-type> - <mime-type>video/mp4</mime-type> - </mime-types> - <metadata> - <match>\s*([A-Za-z0-9/ \(\)]+\S{1})\s+:\s+([A-Za-z0-9\(\)\[\] \:\-\.]+)\s*</match> - </metadata> - </parser> - <parser> - <check> - <command>sox --version</command> - <error-codes>126,127</error-codes> - </check> - <command>env FOO=${OUTPUT} sox --info ${INPUT}</command> - <mime-types> - <mime-type>audio/3gpp</mime-type> - <mime-type>audio/3gpp2</mime-type> - <mime-type>audio/aac</mime-type> - <mime-type>audio/ac3</mime-type> - <mime-type>audio/basic</mime-type> - <mime-type>audio/L24</mime-type> - <mime-type>audio/mid</mime-type> - <mime-type>audio/mpeg</mime-type> - <mime-type>audio/mpeg3</mime-type> - <mime-type>audio/x-mpeg-3</mime-type> - <mime-type>audio/mpeg4-generic</mime-type> - <mime-type>audio/mp4</mime-type> - <mime-type>audio/mp3</mime-type> - <mime-type>audio/x-aiff</mime-type> - <mime-type>audio/PCMA</mime-type> - <mime-type>audio/PCMA-WB</mime-type> - <mime-type>audio/PCMU</mime-type> - <mime-type>audio/PCMU-WB</mime-type> - <mime-type>audio/ogg</mime-type> - <mime-type>audio/vorbis</mime-type> - <mime-type>audio/vnd.wav</mime-type> - <mime-type>audio/vnd.wave</mime-type> - <mime-type>audio/vnd.rn-realaudio</mime-type> - <mime-type>audio/wav</mime-type> - <mime-type>audio/wave</mime-type> - <mime-type>audio/x-wav</mime-type> - </mime-types> - <metadata> - <!-- Channels : 1 --> - <match key="xmpDM:audioChannelType">\s*Channels.*:\s+(\d+)\s*</match> - <!-- Sample Rate : 44100 --> - <match key="xmpDM:audioSampleRate">\s*Sample Rate.*:\s+(\d+)\s*</match> - <!-- Precision : 16-bit --> - <match 
key="xmpDM:audioSampleType">\s*Precision.*:\s+([\d\w-]+)\s*</match> - <!-- Duration : 00:00:02.50 = 110298 samples = 187.582 CDDA sectors --> - <match key="xmpDM:duration">\s*Duration.*:\s+([\d:\.]+)\s*</match> - <!-- File Size : 221k --> - <match key="File Size">\s*File Size.*:\s+([\d\w]+)\s*</match> - <!-- Bit Rate : 706k --> - <match key="xmpDM:fileDataRate">\s*Bit Rate.*:\s+([\d\w]+)\s*</match> - <!-- Sample Encoding: 16-bit Signed Integer PCM --> - <match key="Sample Encoding">\s*Sample Encoding.*:\s+(.*)\s*</match> - <!-- Comment : 'Comment=Processed by SoX' --> - <match key="xmpDM:logComment">\s*Comment.*:\s+(.*)\s*</match> - </metadata> - </parser> -</external-parsers> diff --git a/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java b/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java index 3b77375622..a8f49a2fbe 100644 --- a/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java +++ b/tika-detectors/tika-detector-siegfried/src/main/java/org/apache/tika/detect/siegfried/SiegfriedDetector.java @@ -35,7 +35,6 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.Property; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.utils.FileProcessResult; import org.apache.tika.utils.ProcessUtils; import org.apache.tika.utils.StringUtils; @@ -170,7 +169,7 @@ public class SiegfriedDetector implements Detector { public static boolean checkHasSiegfried(String siegfriedCommandPath) { String[] commandline = new String[]{siegfriedCommandPath, "-version"}; - return ExternalParser.check(commandline); + return ProcessUtils.checkCommand(commandline); } /** diff --git 
a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java index 6f1a071419..68b5712e8f 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/main/java/org/apache/tika/parser/gdal/GDALParser.java @@ -49,7 +49,6 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; -import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.sax.XHTMLContentHandler; import org.apache.tika.utils.FileProcessResult; import org.apache.tika.utils.ProcessUtils; @@ -152,7 +151,7 @@ public class GDALParser implements Parser { private long timeoutMs = DEFAULT_TIMEOUT_MS; public GDALParser() { - setCommand("gdalinfo ${INPUT}"); + setCommand("gdalinfo ${INPUT_FILE}"); } public String getCommand() { @@ -185,7 +184,7 @@ public class GDALParser implements Parser { public void parse(TikaInputStream tis, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException { - if (!ExternalParser.check("gdalinfo")) { + if (!ProcessUtils.checkCommand("gdalinfo")) { return; } diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java index e6a8b00f51..13fd87d021 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java +++ 
b/tika-parsers/tika-parsers-extended/tika-parser-scientific-module/src/test/java/org/apache/tika/parser/gdal/TestGDALParser.java @@ -28,8 +28,8 @@ import org.apache.tika.TikaTest; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.utils.ProcessUtils; /** * Test harness for the GDAL parser. @@ -39,7 +39,7 @@ public class TestGDALParser extends TikaTest { private boolean canRun() { String[] checkCmd = {"gdalinfo"}; // If GDAL is not on the path, do not run the test. - return ExternalParser.check(checkCmd); + return ProcessUtils.checkCommand(checkCmd); } @Test @@ -139,7 +139,7 @@ public class TestGDALParser extends TikaTest { // If the exit code is 1 (meaning FITS isn't supported by the installed version of // gdalinfo, don't run this test. String[] fitsCommand = {"gdalinfo", getResourceAsUrl(fitsFilename).getPath()}; - assumeTrue(ExternalParser.check(fitsCommand, 1)); + assumeTrue(ProcessUtils.checkCommand(fitsCommand, 1)); String expectedAllgMin = "-7.319537E1"; String expectedAtodcorr = "COMPLETE"; diff --git a/tika-parsers/tika-parsers-extended/tika-parser-scientific-package/src/test/java/org/apache/tika/parser/scientific/integration/TestParsers.java b/tika-parsers/tika-parsers-extended/tika-parser-scientific-package/src/test/java/org/apache/tika/parser/scientific/integration/TestParsers.java index cb2d2236b0..99e382c07d 100644 --- a/tika-parsers/tika-parsers-extended/tika-parser-scientific-package/src/test/java/org/apache/tika/parser/scientific/integration/TestParsers.java +++ b/tika-parsers/tika-parsers-extended/tika-parser-scientific-package/src/test/java/org/apache/tika/parser/scientific/integration/TestParsers.java @@ -30,10 +30,8 @@ import org.junit.jupiter.api.Test; import org.apache.tika.exception.TikaException; import org.apache.tika.mime.MediaType; 
-import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.DefaultParser; import org.apache.tika.parser.Parser; -import org.apache.tika.parser.external.CompositeExternalParser; import org.apache.tika.parser.ocr.TesseractOCRParser; /** @@ -58,11 +56,6 @@ public class TestParsers { } int checked = 0; - //The initial lists were developed with exiftool installed. We have since - //modified the 2.4.1-* files to act as if no exiftool is installed. - //However, on systems with ffmpeg or exiftool installed, we need - //to override those file formats - CompositeParser externalParser = (CompositeParser) new CompositeExternalParser(); try (BufferedReader reader = new BufferedReader(new InputStreamReader( getClass().getResourceAsStream(path241), @@ -73,10 +66,6 @@ public class TestParsers { String mediaType = data[0]; String parserClass = data[1]; - Parser external = externalParser.getParsers().get(MediaType.parse(mediaType)); - if (external != null) { - parserClass = externalParser.getClass().toString(); - } assertEquals(parserClass, currentDefault.get(mediaType), "for mediaType '" + mediaType + "'"); checked++; diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParserConfig.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParserConfig.java index 9459173d6e..9a1b47d6fe 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParserConfig.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParserConfig.java @@ -28,7 +28,7 @@ import org.slf4j.LoggerFactory; import org.apache.tika.config.Initializable; import org.apache.tika.exception.TikaConfigException; -import 
org.apache.tika.parser.external.ExternalParser; +import org.apache.tika.utils.ProcessUtils; import org.apache.tika.utils.StringUtils; public class DWGParserConfig implements Serializable, Initializable { @@ -62,7 +62,7 @@ public class DWGParserConfig implements Serializable, Initializable { // Try running DWGRead from there, and see if it exists + works String[] checkCmd = { dwgRead }; - boolean hasDwgRead = ExternalParser.check(checkCmd); + boolean hasDwgRead = ProcessUtils.checkCommand(checkCmd); LOG.debug("hasDwgRead (path: " + Arrays.toString(checkCmd) + "): " + hasDwgRead); return hasDwgRead; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java index 0354b15155..e8c9c2d848 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/test/java/org/apache/tika/parser/dwg/DWGParserTest.java @@ -39,8 +39,8 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.CompositeParser; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.utils.ProcessUtils; import org.apache.tika.utils.StringUtils; public class DWGParserTest extends TikaTest { @@ -53,7 +53,7 @@ public class DWGParserTest extends TikaTest { // Try running DWGRead from there, and see if it exists + works String[] checkCmd = { dwgRead }; - return ExternalParser.check(checkCmd); + return ProcessUtils.checkCommand(checkCmd); } @Test diff --git 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java index 2639b457ae..27cf577229 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-ocr-module/src/main/java/org/apache/tika/parser/ocr/TesseractOCRParser.java @@ -75,11 +75,11 @@ import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.AbstractExternalProcessParser; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.sax.BodyContentHandler; import org.apache.tika.sax.EmbeddedContentHandler; import org.apache.tika.sax.TeeContentHandler; import org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.ProcessUtils; import org.apache.tika.utils.StringUtils; import org.apache.tika.utils.XMLReaderUtils; @@ -209,7 +209,7 @@ public class TesseractOCRParser extends AbstractExternalProcessParser implements // Try running Tesseract from there, and see if it exists + works String[] checkCmd = {tesseract}; - boolean hasTesseract = ExternalParser.check(checkCmd); + boolean hasTesseract = ProcessUtils.checkCommand(checkCmd); LOG.debug("hasTesseract (path: " + Arrays.toString(checkCmd) + "): " + hasTesseract); return hasTesseract; } @@ -231,7 +231,7 @@ public class TesseractOCRParser extends AbstractExternalProcessParser implements // Try running ImageMagick program from there, and see if it exists + works String[] checkCmd = {fullImageMagickPath}; - this.hasImageMagick = ExternalParser.check(checkCmd); + this.hasImageMagick = 
ProcessUtils.checkCommand(checkCmd); if (!this.hasImageMagick) { LOG.debug("ImageMagick does not appear to be installed " + "(commandline: " + fullImageMagickPath + ")"); diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/renderer/pdf/poppler/PopplerRendererTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/renderer/pdf/poppler/PopplerRendererTest.java index 69316a74a8..213647eba6 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/renderer/pdf/poppler/PopplerRendererTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/test/java/org/apache/tika/renderer/pdf/poppler/PopplerRendererTest.java @@ -34,10 +34,10 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.TikaPagedText; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.renderer.PageBasedRenderResults; import org.apache.tika.renderer.PageRangeRequest; import org.apache.tika.renderer.RenderResult; +import org.apache.tika.utils.ProcessUtils; public class PopplerRendererTest { @@ -45,7 +45,7 @@ public class PopplerRendererTest { @BeforeAll static void checkPoppler() { - hasPoppler = ExternalParser.check(new String[]{"pdftoppm", "-v"}); + hasPoppler = ProcessUtils.checkCommand(new String[]{"pdftoppm", "-v"}); } @Test diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java index 903b764817..3be4f2fa29 100644 --- 
a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pkg-module/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java @@ -26,8 +26,8 @@ import org.apache.tika.exception.EncryptedDocumentException; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.Parser; -import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.utils.ProcessUtils; /** * Test case for parsing unrar files. @@ -40,7 +40,7 @@ public class UnrarParserTest extends AbstractPkgTest { */ @Test public void testEncryptedRar() throws Exception { - assumeTrue(ExternalParser.check("unrar")); + assumeTrue(ProcessUtils.checkCommand("unrar")); Parser parser = new UnrarParser(); try (TikaInputStream tis = getResourceAsStream("/test-documents/test-documents-enc.rar")) { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java index cd863a718c..5d0ecc8023 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/main/java/org/apache/tika/parser/strings/StringsParser.java @@ -49,8 +49,8 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; -import org.apache.tika.parser.external.ExternalParser; import 
org.apache.tika.sax.XHTMLContentHandler; +import org.apache.tika.utils.ProcessUtils; import org.apache.tika.utils.SystemUtils; /** @@ -146,7 +146,7 @@ public class StringsParser implements Parser, Initializable { String[] checkCmd = {stringsProg, "--version"}; try { - stringsPresent = ExternalParser.check(checkCmd); + stringsPresent = ProcessUtils.checkCommand(checkCmd); if (!stringsPresent) { return; } @@ -157,7 +157,7 @@ public class StringsParser implements Parser, Initializable { "/dev/null"}; int[] errorValues = {1, 2}; // Exit status code: 1 = general error; 2 = incorrect usage. - hasEncodingOption = ExternalParser.check(checkOpt, errorValues); + hasEncodingOption = ProcessUtils.checkCommand(checkOpt, errorValues); } } catch (NoClassDefFoundError ncdfe) { // This happens under OSGi + Fork Parser - see TIKA-1507 diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java index 76372396ad..9a08aebc59 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-text-module/src/test/java/org/apache/tika/parser/strings/StringsParserTest.java @@ -29,13 +29,13 @@ import org.apache.tika.TikaTest; import org.apache.tika.io.TikaInputStream; import org.apache.tika.metadata.Metadata; import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.sax.BodyContentHandler; +import org.apache.tika.utils.ProcessUtils; public class StringsParserTest extends TikaTest { public static boolean canRun() { String[] checkCmd = {new 
StringsParser().getDefaultConfig().getStringsPath() + getStringsProg(), "--version"}; - return ExternalParser.check(checkCmd); + return ProcessUtils.checkCommand(checkCmd); } @Test diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/main/resources/META-INF/services/org.apache.tika.parser.Parser deleted file mode 100644 index 37f87a4595..0000000000 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/main/resources/META-INF/services/org.apache.tika.parser.Parser +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -org.apache.tika.parser.external.CompositeExternalParser \ No newline at end of file diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java index 8f9b957e90..dea1a9bc09 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java @@ -43,7 +43,6 @@ import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.metadata.XMPDM; import org.apache.tika.mime.MediaType; -import org.apache.tika.parser.external.CompositeExternalParser; import org.apache.tika.parser.ogg.FlacParser; import org.apache.tika.parser.ogg.OpusParser; import org.apache.tika.parser.ogg.VorbisParser; @@ -407,11 +406,6 @@ public class AutoDetectParserTest extends TikaTest { } } - @Test - public void testExternalParserIsLoaded() { - Parser p = find((CompositeParser) AUTO_DETECT_PARSER, CompositeExternalParser.class); - assertNotNull(p); - } @Test public void testWriteLimit() throws Exception { diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java index fa160184b8..8765905ecb 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java @@ -61,13 +61,13 @@ import org.apache.tika.mime.MediaType; import 
org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; import org.apache.tika.parser.RecursiveParserWrapper; -import org.apache.tika.parser.external.ExternalParser; import org.apache.tika.parser.image.ImageMetadataExtractor; import org.apache.tika.parser.ocr.TesseractOCRConfig; import org.apache.tika.parser.ocr.TesseractOCRParser; import org.apache.tika.parser.xml.XMLProfiler; import org.apache.tika.sax.BasicContentHandlerFactory; import org.apache.tika.sax.RecursiveParserWrapperHandler; +import org.apache.tika.utils.ProcessUtils; import org.apache.tika.utils.StringUtils; public class PDFParserTest extends TikaTest { @@ -94,7 +94,7 @@ public class PDFParserTest extends TikaTest { if (hasPoppler != null) { return hasPoppler; } - hasPoppler = ExternalParser.check(new String[]{"pdftoppm", "-v"}); + hasPoppler = ProcessUtils.checkCommand(new String[]{"pdftoppm", "-v"}); return hasPoppler; } diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java index 8c2aea0b7f..c671e01049 100644 --- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java +++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/pkg/UnrarParserTest.java @@ -27,7 +27,7 @@ import org.apache.tika.TikaLoaderHelper; import org.apache.tika.metadata.Metadata; import org.apache.tika.metadata.TikaCoreProperties; import org.apache.tika.parser.Parser; -import org.apache.tika.parser.external.ExternalParser; +import org.apache.tika.utils.ProcessUtils; /** @@ -41,7 +41,7 @@ public class UnrarParserTest extends AbstractPkgTest { */ @Test public void testEmbedded() throws Exception { - assumeTrue(ExternalParser.check("unrar")); + 
assumeTrue(ProcessUtils.checkCommand("unrar")); // Expected embedded resources in test-documents.rar file. String[] expectedResources = { "testHTML.html", "testEXCEL.xls", "testOpenOffice2.odt", "testPDF.pdf", diff --git a/tika-serialization/src/test/java/org/apache/tika/parser/external/ExternalParserTest.java b/tika-serialization/src/test/java/org/apache/tika/parser/external/ExternalParserTest.java new file mode 100644 index 0000000000..7cdbca5586 --- /dev/null +++ b/tika-serialization/src/test/java/org/apache/tika/parser/external/ExternalParserTest.java @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.tika.parser.external; + +import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; +import static org.junit.jupiter.api.Assertions.assertTrue; +import static org.junit.jupiter.api.Assumptions.assumeTrue; + +import java.nio.charset.StandardCharsets; +import java.util.List; + +import org.junit.jupiter.api.Test; +import org.xml.sax.ContentHandler; +import org.xml.sax.helpers.DefaultHandler; + +import org.apache.tika.TikaTest; +import org.apache.tika.config.loader.TikaLoader; +import org.apache.tika.io.TikaInputStream; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.CompositeParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.Parser; +import org.apache.tika.parser.ParserDecorator; +import org.apache.tika.parser.RegexCaptureParser; + +public class ExternalParserTest extends TikaTest { + + @Test + public void testConfigRegexCaptureParser() throws Exception { + assumeTrue(org.apache.tika.utils.ProcessUtils.checkCommand(new String[]{ + "file", "--version" + })); + TikaLoader loader = TikaLoader.load(getConfigPath(getClass(), "TIKA-3557.json")); + CompositeParser p = (CompositeParser) loader.get(Parser.class); + assertEquals(1, p.getAllComponentParsers().size()); + Parser parser = p.getAllComponentParsers().get(0); + // When _mime-include is used, the parser is wrapped in a ParserDecorator + ExternalParser externalParser = (parser instanceof ParserDecorator) + ? 
(ExternalParser) ((ParserDecorator) parser).getWrappedParser() + : (ExternalParser) parser; + + Parser outputParser = externalParser.getOutputParser(); + assertEquals(RegexCaptureParser.class, outputParser.getClass()); + + Metadata m = new Metadata(); + ContentHandler contentHandler = new DefaultHandler(); + String output = "Something\n" + + "Title: the quick brown fox\n" + + "Author: jumped over\n" + + "Created: 10/20/2024"; + try (TikaInputStream tis = TikaInputStream.get(output.getBytes(StandardCharsets.UTF_8))) { + outputParser.parse(tis, contentHandler, m, new ParseContext()); + } + assertEquals("the quick brown fox", m.get("title")); + } + + @Test + public void testConfigBasic() throws Exception { + assumeTrue(org.apache.tika.utils.ProcessUtils.checkCommand(new String[]{"file", "--version"})); + TikaLoader loader = TikaLoader.load(getConfigPath(getClass(), "TIKA-3557-no-output-parser.json")); + CompositeParser p = (CompositeParser) loader.get(Parser.class); + assertEquals(1, p.getAllComponentParsers().size()); + Parser parser = p.getAllComponentParsers().get(0); + // When _mime-include is used, the parser is wrapped in a ParserDecorator + ExternalParser externalParser = (parser instanceof ParserDecorator) + ? 
(ExternalParser) ((ParserDecorator) parser).getWrappedParser() + : (ExternalParser) parser; + + XMLResult xmlResult = getXML("example.xml", externalParser); + assertContains("<body>text/xml</body>", xmlResult.xml.replaceAll("[\r\n]", "")); + } + + @Test + public void testExifTool() throws Exception { + assumeTrue(org.apache.tika.utils.ProcessUtils.checkCommand(new String[]{"exiftool", + "-ver"})); + TikaLoader loader = TikaLoader.load(getConfigPath(getClass(), "TIKA-3557-exiftool-example.json")); + Parser p = loader.loadAutoDetectParser(); + //this was the smallest pdf we had + List<Metadata> metadataList = getRecursiveMetadata("testOverlappingText.pdf", p); + assertEquals(1, metadataList.size()); + Metadata m = metadataList.get(0); + assertEquals("application/pdf", m.get("mime")); + assertEquals("1", m.get("pages")); + assertEquals("1.4", m.get("pdf:version")); + } + + @Test + public void testFfmpegConfig() throws Exception { + TikaLoader loader = TikaLoader.load( + getConfigPath(getClass(), "external-parser-ffmpeg.json")); + CompositeParser p = (CompositeParser) loader.get(Parser.class); + assertEquals(1, p.getAllComponentParsers().size()); + Parser parser = p.getAllComponentParsers().get(0); + ExternalParser externalParser = (parser instanceof ParserDecorator) + ? 
(ExternalParser) ((ParserDecorator) parser).getWrappedParser() + : (ExternalParser) parser; + + ExternalParserConfig config = externalParser.getConfig(); + assertNotNull(config.getCheckCommandLine()); + assertEquals(List.of("ffmpeg", "-version"), config.getCheckCommandLine()); + assertEquals(List.of(126, 127), config.getCheckErrorCodes()); + assertNotNull(config.getStderrParser()); + assertEquals(RegexCaptureParser.class, config.getStderrParser().getClass()); + assertTrue(config.isReturnStderr()); + } + + @Test + public void testSoxConfig() throws Exception { + TikaLoader loader = TikaLoader.load( + getConfigPath(getClass(), "external-parser-sox.json")); + CompositeParser p = (CompositeParser) loader.get(Parser.class); + assertEquals(1, p.getAllComponentParsers().size()); + Parser parser = p.getAllComponentParsers().get(0); + ExternalParser externalParser = (parser instanceof ParserDecorator) + ? (ExternalParser) ((ParserDecorator) parser).getWrappedParser() + : (ExternalParser) parser; + + ExternalParserConfig config = externalParser.getConfig(); + assertNotNull(config.getCheckCommandLine()); + assertEquals(List.of("sox", "--version"), config.getCheckCommandLine()); + assertNotNull(config.getStderrParser()); + assertEquals(RegexCaptureParser.class, config.getStderrParser().getClass()); + } + + @Test + public void testStderrParserExtractsMetadata() throws Exception { + // Simulate what would happen with ffmpeg stderr output + RegexCaptureParser stderrParser = new RegexCaptureParser(); + // Build a config with a captureMap programmatically + String ffmpegStderr = " Duration: 00:02:30.50, start: 0.000000, bitrate: 706 kb/s\n" + + " Stream #0:0: Video: h264 (High), yuv420p, 1280x720, 25 fps\n" + + " Stream #0:1: Audio: aac, 44100 Hz, 2 channels, fltp\n"; + + // Use the regex-capture-parser with the same patterns from the ffmpeg config + java.util.Map<String, String> captureMap = new java.util.LinkedHashMap<>(); + captureMap.put("xmpDM:duration", 
"\\s*Duration:\\s*([0-9:\\.]+),.*"); + captureMap.put("xmpDM:audioSampleRate", + "\\s*Stream.*:.+Audio:.*,\\s+(\\d+)\\s+Hz,.*"); + + org.apache.tika.parser.RegexCaptureParserConfig regexConfig = + new org.apache.tika.parser.RegexCaptureParserConfig(); + regexConfig.setCaptureMap(captureMap); + RegexCaptureParser parser = new RegexCaptureParser(regexConfig); + + Metadata m = new Metadata(); + try (TikaInputStream tis = TikaInputStream.get( + ffmpegStderr.getBytes(StandardCharsets.UTF_8))) { + parser.parse(tis, new DefaultHandler(), m, new ParseContext()); + } + assertEquals("00:02:30.50", m.get("xmpDM:duration")); + assertEquals("44100", m.get("xmpDM:audioSampleRate")); + } + + @Test + public void testMultiExternalParsers() throws Exception { + assumeTrue(org.apache.tika.utils.ProcessUtils.checkCommand( + new String[]{"exiftool", "-ver"})); + assumeTrue(org.apache.tika.utils.ProcessUtils.checkCommand( + new String[]{"ffmpeg", "-version"})); + + TikaLoader loader = TikaLoader.load( + getConfigPath(getClass(), "external-parser-multi.json")); + CompositeParser composite = (CompositeParser) loader.get(Parser.class); + List<Parser> allParsers = composite.getAllComponentParsers(); + + // Should have two separate external parser instances + assertEquals(2, allParsers.size(), "Expected 2 external parsers but got " + + allParsers.size() + ": " + allParsers); + + // Test the exiftool parser on a PDF + Parser autoDetect = loader.loadAutoDetectParser(); + List<Metadata> metadataList = getRecursiveMetadata("testOverlappingText.pdf", autoDetect); + assertEquals(1, metadataList.size()); + Metadata m = metadataList.get(0); + assertEquals("application/pdf", m.get("exiftool:MIMEType")); + assertNotNull(m.get("exiftool:PageCount")); + assertNotNull(m.get("exiftool:PDFVersion")); + } +} diff --git a/tika-serialization/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java b/tika-serialization/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java deleted 
file mode 100644 index 35e2a7a898..0000000000 --- a/tika-serialization/src/test/java/org/apache/tika/parser/external2/ExternalParserTest.java +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.tika.parser.external2; - -import static org.junit.jupiter.api.Assertions.assertEquals; -import static org.junit.jupiter.api.Assumptions.assumeTrue; - -import java.nio.charset.StandardCharsets; -import java.util.List; - -import org.junit.jupiter.api.Test; -import org.xml.sax.ContentHandler; -import org.xml.sax.helpers.DefaultHandler; - -import org.apache.tika.TikaTest; -import org.apache.tika.config.loader.TikaLoader; -import org.apache.tika.io.TikaInputStream; -import org.apache.tika.metadata.Metadata; -import org.apache.tika.parser.CompositeParser; -import org.apache.tika.parser.ParseContext; -import org.apache.tika.parser.Parser; -import org.apache.tika.parser.ParserDecorator; -import org.apache.tika.parser.RegexCaptureParser; - -public class ExternalParserTest extends TikaTest { - - @Test - public void testConfigRegexCaptureParser() throws Exception { - assumeTrue(org.apache.tika.parser.external.ExternalParser.check(new String[]{ - "file", "--version" - })); - TikaLoader 
loader = TikaLoader.load(getConfigPath(getClass(), "TIKA-3557.json")); - CompositeParser p = (CompositeParser) loader.get(Parser.class); - assertEquals(1, p.getAllComponentParsers().size()); - Parser parser = p.getAllComponentParsers().get(0); - // When _mime-include is used, the parser is wrapped in a ParserDecorator - ExternalParser externalParser = (parser instanceof ParserDecorator) - ? (ExternalParser) ((ParserDecorator) parser).getWrappedParser() - : (ExternalParser) parser; - - Parser outputParser = externalParser.getOutputParser(); - assertEquals(RegexCaptureParser.class, outputParser.getClass()); - - Metadata m = new Metadata(); - ContentHandler contentHandler = new DefaultHandler(); - String output = "Something\n" + - "Title: the quick brown fox\n" + - "Author: jumped over\n" + - "Created: 10/20/2024"; - try (TikaInputStream tis = TikaInputStream.get(output.getBytes(StandardCharsets.UTF_8))) { - outputParser.parse(tis, contentHandler, m, new ParseContext()); - } - assertEquals("the quick brown fox", m.get("title")); - } - - @Test - public void testConfigBasic() throws Exception { - assumeTrue(org.apache.tika.parser.external.ExternalParser.check(new String[]{"file", "--version"})); - TikaLoader loader = TikaLoader.load(getConfigPath(getClass(), "TIKA-3557-no-output-parser.json")); - CompositeParser p = (CompositeParser) loader.get(Parser.class); - assertEquals(1, p.getAllComponentParsers().size()); - Parser parser = p.getAllComponentParsers().get(0); - // When _mime-include is used, the parser is wrapped in a ParserDecorator - ExternalParser externalParser = (parser instanceof ParserDecorator) - ? 
(ExternalParser) ((ParserDecorator) parser).getWrappedParser() - : (ExternalParser) parser; - - XMLResult xmlResult = getXML("example.xml", externalParser); - assertContains("<body>text/xml</body>", xmlResult.xml.replaceAll("[\r\n]", "")); - } - - @Test - public void testExifTool() throws Exception { - assumeTrue(org.apache.tika.parser.external.ExternalParser.check(new String[]{"exiftool", - "-ver"})); - TikaLoader loader = TikaLoader.load(getConfigPath(getClass(), "TIKA-3557-exiftool-example.json")); - Parser p = loader.loadAutoDetectParser(); - //this was the smallest pdf we had - List<Metadata> metadataList = getRecursiveMetadata("testOverlappingText.pdf", p); - assertEquals(1, metadataList.size()); - Metadata m = metadataList.get(0); - assertEquals("application/pdf", m.get("mime")); - assertEquals("1", m.get("pages")); - assertEquals("1.4", m.get("pdf:version")); - } -} diff --git a/tika-serialization/src/test/resources/configs/TIKA-3557-exiftool-example.json b/tika-serialization/src/test/resources/configs/TIKA-3557-exiftool-example.json index 683d6b0942..8e61008654 100644 --- a/tika-serialization/src/test/resources/configs/TIKA-3557-exiftool-example.json +++ b/tika-serialization/src/test/resources/configs/TIKA-3557-exiftool-example.json @@ -4,10 +4,9 @@ "external-parser": { "_mime-include": ["application/octet-stream"], "commandLine": ["exiftool", "${INPUT_FILE}"], - "checkExitValues": [0], "outputParser": { "regex-capture-parser": { - "matchMap": { + "captureMap": { "mime": "^MIME Type\\s+: ([^\\r\\n]+)", "pages": "^Page Count\\s+: ([^\\r\\n]+)", "pdf:version": "^PDF Version\\s+: ([^\\r\\n]+)" diff --git a/tika-serialization/src/test/resources/configs/TIKA-3557-exiftool-example.json b/tika-serialization/src/test/resources/configs/external-parser-exiftool.json similarity index 60% copy from tika-serialization/src/test/resources/configs/TIKA-3557-exiftool-example.json copy to tika-serialization/src/test/resources/configs/external-parser-exiftool.json index 
683d6b0942..a0bffc78e9 100644 --- a/tika-serialization/src/test/resources/configs/TIKA-3557-exiftool-example.json +++ b/tika-serialization/src/test/resources/configs/external-parser-exiftool.json @@ -2,12 +2,18 @@ "parsers": [ { "external-parser": { - "_mime-include": ["application/octet-stream"], + "_mime-include": [ + "video/avi", + "video/mpeg", + "video/x-msvideo", + "video/mp4" + ], "commandLine": ["exiftool", "${INPUT_FILE}"], - "checkExitValues": [0], + "checkCommandLine": ["exiftool", "-ver"], + "checkErrorCodes": [126, 127], "outputParser": { "regex-capture-parser": { - "matchMap": { + "captureMap": { "mime": "^MIME Type\\s+: ([^\\r\\n]+)", "pages": "^Page Count\\s+: ([^\\r\\n]+)", "pdf:version": "^PDF Version\\s+: ([^\\r\\n]+)" diff --git a/tika-serialization/src/test/resources/configs/external-parser-ffmpeg.json b/tika-serialization/src/test/resources/configs/external-parser-ffmpeg.json new file mode 100644 index 0000000000..0aa55a5fbc --- /dev/null +++ b/tika-serialization/src/test/resources/configs/external-parser-ffmpeg.json @@ -0,0 +1,34 @@ +{ + "parsers": [ + { + "external-parser": { + "_mime-include": [ + "video/avi", + "video/mpeg", + "video/x-msvideo" + ], + "commandLine": ["ffmpeg", "-i", "${INPUT_FILE}"], + "checkCommandLine": ["ffmpeg", "-version"], + "checkErrorCodes": [126, 127], + "returnStderr": true, + "maxStdErr": 20000, + "stderrParser": { + "regex-capture-parser": { + "captureMap": { + "xmpDM:audioSampleRate": "\\s*Stream.*:.+Audio:.*,\\s+(\\d+)\\s+Hz,.*", + "xmpDM:audioChannelType": "\\s*Stream.*:.+Audio:.*\\d+\\s+Hz,\\s+(\\d{1,2})\\s+channels.*", + "xmpDM:audioCompressor": "\\s*Stream.*:.+Audio:\\s+([A-Za-z0-9_\\(\\)/\\[\\] ]+),.*", + "xmpDM:duration": "\\s*Duration:\\s*([0-9:\\.]+),.*", + "xmpDM:fileDataRate": "\\s*Duration:.*,\\s*bitrate:\\s+([0-9A-Za-z/ ]+).*", + "xmpDM:videoColorSpace": "\\s*Stream.*:\\s+Video:\\s+[A-Za-z0-9\\(\\)/ ]+,\\s+([A-Za-z0-9\\(\\) ,]+),\\s+[0-9x]+,.*", + "xmpDM:videoCompressor": 
"\\s*Stream.*:\\s+Video:\\s+([A-Za-z0-9\\(\\)/ ]+),.*", + "xmpDM:videoFrameRate": "\\s*Stream.*:\\s+Video:.*,\\s+([0-9]+)\\s+fps,.*", + "encoder": "\\s*encoder\\s*\\:\\s*(\\w+).*", + "videoResolution": "\\s*Stream.*:\\s+Video:.*,\\s+([0-9x]+),.*" + } + } + } + } + } + ] +} diff --git a/tika-serialization/src/test/resources/configs/external-parser-multi.json b/tika-serialization/src/test/resources/configs/external-parser-multi.json new file mode 100644 index 0000000000..f1d5c5f59a --- /dev/null +++ b/tika-serialization/src/test/resources/configs/external-parser-multi.json @@ -0,0 +1,45 @@ +{ + "parsers": [ + { + "external-parser": { + "_mime-include": [ + "video/avi", + "video/mpeg", + "video/x-msvideo" + ], + "commandLine": ["ffmpeg", "-i", "${INPUT_FILE}"], + "checkCommandLine": ["ffmpeg", "-version"], + "checkErrorCodes": [126, 127], + "returnStderr": true, + "maxStdErr": 20000, + "stderrParser": { + "regex-capture-parser": { + "captureMap": { + "xmpDM:duration": "\\s*Duration:\\s*([0-9:\\.]+),.*", + "xmpDM:audioSampleRate": "\\s*Stream.*:.+Audio:.*,\\s+(\\d+)\\s+Hz,.*" + } + } + } + } + }, + { + "external-parser": { + "_mime-include": [ + "application/pdf" + ], + "commandLine": ["exiftool", "${INPUT_FILE}"], + "checkCommandLine": ["exiftool", "-ver"], + "checkErrorCodes": [126, 127], + "outputParser": { + "regex-capture-parser": { + "captureMap": { + "exiftool:MIMEType": "^MIME Type\\s+: ([^\\r\\n]+)", + "exiftool:PageCount": "^Page Count\\s+: ([^\\r\\n]+)", + "exiftool:PDFVersion": "^PDF Version\\s+: ([^\\r\\n]+)" + } + } + } + } + } + ] +} diff --git a/tika-serialization/src/test/resources/configs/external-parser-sox.json b/tika-serialization/src/test/resources/configs/external-parser-sox.json new file mode 100644 index 0000000000..b8ae108d71 --- /dev/null +++ b/tika-serialization/src/test/resources/configs/external-parser-sox.json @@ -0,0 +1,36 @@ +{ + "parsers": [ + { + "external-parser": { + "_mime-include": [ + "audio/mpeg", + "audio/mp3", + "audio/wav", + 
"audio/x-wav", + "audio/ogg", + "audio/vorbis", + "audio/mp4" + ], + "commandLine": ["sox", "--info", "${INPUT_FILE}"], + "checkCommandLine": ["sox", "--version"], + "checkErrorCodes": [126, 127], + "returnStderr": true, + "maxStdErr": 10000, + "stderrParser": { + "regex-capture-parser": { + "captureMap": { + "xmpDM:audioChannelType": "\\s*Channels.*:\\s+(\\d+)\\s*", + "xmpDM:audioSampleRate": "\\s*Sample Rate.*:\\s+(\\d+)\\s*", + "xmpDM:audioSampleType": "\\s*Precision.*:\\s+([\\d\\w-]+)\\s*", + "xmpDM:duration": "\\s*Duration.*:\\s+([\\d:\\.]+)\\s*", + "File Size": "\\s*File Size.*:\\s+([\\d\\w]+)\\s*", + "xmpDM:fileDataRate": "\\s*Bit Rate.*:\\s+([\\d\\w]+)\\s*", + "Sample Encoding": "\\s*Sample Encoding.*:\\s+(.*)\\s*", + "xmpDM:logComment": "\\s*Comment.*:\\s+(.*)\\s*" + } + } + } + } + } + ] +} diff --git a/tika-server/tika-server-standard/src/main/resources/META-INF/services/org.apache.tika.parser.Parser b/tika-server/tika-server-standard/src/main/resources/META-INF/services/org.apache.tika.parser.Parser deleted file mode 100644 index 37f87a4595..0000000000 --- a/tika-server/tika-server-standard/src/main/resources/META-INF/services/org.apache.tika.parser.Parser +++ /dev/null @@ -1,16 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -org.apache.tika.parser.external.CompositeExternalParser \ No newline at end of file diff --git a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaParsersTest.java b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaParsersTest.java index 1eae7ba45a..e437aa4823 100644 --- a/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaParsersTest.java +++ b/tika-server/tika-server-standard/src/test/java/org/apache/tika/server/standard/TikaParsersTest.java @@ -144,7 +144,7 @@ public class TikaParsersTest extends CXFTestBase { assertEquals("org.apache.tika.parser.CompositeParser", json.get("name")); assertEquals(Boolean.TRUE, json.get("composite")); - // At least 20 child parsers which aren't composite, except for CompositeExternalParser + // At least 20 child parsers which aren't composite List<Object> wrapper = (List) json.get("children"); Map<String, Object> firstItem = (Map) wrapper.get(0); List<Object> children = (List) firstItem.get("children"); @@ -191,7 +191,7 @@ public class TikaParsersTest extends CXFTestBase { assertEquals(true, hasOOXML); assertEquals(true, hasZip); assertTrue(nonComposite > 20); - assertTrue(composite == 0 || composite == 1); // if CompositeExternalParser is available it will be 1 + assertEquals(0, composite); } } }
