This is an automated email from the ASF dual-hosted git repository. jamesnetherton pushed a commit to branch main in repository https://gitbox.apache.org/repos/asf/camel-quarkus.git
commit 43641cdbf69f19e9bdd08185dc0d229a89e4c198 Author: James Netherton <jamesnether...@gmail.com> AuthorDate: Tue Mar 8 10:50:20 2022 +0000 Work around Tika version incompatibilities between Quarkus Tika & Camel Tika #3599 --- extensions/tika/runtime/pom.xml | 11 ++-- .../camel/quarkus/component/tika/TikaRecorder.java | 33 +++++++++- .../tika/graalvm/TikaProducerSubstitutions.java | 77 ++++++++++++++++++++++ 3 files changed, 115 insertions(+), 6 deletions(-) diff --git a/extensions/tika/runtime/pom.xml b/extensions/tika/runtime/pom.xml index 3470945..74ae8a2 100644 --- a/extensions/tika/runtime/pom.xml +++ b/extensions/tika/runtime/pom.xml @@ -58,11 +58,7 @@ <exclusions> <exclusion> <groupId>org.apache.tika</groupId> - <artifactId>tika-core</artifactId> - </exclusion> - <exclusion> - <groupId>org.apache.tika</groupId> - <artifactId>tika-parsers</artifactId> + <artifactId>*</artifactId> </exclusion> </exclusions> </dependency> @@ -74,6 +70,11 @@ <groupId>io.quarkiverse.tika</groupId> <artifactId>quarkus-tika</artifactId> </dependency> + <dependency> + <groupId>org.graalvm.nativeimage</groupId> + <artifactId>svm</artifactId> + <scope>provided</scope> + </dependency> </dependencies> <build> diff --git a/extensions/tika/runtime/src/main/java/org/apache/camel/quarkus/component/tika/TikaRecorder.java b/extensions/tika/runtime/src/main/java/org/apache/camel/quarkus/component/tika/TikaRecorder.java index 6d6760b..c5ea87f 100644 --- a/extensions/tika/runtime/src/main/java/org/apache/camel/quarkus/component/tika/TikaRecorder.java +++ b/extensions/tika/runtime/src/main/java/org/apache/camel/quarkus/component/tika/TikaRecorder.java @@ -18,9 +18,14 @@ package org.apache.camel.quarkus.component.tika; import java.io.IOException; import java.io.InputStream; +import java.io.OutputStream; +import java.io.OutputStreamWriter; +import java.io.UnsupportedEncodingException; import java.util.Collections; import java.util.Set; +import javax.xml.transform.TransformerConfigurationException; 
+ import org.xml.sax.ContentHandler; import org.xml.sax.SAXException; @@ -36,12 +41,14 @@ import org.apache.camel.Producer; import org.apache.camel.component.tika.TikaComponent; import org.apache.camel.component.tika.TikaConfiguration; import org.apache.camel.component.tika.TikaEndpoint; +import org.apache.camel.component.tika.TikaParseOutputFormat; import org.apache.camel.component.tika.TikaProducer; import org.apache.tika.exception.TikaException; import org.apache.tika.metadata.Metadata; import org.apache.tika.mime.MediaType; import org.apache.tika.parser.ParseContext; import org.apache.tika.parser.Parser; +import org.apache.tika.parser.html.BoilerpipeContentHandler; @Recorder public class TikaRecorder { @@ -78,7 +85,7 @@ public class TikaRecorder { @Override public Producer createProducer() throws Exception { TikaParser tikaParser = tikaParserProducer.tikaParser(); - return new TikaProducer(this, new Parser() { + return new QuarkusTikaProducer(this, new Parser() { @Override public Set<MediaType> getSupportedTypes(ParseContext parseContext) { return Collections.emptySet(); @@ -99,4 +106,28 @@ public class TikaRecorder { } } + // TODO: Remove this when Camel Tika & Quarkus Tika versions are aligned + // https://github.com/apache/camel-quarkus/issues/3599 + static class QuarkusTikaProducer extends TikaProducer { + + public QuarkusTikaProducer(TikaEndpoint endpoint) { + super(endpoint); + } + + public QuarkusTikaProducer(TikaEndpoint endpoint, Parser parser) { + super(endpoint, parser); + } + + @Override + protected ContentHandler getContentHandler(TikaConfiguration configuration, OutputStream outputStream) + throws TransformerConfigurationException, UnsupportedEncodingException { + TikaParseOutputFormat outputFormat = configuration.getTikaParseOutputFormat(); + if (outputFormat.equals(TikaParseOutputFormat.textMain)) { + return new BoilerpipeContentHandler( + new OutputStreamWriter(outputStream, configuration.getTikaParseOutputEncoding())); + } + return 
super.getContentHandler(configuration, outputStream); + } + } + } diff --git a/extensions/tika/runtime/src/main/java/org/apache/camel/quarkus/component/tika/graalvm/TikaProducerSubstitutions.java b/extensions/tika/runtime/src/main/java/org/apache/camel/quarkus/component/tika/graalvm/TikaProducerSubstitutions.java new file mode 100644 index 0000000..343edae --- /dev/null +++ b/extensions/tika/runtime/src/main/java/org/apache/camel/quarkus/component/tika/graalvm/TikaProducerSubstitutions.java @@ -0,0 +1,77 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.camel.quarkus.component.tika.graalvm;
+
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.UnsupportedEncodingException;
+
+import javax.xml.transform.TransformerConfigurationException;
+import javax.xml.transform.sax.TransformerHandler;
+
+import org.xml.sax.ContentHandler;
+
+import com.oracle.svm.core.annotate.Alias;
+import com.oracle.svm.core.annotate.Substitute;
+import com.oracle.svm.core.annotate.TargetClass;
+import org.apache.camel.component.tika.TikaConfiguration;
+import org.apache.camel.component.tika.TikaParseOutputFormat;
+import org.apache.camel.component.tika.TikaProducer;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ExpandedTitleContentHandler;
+
+// TODO: Remove this when Camel Tika & Quarkus Tika versions are aligned
+// https://github.com/apache/camel-quarkus/issues/3599
+@TargetClass(TikaProducer.class)
+public final class TikaProducerSubstitutions {
+
+    @Alias
+    private String encoding;
+
+    // Removes problematic textMain switch case since it's covered in the custom TikaProducer in TikaRecorder
+    @Substitute
+    private ContentHandler getContentHandler(TikaConfiguration configuration, OutputStream outputStream)
+            throws TransformerConfigurationException, UnsupportedEncodingException {
+
+        ContentHandler result = null;
+
+        TikaParseOutputFormat outputFormat = configuration.getTikaParseOutputFormat();
+        switch (outputFormat) {
+        case xml:
+            result = getTransformerHandler(outputStream, "xml", true);
+            break;
+        case text:
+            result = new BodyContentHandler(new OutputStreamWriter(outputStream, this.encoding));
+            break;
+        case html:
+            result = new ExpandedTitleContentHandler(getTransformerHandler(outputStream, "html", true));
+            break;
+        default:
+            throw new IllegalArgumentException(
+                    String.format("Unknown format %s", configuration.getTikaParseOutputFormat()));
+        }
+        return result;
+    }
+
+    @Alias
+    private TransformerHandler getTransformerHandler(
+            OutputStream output, String method,
+            boolean prettyPrint)
+            throws TransformerConfigurationException, UnsupportedEncodingException {
+        return null;
+    }
+}