CAMEL-10740 - Code cleanup and encoding support. Project: http://git-wip-us.apache.org/repos/asf/camel/repo Commit: http://git-wip-us.apache.org/repos/asf/camel/commit/c73068e7 Tree: http://git-wip-us.apache.org/repos/asf/camel/tree/c73068e7 Diff: http://git-wip-us.apache.org/repos/asf/camel/diff/c73068e7
Branch: refs/heads/master Commit: c73068e7d42f5f8a83b218463389383d6fb26837 Parents: 17c83ba Author: Bob Paulin <b...@bobpaulin.com> Authored: Sat Jan 28 23:58:12 2017 -0600 Committer: Claus Ibsen <davscl...@apache.org> Committed: Sun Jan 29 17:06:27 2017 +0100 ---------------------------------------------------------------------- components/camel-tika/pom.xml | 141 +++++++++---------- .../src/main/docs/tika-component.adoc | 8 +- .../camel/component/tika/TikaConfiguration.java | 26 +++- .../camel/component/tika/TikaEndpoint.java | 2 +- .../camel/component/tika/TikaProducer.java | 38 ++--- .../camel/component/tika/TikaParseTest.java | 67 ++++++++- .../src/test/resources/testOpenOffice2.odt | Bin 0 -> 26460 bytes 7 files changed, 175 insertions(+), 107 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/camel/blob/c73068e7/components/camel-tika/pom.xml ---------------------------------------------------------------------- diff --git a/components/camel-tika/pom.xml b/components/camel-tika/pom.xml index 86f0131..6233b9f 100644 --- a/components/camel-tika/pom.xml +++ b/components/camel-tika/pom.xml @@ -15,80 +15,79 @@ See the License for the specific language governing permissions and limitations under the License. --> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd"> - <modelVersion>4.0.0</modelVersion> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache.camel</groupId> + <artifactId>components</artifactId> + <version>2.19.0-SNAPSHOT</version> + </parent> - <parent> - <groupId>org.apache.camel</groupId> - <artifactId>components</artifactId> - <version>2.19.0-SNAPSHOT</version> - </parent> + <artifactId>camel-tika</artifactId> + <packaging>jar</packaging> + <name>Camel :: Tika</name> + <description>This component integrates with Apache Tika to extract content and metadata from thousands of file types.</description> - <artifactId>camel-tika</artifactId> - <packaging>jar</packaging> - <name>Camel :: Tika</name> - <description>This component integrates with Apache Tika to extract content and metadata from thousands of file types.</description> + <properties> + <camel.osgi.export.pkg>org.apache.camel.component.tika.*</camel.osgi.export.pkg> + <camel.osgi.export.service>org.apache.camel.spi.ComponentResolver;component=tika</camel.osgi.export.service> + </properties> - <properties> - <camel.osgi.export.pkg>org.apache.camel.component.tika.*</camel.osgi.export.pkg> - <camel.osgi.export.service>org.apache.camel.spi.ComponentResolver;component=tika</camel.osgi.export.service> - </properties> - - <dependencies> - - <dependency> - <groupId>org.apache.camel</groupId> - <artifactId>camel-core</artifactId> - </dependency> - <dependency> - <groupId>org.apache.tika</groupId> - <artifactId>tika-core</artifactId> - <version>${tika-version}</version> - </dependency> - <dependency> - <groupId>org.apache.tika</groupId> - <artifactId>tika-parsers</artifactId> - <version>${tika-version}</version> - </dependency> - <!-- test dependencies --> - <dependency> - <groupId>org.apache.camel</groupId> - <artifactId>camel-test-spring</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>org.apache.logging.log4j</groupId> - <artifactId>log4j-api</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>org.apache.logging.log4j</groupId> - <artifactId>log4j-core</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>org.apache.logging.log4j</groupId> - <artifactId>log4j-slf4j-impl</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>junit</groupId> - <artifactId>junit</artifactId> - <scope>test</scope> - </dependency> - <dependency> - <groupId>commons-io</groupId> - <artifactId>commons-io</artifactId> - <version>${commons-io-version}</version> - <scope>test</scope> - </dependency> - <dependency> - <groupId>org.hamcrest</groupId> - <artifactId>java-hamcrest</artifactId> - <version>${hamcrest-version}</version> - <scope>test</scope> - </dependency> - </dependencies> + <dependencies> + <dependency> + <groupId>org.apache.camel</groupId> + <artifactId>camel-core</artifactId> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-core</artifactId> + <version>${tika-version}</version> + </dependency> + <dependency> + <groupId>org.apache.tika</groupId> + <artifactId>tika-parsers</artifactId> + <version>${tika-version}</version> + </dependency> + <!-- test dependencies --> + <dependency> + <groupId>org.apache.camel</groupId> + <artifactId>camel-test-spring</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-api</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-core</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.logging.log4j</groupId> + <artifactId>log4j-slf4j-impl</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <scope>test</scope> + </dependency> + <dependency> + <groupId>commons-io</groupId> + <artifactId>commons-io</artifactId> + <version>${commons-io-version}</version> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.hamcrest</groupId> + <artifactId>java-hamcrest</artifactId> + <version>${hamcrest-version}</version> + <scope>test</scope> + </dependency> + </dependencies> </project> http://git-wip-us.apache.org/repos/asf/camel/blob/c73068e7/components/camel-tika/src/main/docs/tika-component.adoc ---------------------------------------------------------------------- diff --git a/components/camel-tika/src/main/docs/tika-component.adoc b/components/camel-tika/src/main/docs/tika-component.adoc index 7049a59..f077452 100644 --- a/components/camel-tika/src/main/docs/tika-component.adoc +++ b/components/camel-tika/src/main/docs/tika-component.adoc @@ -41,7 +41,7 @@ The Tika component has no options. // endpoint options: START -The Tika component supports 5 endpoint options which are listed below: +The Tika component supports 6 endpoint options which are listed below: {% raw %} [width="100%",cols="2,1,1m,1m,5",options="header"] @@ -49,8 +49,9 @@ The Tika component supports 5 endpoint options which are listed below: | Name | Group | Default | Java Type | Description | operation | producer | | TikaOperation | *Required* Tika Operation. parse or detect | tikaConfig | producer | | TikaConfig | Tika Config -| tikaConfigUri | producer | | String | Tika Config Uri -| tikaParseOutputFormat | producer | xml | TikaParseOutputFormat | Tika Output Format. Supported output formats are xml html text textMain +| tikaConfigUri | producer | | String | Tika Config Uri: The URI of tika-config.xml +| tikaParseOutputEncoding | producer | | String | Tika Parse Output Encoding - Used to specify the character encoding of the parsed output. Defaults to Charset.defaultCharset() . +| tikaParseOutputFormat | producer | xml | TikaParseOutputFormat | Tika Output Format. Supported output formats. xml: Returns Parsed Content as XML. html: Returns Parsed Content as HTML. text: Returns Parsed Content as Text. textMain: Uses the boilerpipe library to automatically extract the main content from a web page. | synchronous | advanced | false | boolean | Sets whether synchronous processing should be strictly used or Camel is allowed to use asynchronous processing (if supported). |======================================================================= {% endraw %} @@ -61,7 +62,6 @@ The Tika component supports 5 endpoint options which are listed below: [width="100%",cols="10%,90%",options="header",] |======================================================================= |Header |Description -|TikaXXXX | Any Tika Metadata Header is converted to a Camel Header with Prefix Tika |======================================================================= ### To Detect a file's MIME Type http://git-wip-us.apache.org/repos/asf/camel/blob/c73068e7/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaConfiguration.java ---------------------------------------------------------------------- diff --git a/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaConfiguration.java b/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaConfiguration.java index 051ad2a..33542c0 100644 --- a/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaConfiguration.java +++ b/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaConfiguration.java @@ -17,6 +17,7 @@ package org.apache.camel.component.tika; import java.io.IOException; +import java.nio.charset.Charset; import org.xml.sax.SAXException; @@ -36,6 +37,8 @@ public class TikaConfiguration { private TikaOperation operation; @UriParam(defaultValue = "xml") private TikaParseOutputFormat tikaParseOutputFormat = TikaParseOutputFormat.xml; + @UriParam(description = "Tika Parse Output Encoding") + private String tikaParseOutputEncoding = Charset.defaultCharset().name(); @UriParam(description = "Tika Config") private TikaConfig tikaConfig = TikaConfig.getDefaultConfig(); @UriParam(description = "Tika Config Url") @@ -64,12 +67,31 @@ public class TikaConfiguration { /** * - * Tika Output Format. Supported output formats are xml, html, text, textMain + * Tika Output Format. Supported output formats. + * <ul> + * <li>xml: Returns Parsed Content as XML. </li> + * <li>html: Returns Parsed Content as HTML. </li> + * <li>text: Returns Parsed Content as Text. </li> + * <li>textMain: Uses the <a href="http://code.google.com/p/boilerpipe/">boilerpipe</a> library to automatically extract the main content from a web page. </li> + * </ul> * */ public void setTikaParseOutputFormat(TikaParseOutputFormat tikaParseOutputFormat) { this.tikaParseOutputFormat = tikaParseOutputFormat; } + + public String getTikaParseOutputEncoding() { + return tikaParseOutputEncoding; + } + + /** + * Tika Parse Output Encoding - Used to specify the character encoding of the parsed output. + * Defaults to Charset.defaultCharset() . + * + */ + public void setTikaParseOutputEncoding(String tikaParseOutputEncoding) { + this.tikaParseOutputEncoding = tikaParseOutputEncoding; + } public TikaConfig getTikaConfig() { return tikaConfig; @@ -90,7 +112,7 @@ public class TikaConfiguration { /** * - * Tika Config Uri + * Tika Config Uri: The URI of tika-config.xml * */ public void setTikaConfigUri(String tikaConfigUri) throws TikaException, IOException, SAXException { http://git-wip-us.apache.org/repos/asf/camel/blob/c73068e7/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaEndpoint.java ---------------------------------------------------------------------- diff --git a/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaEndpoint.java b/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaEndpoint.java index cb8fbdd..a1701d3 100644 --- a/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaEndpoint.java +++ b/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaEndpoint.java @@ -24,7 +24,7 @@ import org.apache.camel.impl.DefaultEndpoint; import org.apache.camel.spi.UriEndpoint; import org.apache.camel.spi.UriParam; -@UriEndpoint(scheme = "tika", title = "Tika", syntax = "tika:operation", producerOnly = true, label = "tika") +@UriEndpoint(scheme = "tika", title = "Tika", syntax = "tika:operation", producerOnly = true, label = "transformation") public class TikaEndpoint extends DefaultEndpoint { @UriParam http://git-wip-us.apache.org/repos/asf/camel/blob/c73068e7/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaProducer.java ---------------------------------------------------------------------- diff --git a/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaProducer.java b/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaProducer.java index 1e0d9ca..309df98 100644 --- a/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaProducer.java +++ b/components/camel-tika/src/main/java/org/apache/camel/component/tika/TikaProducer.java @@ -22,10 +22,6 @@ import java.io.InputStream; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.UnsupportedEncodingException; -import java.io.Writer; -import java.nio.charset.Charset; -import java.nio.charset.StandardCharsets; -import java.util.Locale; import javax.xml.transform.OutputKeys; import javax.xml.transform.TransformerConfigurationException; @@ -57,10 +53,13 @@ public class TikaProducer extends DefaultProducer { private final Parser parser; private final Detector detector; + + private final String encoding; public TikaProducer(TikaEndpoint endpoint) { super(endpoint); this.tikaConfiguration = endpoint.getTikaConfiguration(); + this.encoding = this.tikaConfiguration.getTikaParseOutputEncoding(); TikaConfig config = this.tikaConfiguration.getTikaConfig(); this.parser = new AutoDetectParser(config); this.detector = config.getDetector(); @@ -111,7 +110,7 @@ public class TikaProducer extends DefaultProducer { private void convertMetadataToHeaders(Metadata metadata, Exchange exchange) { if (metadata != null) { for (String metaname : metadata.names()) { - exchange.getIn().setHeader("Tika" + metaname, metadata.get(metaname)); + exchange.getIn().setHeader(metaname, metadata.get(metaname)); } } } @@ -122,19 +121,18 @@ public class TikaProducer extends DefaultProducer { ContentHandler result = null; TikaParseOutputFormat outputFormat = configuration.getTikaParseOutputFormat(); - String encoding = Charset.defaultCharset().name(); switch (outputFormat) { case xml: - result = getTransformerHandler(outputStream, "xml", encoding, true); + result = getTransformerHandler(outputStream, "xml", true); break; case text: - result = new BodyContentHandler(outputStream); + result = new BodyContentHandler(new OutputStreamWriter(outputStream, this.encoding)); break; case textMain: - result = new BoilerpipeContentHandler(getOutputWriter(outputStream, encoding)); + result = new BoilerpipeContentHandler(new OutputStreamWriter(outputStream, this.encoding)); break; case html: - result = new ExpandedTitleContentHandler(getTransformerHandler(outputStream, "html", encoding, true)); + result = new ExpandedTitleContentHandler(getTransformerHandler(outputStream, "html", true)); break; default: throw new IllegalArgumentException( @@ -143,26 +141,16 @@ public class TikaProducer extends DefaultProducer { return result; } - private TransformerHandler getTransformerHandler(OutputStream output, String method, String encoding, - boolean prettyPrint) throws TransformerConfigurationException { + private TransformerHandler getTransformerHandler(OutputStream output, String method, + boolean prettyPrint) throws TransformerConfigurationException, UnsupportedEncodingException { SAXTransformerFactory factory = (SAXTransformerFactory) SAXTransformerFactory.newInstance(); TransformerHandler handler = factory.newTransformerHandler(); handler.getTransformer().setOutputProperty(OutputKeys.METHOD, method); handler.getTransformer().setOutputProperty(OutputKeys.INDENT, prettyPrint ? "yes" : "no"); - if (encoding != null) { - handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, encoding); + if (this.encoding != null) { + handler.getTransformer().setOutputProperty(OutputKeys.ENCODING, this.encoding); } - handler.setResult(new StreamResult(output)); + handler.setResult(new StreamResult(new OutputStreamWriter(output, this.encoding))); return handler; } - - private Writer getOutputWriter(OutputStream output, String encoding) throws UnsupportedEncodingException { - if (encoding != null) { - return new OutputStreamWriter(output, encoding); - } else if (System.getProperty("os.name").toLowerCase(Locale.ROOT).startsWith("mac os x")) { - return new OutputStreamWriter(output, StandardCharsets.UTF_8); - } else { - return new OutputStreamWriter(output, Charset.defaultCharset()); - } - } } http://git-wip-us.apache.org/repos/asf/camel/blob/c73068e7/components/camel-tika/src/test/java/org/apache/camel/component/tika/TikaParseTest.java ---------------------------------------------------------------------- diff --git a/components/camel-tika/src/test/java/org/apache/camel/component/tika/TikaParseTest.java b/components/camel-tika/src/test/java/org/apache/camel/component/tika/TikaParseTest.java index dc6d97e..1db2a8d 100644 --- a/components/camel-tika/src/test/java/org/apache/camel/component/tika/TikaParseTest.java +++ b/components/camel-tika/src/test/java/org/apache/camel/component/tika/TikaParseTest.java @@ -16,7 +16,15 @@ */ package org.apache.camel.component.tika; +import java.io.ByteArrayInputStream; import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.Reader; +import java.io.StringReader; +import java.io.UnsupportedEncodingException; +import java.nio.charset.Charset; +import java.nio.charset.StandardCharsets; import java.util.Map; import org.apache.camel.EndpointInject; @@ -26,7 +34,11 @@ import org.apache.camel.builder.RouteBuilder; import org.apache.camel.component.mock.MockEndpoint; import org.apache.camel.impl.JndiRegistry; import org.apache.camel.test.junit4.CamelTestSupport; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.parser.txt.UniversalEncodingDetector; import org.junit.Test; +import org.mozilla.universalchardet.UniversalDetector; + import static org.hamcrest.Matchers.*; public class TikaParseTest extends CamelTestSupport { @@ -48,8 +60,54 @@ public class TikaParseTest extends CamelTestSupport { Object body = exchange.getIn().getBody(String.class); Map<String, Object> headerMap = exchange.getIn().getHeaders(); assertThat(body, instanceOf(String.class)); + + Charset detectedCharset = null; + try { + InputStream bodyIs = new ByteArrayInputStream(((String)body).getBytes()); + UniversalEncodingDetector encodingDetector = new UniversalEncodingDetector(); + detectedCharset = encodingDetector.detect(bodyIs, new Metadata()); + } catch (IOException e1) { + fail(); + } + + + assertThat(detectedCharset.name(), startsWith(Charset.defaultCharset().name())); + assertThat((String) body, containsString("test")); - assertThat(headerMap.get("TikaContent-Type"), equalTo("application/msword")); + assertThat(headerMap.get(Exchange.CONTENT_TYPE), equalTo("application/msword")); + return true; + } + }); + resultEndpoint.assertIsSatisfied(); + } + + @Test + public void testDocumentParseWithEncoding() throws Exception { + + File document = new File("src/test/resources/testOpenOffice2.odt"); + template.sendBody("direct:start4", document); + + resultEndpoint.setExpectedMessageCount(1); + + resultEndpoint.expectedMessagesMatches(new Predicate() { + @Override + public boolean matches(Exchange exchange) { + Object body = exchange.getIn().getBody(String.class); + Map<String, Object> headerMap = exchange.getIn().getHeaders(); + assertThat(body, instanceOf(String.class)); + + Charset detectedCharset = null; + try { + InputStream bodyIs = new ByteArrayInputStream(((String)body).getBytes(StandardCharsets.UTF_16)); + UniversalEncodingDetector encodingDetector = new UniversalEncodingDetector(); + detectedCharset = encodingDetector.detect(bodyIs, new Metadata()); + } catch (IOException e1) { + fail(); + } + + + assertThat(detectedCharset.name(), startsWith(StandardCharsets.UTF_16.name())); + assertThat(headerMap.get(Exchange.CONTENT_TYPE), equalTo("application/vnd.oasis.opendocument.text")); return true; } }); @@ -70,7 +128,7 @@ public class TikaParseTest extends CamelTestSupport { Map<String, Object> headerMap = exchange.getIn().getHeaders(); assertThat(body, instanceOf(String.class)); assertThat((String) body, containsString("<body/>")); - assertThat(headerMap.get("TikaContent-Type"), equalTo("image/gif")); + assertThat(headerMap.get(Exchange.CONTENT_TYPE), equalTo("image/gif")); return true; } }); @@ -91,7 +149,7 @@ public class TikaParseTest extends CamelTestSupport { Map<String, Object> headerMap = exchange.getIn().getHeaders(); assertThat(body, instanceOf(String.class)); assertThat((String) body, containsString("<body/>")); - assertThat(headerMap.get("TikaContent-Type"), equalTo("application/msword")); + assertThat(headerMap.get(Exchange.CONTENT_TYPE), equalTo("application/msword")); return true; } }); @@ -112,7 +170,7 @@ public class TikaParseTest extends CamelTestSupport { Map<String, Object> headerMap = exchange.getIn().getHeaders(); assertThat(body, instanceOf(String.class)); assertThat((String) body, containsString("<body/>")); - assertThat(headerMap.get("TikaContent-Type"), equalTo("application/msword")); + assertThat(headerMap.get(Exchange.CONTENT_TYPE), equalTo("application/msword")); return true; } }); @@ -128,6 +186,7 @@ public class TikaParseTest extends CamelTestSupport { from("direct:start2").to("tika:parse?tikaConfigUri=src/test/resources/tika-empty.xml") .to("mock:result"); from("direct:start3").to("tika:parse?tikaConfig=#testConfig").to("mock:result"); + from("direct:start4").to("tika:parse?tikaParseOutputEncoding=" + StandardCharsets.UTF_16.name()).to("mock:result"); } }; } http://git-wip-us.apache.org/repos/asf/camel/blob/c73068e7/components/camel-tika/src/test/resources/testOpenOffice2.odt ---------------------------------------------------------------------- diff --git a/components/camel-tika/src/test/resources/testOpenOffice2.odt b/components/camel-tika/src/test/resources/testOpenOffice2.odt new file mode 100644 index 0000000..0b1bb11 Binary files /dev/null and b/components/camel-tika/src/test/resources/testOpenOffice2.odt differ