This is an automated email from the ASF dual-hosted git repository. davsclaus pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/camel.git
The following commit(s) were added to refs/heads/master by this push: new 0e36d91 Bindy unicode patch (#3552) 0e36d91 is described below commit 0e36d91863957ff0e5d2fc7c92726c0243bfddd9 Author: mgr-lhm <externer.dl.greul...@muenchen.de> AuthorDate: Tue Feb 11 10:11:28 2020 +0100 Bindy unicode patch (#3552) CAMEL-14521: Added Unicode support to bindy fixed length format via icu4j. --- components/camel-bindy/pom.xml | 8 +- .../dataformat/bindy/BindyFixedLengthFactory.java | 17 +- .../camel/dataformat/bindy/UnicodeHelper.java | 196 +++++++++++++++++ .../bindy/annotation/FixedLengthRecord.java | 5 + .../camel/dataformat/bindy/UnicodeHelperTest.java | 232 +++++++++++++++++++++ parent/pom.xml | 1 + 6 files changed, 454 insertions(+), 5 deletions(-) diff --git a/components/camel-bindy/pom.xml b/components/camel-bindy/pom.xml index 33aa833..e0f84ba 100644 --- a/components/camel-bindy/pom.xml +++ b/components/camel-bindy/pom.xml @@ -31,15 +31,17 @@ <name>Camel :: Bindy</name> <description>Camel Bindy data format support</description> - <properties> - </properties> - <dependencies> <dependency> <groupId>org.apache.camel</groupId> <artifactId>camel-support</artifactId> </dependency> + <dependency> + <groupId>com.ibm.icu</groupId> + <artifactId>icu4j</artifactId> + <version>${icu4j-version}</version> + </dependency> <!-- testing --> <dependency> diff --git a/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/BindyFixedLengthFactory.java b/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/BindyFixedLengthFactory.java index f14f4d5..f5100ff 100644 --- a/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/BindyFixedLengthFactory.java +++ b/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/BindyFixedLengthFactory.java @@ -69,6 +69,7 @@ public class BindyFixedLengthFactory extends BindyAbstractFactory implements Bin private int recordLength; private boolean ignoreTrailingChars; private boolean 
ignoreMissingChars; + private boolean countGrapheme; private Class<?> header; private Class<?> footer; @@ -161,7 +162,7 @@ public class BindyFixedLengthFactory extends BindyAbstractFactory implements Bin // noop } - public void bind(CamelContext camelContext, String record, Map<String, Object> model, int line) throws Exception { + public void bind(CamelContext camelContext, String recordStr, Map<String, Object> model, int line) throws Exception { int pos = 1; int counterMandatoryFields = 0; @@ -171,6 +172,8 @@ public class BindyFixedLengthFactory extends BindyAbstractFactory implements Bin int length; String delimiter; Field field; + + final UnicodeHelper record = new UnicodeHelper(recordStr, (this.countGrapheme) ? UnicodeHelper.Method.GRAPHEME : UnicodeHelper.Method.CODEPOINTS); // Iterate through the list of positions // defined in the @DataField @@ -217,7 +220,7 @@ public class BindyFixedLengthFactory extends BindyAbstractFactory implements Bin } offset += length; } else if (!delimiter.equals("")) { - String tempToken = record.substring(offset - 1, record.length()); + final UnicodeHelper tempToken = new UnicodeHelper(record.substring(offset - 1, record.length()), (this.countGrapheme) ? UnicodeHelper.Method.GRAPHEME : UnicodeHelper.Method.CODEPOINTS); token = tempToken.substring(0, tempToken.indexOf(delimiter)); // include the delimiter in the offset calculation offset += token.length() + 1; @@ -604,6 +607,9 @@ public class BindyFixedLengthFactory extends BindyAbstractFactory implements Bin ignoreMissingChars = record.ignoreMissingChars(); LOG.debug("Enable ignore missing chars: {}", ignoreMissingChars); + + countGrapheme = record.countGrapheme(); + LOG.debug("Enable grapheme counting instead of codepoints: {}", countGrapheme); } } @@ -712,4 +718,11 @@ public class BindyFixedLengthFactory extends BindyAbstractFactory implements Bin return ignoreMissingChars; } + /** + * Flag indicating whether graphemes or codepoints are counted. 
+ */ + public boolean isCountGrapheme() { + return countGrapheme; + } + } diff --git a/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/UnicodeHelper.java b/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/UnicodeHelper.java new file mode 100644 index 0000000..f55e4e2 --- /dev/null +++ b/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/UnicodeHelper.java @@ -0,0 +1,196 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.camel.dataformat.bindy; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; +import java.util.stream.Collectors; + +import com.ibm.icu.text.BreakIterator; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * This class replicates the essential parts of the String class in order to aid + * proper work for Unicode chars in the presence of UTF-16. So for all operations + * please see {@link String} with the same signature. This class is equally immutable. + */ +public class UnicodeHelper implements Serializable { + /** + * Defines how the length of a string is defined, i.e. how chars are counted.
+ */ + public enum Method { + /** + * One "char" is one Unicode codepoint, which is the standard case. + */ + CODEPOINTS, + + /** + * One "char" is one grapheme. + */ + GRAPHEME; + } + + private static final Logger LOG = LoggerFactory.getLogger(UnicodeHelper.class); + + private String input; + + private List<Integer> splitted; + + private Method method; + + /** + * Create instance. + * + * @param input + * String, that is to be wrapped. + * @param method + * Method, that is used to determine "chars" of string. + */ + public UnicodeHelper(final String input, final Method method) { + this.input = input; + this.method = method; + this.splitted = null; + } + + /** + * For Serialization only! + */ + protected UnicodeHelper() { + // Empty + } + + /** + * @return + * Returns the method used to determine the string length. + */ + public Method getMethod() { + return method; + } + + /** + * @see String#substring(int) + */ + public String substring(final int beginIndex) { + split(); + + final int beginChar = splitted.get(beginIndex); + return input.substring(beginChar); + } + + /** + * @see String#substring(int, int) + */ + public String substring(final int beginIndex, final int endIndex) { + split(); + + final int beginChar = splitted.get(beginIndex); + final int endChar = splitted.get(endIndex); + return input.substring(beginChar, endChar); + } + + /** + * @see String#length() + */ + public int length() { + split(); + + return splitted.size() - 1; + } + + /** + * @see String#indexOf(String) + */ + public int indexOf(final String str) { + split(); + + final int tempIdx = input.indexOf(str); + if (tempIdx < 0) { + return tempIdx; + } + + for (int b = 0; b < splitted.size() - 1; b++) { + if (tempIdx == splitted.get(b)) { + for (int e = b + 1; e < splitted.size() - 1; e++) { + if (tempIdx + str.length() == splitted.get(e)) { + return b; + } + } + } + } + + final String cps = str.codePoints().mapToObj(cp -> String.format("0x%X", cp)).collect(Collectors.joining(",")); + throw
new IllegalArgumentException("Given string (" + cps + ") is not a valid sequence of " + this.method + "s."); + } + + private void split() { + if (this.splitted != null) { + return; + } + + if (method.equals(Method.CODEPOINTS)) { + splitCodepoints(); + + } else /* (method.equals(Method.GRAPHEME)) */ { + splitGrapheme(); + } + + LOG.debug("\"{}\" is splitted into {} ({} {}).", input, splitted, splitted.size() - 1, method); + if (LOG.isTraceEnabled()) { + for (int i = 0; i < splitted.size() - 2; i++) { + LOG.trace("segment [{},{}[=\"{}\".", splitted.get(i), splitted.get(i + 1), input.substring(splitted.get(i), splitted.get(i + 1))); + } + } + } + + private void splitCodepoints() { + final List<Integer> result = new ArrayList<>(); + + int i = 0; + final int len = input.length(); + while (i < len) { + result.add(i); + i += (Character.codePointAt(input, i) > 0xffff) ? 2 : 1; + } + result.add(len); + + this.splitted = result; + } + + private void splitGrapheme() { + final List<Integer> result = new ArrayList<>(); + + // + // Caution: The BreakIterator of ICU lib (com.ibm.icu.text.BreakIterator; see dependencies) is used here, + // since the Java builtin one cannot handle modern unicode (Emojis with sex, skin colour, etc.) correctly.
+ // + final BreakIterator bit = BreakIterator.getCharacterInstance(); + bit.setText(input); + + result.add(bit.first()); + for (int end = bit.next(); end != BreakIterator.DONE; end = bit.next()) { + result.add(end); + } + this.splitted = result; + } + + @Override + public String toString() { + return "StringHelper [input=" + input + ", splitted=" + splitted + ", method=" + method + "]"; + } +} diff --git a/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/annotation/FixedLengthRecord.java b/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/annotation/FixedLengthRecord.java index cff27fc..d8d93ae 100644 --- a/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/annotation/FixedLengthRecord.java +++ b/components/camel-bindy/src/main/java/org/apache/camel/dataformat/bindy/annotation/FixedLengthRecord.java @@ -98,4 +98,9 @@ public @interface FixedLengthRecord { * Indicates whether too short lines will be ignored */ boolean ignoreMissingChars() default false; + + /** + * Indicates how chars are counted + */ + boolean countGrapheme() default true; } diff --git a/components/camel-bindy/src/test/java/org/apache/camel/dataformat/bindy/UnicodeHelperTest.java b/components/camel-bindy/src/test/java/org/apache/camel/dataformat/bindy/UnicodeHelperTest.java new file mode 100644 index 0000000..fad38e9 --- /dev/null +++ b/components/camel-bindy/src/test/java/org/apache/camel/dataformat/bindy/UnicodeHelperTest.java @@ -0,0 +1,232 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.camel.dataformat.bindy; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.util.Arrays; +import java.util.stream.Collectors; + +import org.apache.camel.dataformat.bindy.UnicodeHelper.Method; +import org.junit.Assert; +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + + +@SuppressWarnings("javadoc") +public class UnicodeHelperTest { + + private static final Logger LOG = LoggerFactory.getLogger(UnicodeHelperTest.class); + + private static final String UCSTR = cps2String( + 0x1f645, // FACE WITH NO GOOD GESTURE; Basiszeichen (Geste) + 0x1f3ff, // EMOJI MODIFIER FITZPATRICK TYPE-6; Hautfarbe für #1 + 0x200d, // ZERO WIDTH JOINER [ZWJ]; Steuerzeichen zum Verbinden + 0x2642, // MALE SIGN; Geschlecht für #1 + 0xfe0f // VARIATION SELECTOR-16 [VS16]; Darstellung als Piktogramm für #4 + ); + + @Test + public void testLengthCPs() { + final UnicodeHelper lh = new UnicodeHelper("a", Method.CODEPOINTS); + Assert.assertEquals(1, lh.length()); + + final UnicodeHelper lh2 = new UnicodeHelper(new String(Character.toChars(0x1f600)), Method.CODEPOINTS); + Assert.assertEquals(1, lh2.length()); + + final UnicodeHelper lh3 = new UnicodeHelper(UCSTR, Method.CODEPOINTS); + Assert.assertEquals(5, lh3.length()); + + final UnicodeHelper lh4 = new UnicodeHelper("a" + UCSTR + "A", Method.CODEPOINTS); + Assert.assertEquals(7, lh4.length()); + + final UnicodeHelper lh5 = new UnicodeHelper("k\u035fh", Method.CODEPOINTS); + Assert.assertEquals(3, lh5.length()); + } + + @Test + public void 
testLengthGrapheme() { + + final UnicodeHelper lh = new UnicodeHelper("a", Method.GRAPHEME); + Assert.assertEquals(1, lh.length()); + + final UnicodeHelper lh2 = new UnicodeHelper(new String(Character.toChars(0x1f600)), Method.GRAPHEME); + Assert.assertEquals(1, lh2.length()); + + final UnicodeHelper lh3 = new UnicodeHelper(UCSTR, Method.GRAPHEME); + Assert.assertEquals(1, lh3.length()); + + final UnicodeHelper lh4 = new UnicodeHelper("a" + UCSTR + "A", Method.GRAPHEME); + Assert.assertEquals(3, lh4.length()); + + final UnicodeHelper lh5 = new UnicodeHelper("k\u035fh", Method.GRAPHEME); + Assert.assertEquals(2, lh5.length()); + } + + @Test + public void testSubstringCPs() throws FileNotFoundException, IOException { + + final UnicodeHelper lh = new UnicodeHelper("a", Method.CODEPOINTS); + Assert.assertEquals("a", lh.substring(0)); + + final UnicodeHelper lh2 = new UnicodeHelper(new String(Character.toChars(0x1f600)), Method.CODEPOINTS); + Assert.assertEquals(new String(Character.toChars(0x1f600)), lh2.substring(0)); + + final UnicodeHelper lh3 = new UnicodeHelper(UCSTR, Method.CODEPOINTS); + Assert.assertEquals(UCSTR, lh3.substring(0)); + + final UnicodeHelper lh4 = new UnicodeHelper("a" + UCSTR + "A", Method.CODEPOINTS); + Assert.assertEquals(UCSTR + "A", lh4.substring(1)); + Assert.assertEquals(new String(Character.toChars(0x1f3ff)) + "\u200d\u2642\ufe0fA", lh4.substring(2)); + + final UnicodeHelper lh5 = new UnicodeHelper("k\u035fh", Method.CODEPOINTS); + Assert.assertEquals("\u035fh", lh5.substring(1)); + } + + @Test + public void testSubstringGrapheme() throws FileNotFoundException, IOException { + + final UnicodeHelper lh = new UnicodeHelper("a", Method.GRAPHEME); + Assert.assertEquals("a", lh.substring(0)); + + final UnicodeHelper lh2 = new UnicodeHelper(new String(Character.toChars(0x1f600)), Method.GRAPHEME); + Assert.assertEquals(new String(Character.toChars(0x1f600)), lh2.substring(0)); + + final UnicodeHelper lh3 = new UnicodeHelper(UCSTR, 
Method.GRAPHEME); + Assert.assertEquals(UCSTR, lh3.substring(0)); + + final UnicodeHelper lh4 = new UnicodeHelper("a" + UCSTR + "A", Method.GRAPHEME); + Assert.assertEquals(UCSTR + "A", lh4.substring(1)); + Assert.assertEquals("A", lh4.substring(2)); + + final UnicodeHelper lh5 = new UnicodeHelper("k\u035fh", Method.GRAPHEME); + Assert.assertEquals("h", lh5.substring(1)); + } + + @Test + public void testSubstringCPs2() { + + final UnicodeHelper lh = new UnicodeHelper("a", Method.CODEPOINTS); + Assert.assertEquals("a", lh.substring(0, 1)); + + final UnicodeHelper lh2 = new UnicodeHelper(new String(Character.toChars(0x1f600)), Method.CODEPOINTS); + Assert.assertEquals(new String(Character.toChars(0x1f600)), lh2.substring(0, 1)); + + final UnicodeHelper lh3 = new UnicodeHelper(UCSTR, Method.CODEPOINTS); + Assert.assertEquals(new String(Character.toChars(0x1f645)), lh3.substring(0, 1)); + + final UnicodeHelper lh4 = new UnicodeHelper("a" + UCSTR + "A", Method.CODEPOINTS); + Assert.assertEquals("a", lh4.substring(0, 1)); + Assert.assertEquals(new String(Character.toChars(0x1f645)), lh4.substring(1, 2)); + Assert.assertEquals(new String(Character.toChars(0x1f3ff)), lh4.substring(2, 3)); + Assert.assertEquals("a" + new String(Character.toChars(0x1f645)), lh4.substring(0, 2)); + + final UnicodeHelper lh5 = new UnicodeHelper("k\u035fh", Method.CODEPOINTS); + Assert.assertEquals("k", lh5.substring(0, 1)); + Assert.assertEquals("\u035f", lh5.substring(1, 2)); + } + + @Test + public void testSubstringGrapheme2() { + + final UnicodeHelper lh = new UnicodeHelper("a", Method.GRAPHEME); + Assert.assertEquals("a", lh.substring(0, 1)); + + final UnicodeHelper lh2 = new UnicodeHelper(new String(Character.toChars(0x1f600)), Method.GRAPHEME); + Assert.assertEquals(new String(Character.toChars(0x1f600)), lh2.substring(0, 1)); + + final UnicodeHelper lh3 = new UnicodeHelper(UCSTR, Method.GRAPHEME); + Assert.assertEquals(UCSTR, lh3.substring(0, 1)); + + final UnicodeHelper lh4 = new 
UnicodeHelper("a" + UCSTR + "A", Method.GRAPHEME); + Assert.assertEquals("a", lh4.substring(0, 1)); + Assert.assertEquals(UCSTR, lh4.substring(1, 2)); + Assert.assertEquals("A", lh4.substring(2, 3)); + Assert.assertEquals("a" + UCSTR, lh4.substring(0, 2)); + + final UnicodeHelper lh5 = new UnicodeHelper("k\u035fh", Method.GRAPHEME); + Assert.assertEquals("k\u035f", lh5.substring(0, 1)); + Assert.assertEquals("h", lh5.substring(1, 2)); + } + + @Test + public void testIndexOf() { + final UnicodeHelper lh = new UnicodeHelper("a", Method.CODEPOINTS); + Assert.assertEquals(-1, lh.indexOf("b")); + + final UnicodeHelper lh2 = new UnicodeHelper( + "a" + new String(Character.toChars(0x1f600)) + "a" + UCSTR + "A" + "k\u035fh" + "z", + Method.CODEPOINTS); + + Assert.assertEquals(1, lh2.indexOf(new String(Character.toChars(0x1f600)))); + + Assert.assertEquals(3, lh2.indexOf(UCSTR)); + + Assert.assertEquals(10, lh2.indexOf("\u035f")); + + expectIllegalArgumentException(() -> { + lh2.indexOf(Character.toString(Character.toChars(0x1f600)[0])); // UTF-16 surrogates are not codepoints. + }); + } + + @Test + public void testIndexOf2() { + final UnicodeHelper lh = new UnicodeHelper("a", Method.GRAPHEME); + Assert.assertEquals(-1, lh.indexOf("b")); + + final UnicodeHelper lh2 = new UnicodeHelper( + "a" + new String(Character.toChars(0x1f600)) + "a" + UCSTR + "A" + "k\u035fh" + "z", + Method.GRAPHEME); + + Assert.assertEquals(1, lh2.indexOf(new String(Character.toChars(0x1f600)))); + + Assert.assertEquals(3, lh2.indexOf(UCSTR)); + + expectIllegalArgumentException(() -> { + lh2.indexOf("\u035f"); // Codepoint of dangling combining char is not a "unicode char".
+ }); + } + + private void expectIllegalArgumentException(final Runnable r) { + try { + r.run(); + Assert.assertTrue("We do not expect to reach here -- missing IllegalArgumentException.", false); + + } catch (final IllegalArgumentException e) { + LOG.debug("Caught expected IllegalArgumentException", e); + + } + } + + private static String cps2String(final int... cps) { + final StringBuilder buf = new StringBuilder(); + for (int cp : cps) { + buf.append(Character.toChars(cp)); + } + final String result = buf.toString(); + + if (LOG.isDebugEnabled()) { + final String cpStr = Arrays.stream(cps).boxed() + .map(i -> "0x" + Integer.toString(i, 16)) + .collect(Collectors.joining(", ")); + LOG.debug("Built string '{}' from CPs [ {} ].", result, cpStr); + } + + return result; + } +} diff --git a/parent/pom.xml b/parent/pom.xml index a94002e..7e43826 100644 --- a/parent/pom.xml +++ b/parent/pom.xml @@ -282,6 +282,7 @@ <hystrix-bundle-version>1.5.18_1</hystrix-bundle-version> <ibatis-bundle-version>2.3.4.726_4</ibatis-bundle-version> <ical4j-version>1.0.7</ical4j-version> + <icu4j-version>65.1</icu4j-version> <ignite-version>2.7.6</ignite-version> <infinispan-version>10.1.1.Final</infinispan-version> <influx-java-driver-version>2.17</influx-java-driver-version>