This is an automated email from the ASF dual-hosted git repository. jackie pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push: new 78b08e4 convert types per value for array with mixing types (#7234) 78b08e4 is described below commit 78b08e4af41adc2e5e6d7387960df0524e2a7e90 Author: Xiaobing <61892277+klsi...@users.noreply.github.com> AuthorDate: Fri Jul 30 17:31:26 2021 -0700 convert types per value for array with mixing types (#7234) 1. convert types for individual value in array, handling cases where array contains mixing types 2. for safety, stop skipping type conversion based on the speculated type for values in array 3. cast to Number type for numeric PinotDataTypes to make conversion a bit more general --- .../apache/pinot/common/utils/PinotDataType.java | 122 ++++++++++++++------- .../pinot/common/utils/PinotDataTypeTest.java | 59 +++++++++- .../recordtransformer/DataTypeTransformer.java | 14 ++- .../recordtransformer/RecordTransformerTest.java | 6 +- 4 files changed, 153 insertions(+), 48 deletions(-) diff --git a/pinot-common/src/main/java/org/apache/pinot/common/utils/PinotDataType.java b/pinot-common/src/main/java/org/apache/pinot/common/utils/PinotDataType.java index d6df3e4..cb4b5fa 100644 --- a/pinot-common/src/main/java/org/apache/pinot/common/utils/PinotDataType.java +++ b/pinot-common/src/main/java/org/apache/pinot/common/utils/PinotDataType.java @@ -119,22 +119,22 @@ public enum PinotDataType { BYTE { @Override public int toInt(Object value) { - return ((Byte) value).intValue(); + return ((Number) value).intValue(); } @Override public long toLong(Object value) { - return ((Byte) value).longValue(); + return ((Number) value).longValue(); } @Override public float toFloat(Object value) { - return ((Byte) value).floatValue(); + return ((Number) value).floatValue(); } @Override public double toDouble(Object value) { - return ((Byte) value).doubleValue(); + return ((Number) value).doubleValue(); } @Override @@ -203,22 +203,22 @@ public enum PinotDataType { SHORT { @Override public int toInt(Object value) { - return ((Short) value).intValue(); + return ((Number) value).intValue(); } @Override public long toLong(Object value) { - return ((Short) value).longValue(); + return ((Number) value).longValue(); } @Override public float toFloat(Object value) { - return ((Short) value).floatValue(); + return ((Number) value).floatValue(); } @Override public double toDouble(Object value) { - return ((Short) value).doubleValue(); + return ((Number) value).doubleValue(); } @Override @@ -250,17 +250,17 @@ public enum PinotDataType { @Override public long toLong(Object value) { - return ((Integer) value).longValue(); + return ((Number) value).longValue(); } @Override public float toFloat(Object value) { - return ((Integer) value).floatValue(); + return ((Number) value).floatValue(); } @Override public double toDouble(Object value) { - return ((Integer) value).doubleValue(); + return ((Number) value).doubleValue(); } @Override @@ -292,7 +292,7 @@ public enum PinotDataType { LONG { @Override public int toInt(Object value) { - return ((Long) value).intValue(); + return ((Number) value).intValue(); } @Override @@ -302,12 +302,12 @@ public enum PinotDataType { @Override public float toFloat(Object value) { - return ((Long) value).floatValue(); + return ((Number) value).floatValue(); } @Override public double toDouble(Object value) { - return ((Long) value).doubleValue(); + return ((Number) value).doubleValue(); } @Override @@ -339,12 +339,12 @@ public enum PinotDataType { FLOAT { @Override public int toInt(Object value) { - return ((Float) value).intValue(); + return ((Number) value).intValue(); } @Override public long toLong(Object value) { - return ((Float) value).longValue(); + return ((Number) value).longValue(); } @Override @@ -354,7 +354,7 @@ public enum PinotDataType { @Override public double toDouble(Object value) { - return ((Float) value).doubleValue(); + return ((Number) value).doubleValue(); } @Override @@ -386,17 +386,17 @@ public enum PinotDataType { DOUBLE { @Override public int toInt(Object value) { - return ((Double) value).intValue(); + return ((Number) value).intValue(); } @Override public long toLong(Object value) { - return ((Double) value).longValue(); + return ((Number) value).longValue(); } @Override public float toFloat(Object value) { - return ((Double) value).floatValue(); + return ((Number) value).floatValue(); } @Override @@ -867,7 +867,11 @@ public enum PinotDataType { int[] intArray = new int[length]; PinotDataType singleValueType = getSingleValueType(); for (int i = 0; i < length; i++) { - intArray[i] = singleValueType.toInt(valueArray[i]); + try { + intArray[i] = singleValueType.toInt(valueArray[i]); + } catch (ClassCastException e) { + intArray[i] = anyToInt(valueArray[i]); + } } return intArray; } @@ -885,7 +889,11 @@ public enum PinotDataType { Integer[] integerArray = new Integer[length]; PinotDataType singleValueType = getSingleValueType(); for (int i = 0; i < length; i++) { - integerArray[i] = singleValueType.toInt(valueArray[i]); + try { + integerArray[i] = singleValueType.toInt(valueArray[i]); + } catch (ClassCastException e) { + integerArray[i] = anyToInt(valueArray[i]); + } } return integerArray; } @@ -903,7 +911,11 @@ public enum PinotDataType { long[] longArray = new long[length]; PinotDataType singleValueType = getSingleValueType(); for (int i = 0; i < length; i++) { - longArray[i] = singleValueType.toLong(valueArray[i]); + try { + longArray[i] = singleValueType.toLong(valueArray[i]); + } catch (ClassCastException e) { + longArray[i] = anyToLong(valueArray[i]); + } } return longArray; } @@ -921,7 +933,11 @@ public enum PinotDataType { Long[] longArray = new Long[length]; PinotDataType singleValueType = getSingleValueType(); for (int i = 0; i < length; i++) { - longArray[i] = singleValueType.toLong(valueArray[i]); + try { + longArray[i] = singleValueType.toLong(valueArray[i]); + } catch (ClassCastException e) { + longArray[i] = anyToLong(valueArray[i]); + } } return longArray; } @@ -939,7 +955,11 @@ public enum PinotDataType { float[] floatArray = new float[length]; PinotDataType singleValueType = getSingleValueType(); for (int i = 0; i < length; i++) { - floatArray[i] = singleValueType.toFloat(valueArray[i]); + try { + floatArray[i] = singleValueType.toFloat(valueArray[i]); + } catch (ClassCastException e) { + floatArray[i] = anyToFloat(valueArray[i]); + } } return floatArray; } @@ -957,7 +977,11 @@ public enum PinotDataType { Float[] floatArray = new Float[length]; PinotDataType singleValueType = getSingleValueType(); for (int i = 0; i < length; i++) { - floatArray[i] = singleValueType.toFloat(valueArray[i]); + try { + floatArray[i] = singleValueType.toFloat(valueArray[i]); + } catch (ClassCastException e) { + floatArray[i] = anyToFloat(valueArray[i]); + } } return floatArray; } @@ -975,7 +999,11 @@ public enum PinotDataType { double[] doubleArray = new double[length]; PinotDataType singleValueType = getSingleValueType(); for (int i = 0; i < length; i++) { - doubleArray[i] = singleValueType.toDouble(valueArray[i]); + try { + doubleArray[i] = singleValueType.toDouble(valueArray[i]); + } catch (ClassCastException e) { + doubleArray[i] = anyToDouble(valueArray[i]); + } } return doubleArray; } @@ -993,7 +1021,11 @@ public enum PinotDataType { Double[] doubleArray = new Double[length]; PinotDataType singleValueType = getSingleValueType(); for (int i = 0; i < length; i++) { - doubleArray[i] = singleValueType.toDouble(valueArray[i]); + try { + doubleArray[i] = singleValueType.toDouble(valueArray[i]); + } catch (ClassCastException e) { + doubleArray[i] = anyToDouble(valueArray[i]); + } } return doubleArray; } @@ -1096,6 +1128,22 @@ public enum PinotDataType { return (pdt != null) ? pdt : OBJECT_ARRAY; } + private static int anyToInt(Object val) { + return getSingleValueType(val.getClass()).toInt(val); + } + + private static long anyToLong(Object val) { + return getSingleValueType(val.getClass()).toLong(val); + } + + private static float anyToFloat(Object val) { + return getSingleValueType(val.getClass()).toFloat(val); + } + + private static double anyToDouble(Object val) { + return getSingleValueType(val.getClass()).toDouble(val); + } + /** * Returns the {@link PinotDataType} for the given {@link FieldSpec} for data ingestion purpose. Returns object array * type for multi-valued types. @@ -1105,36 +1153,36 @@ public enum PinotDataType { DataType dataType = fieldSpec.getDataType(); switch (dataType) { case INT: - return fieldSpec.isSingleValueField() ? PinotDataType.INTEGER : PinotDataType.INTEGER_ARRAY; + return fieldSpec.isSingleValueField() ? INTEGER : INTEGER_ARRAY; case LONG: - return fieldSpec.isSingleValueField() ? PinotDataType.LONG : PinotDataType.LONG_ARRAY; + return fieldSpec.isSingleValueField() ? LONG : LONG_ARRAY; case FLOAT: - return fieldSpec.isSingleValueField() ? PinotDataType.FLOAT : PinotDataType.FLOAT_ARRAY; + return fieldSpec.isSingleValueField() ? FLOAT : FLOAT_ARRAY; case DOUBLE: - return fieldSpec.isSingleValueField() ? PinotDataType.DOUBLE : PinotDataType.DOUBLE_ARRAY; + return fieldSpec.isSingleValueField() ? DOUBLE : DOUBLE_ARRAY; case BOOLEAN: if (fieldSpec.isSingleValueField()) { - return PinotDataType.BOOLEAN; + return BOOLEAN; } else { throw new IllegalStateException("There is no multi-value type for BOOLEAN"); } case TIMESTAMP: if (fieldSpec.isSingleValueField()) { - return PinotDataType.TIMESTAMP; + return TIMESTAMP; } else { throw new IllegalStateException("There is no multi-value type for TIMESTAMP"); } case JSON: if (fieldSpec.isSingleValueField()) { - return PinotDataType.JSON; + return JSON; } else { throw new IllegalStateException("There is no multi-value type for JSON"); } case STRING: - return fieldSpec.isSingleValueField() ? PinotDataType.STRING : PinotDataType.STRING_ARRAY; + return fieldSpec.isSingleValueField() ? STRING : STRING_ARRAY; case BYTES: if (fieldSpec.isSingleValueField()) { - return PinotDataType.BYTES; + return BYTES; } else { throw new IllegalStateException("There is no multi-value type for BYTES"); } diff --git a/pinot-common/src/test/java/org/apache/pinot/common/utils/PinotDataTypeTest.java b/pinot-common/src/test/java/org/apache/pinot/common/utils/PinotDataTypeTest.java index 9ca7d4e..e1e3e39 100644 --- a/pinot-common/src/test/java/org/apache/pinot/common/utils/PinotDataTypeTest.java +++ b/pinot-common/src/test/java/org/apache/pinot/common/utils/PinotDataTypeTest.java @@ -46,6 +46,20 @@ public class PinotDataTypeTest { (byte) 123), Character.toString((char) 123), Short.toString((short) 123), Integer.toString( 123), Long.toString(123L), Float.toString(123f), Double.toString(123d), " 123"}; + // Test cases where array for MV column contains values of mixing types. + private static final PinotDataType[] SOURCE_ARRAY_TYPES = + {SHORT_ARRAY, INTEGER_ARRAY, LONG_ARRAY, FLOAT_ARRAY, DOUBLE_ARRAY}; + private static final Object[] SOURCE_ARRAY_VALUES = new Object[]{(short) 123, 4, 5L, 6f, 7d, "8"}; + + private static final PinotDataType[] DEST_ARRAY_TYPES = {INTEGER_ARRAY, LONG_ARRAY, FLOAT_ARRAY, DOUBLE_ARRAY}; + private static final Object[] EXPECTED_DEST_ARRAY_VALUES = + {new Object[]{123, 4, 5, 6, 7, 8}, new Object[]{123L, 4L, 5L, 6L, 7L, 8L}, new Object[]{123f, 4f, 5f, 6f, 7f, 8f}, new Object[]{123d, 4d, 5d, 6d, 7d, 8d}}; + + private static final PinotDataType[] DEST_PRIMITIVE_ARRAY_TYPES = + {PRIMITIVE_INT_ARRAY, PRIMITIVE_LONG_ARRAY, PRIMITIVE_FLOAT_ARRAY, PRIMITIVE_DOUBLE_ARRAY}; + private static final Object[] EXPECTED_DEST_PRIMITIVE_ARRAY_VALUES = + {new int[]{123, 4, 5, 6, 7, 8}, new long[]{123L, 4L, 5L, 6L, 7L, 8L}, new float[]{123f, 4f, 5f, 6f, 7f, 8f}, new double[]{123d, 4d, 5d, 6d, 7d, 8d}}; + @Test public void testNumberConversion() { int numDestTypes = DEST_TYPES.length; @@ -61,6 +75,36 @@ public class PinotDataTypeTest { } @Test + public void testConversionWithMixTypes() { + int numDestTypes = DEST_ARRAY_TYPES.length; + for (int i = 0; i < numDestTypes; i++) { + PinotDataType destType = DEST_ARRAY_TYPES[i]; + Object expectedDestValue = EXPECTED_DEST_ARRAY_VALUES[i]; + for (PinotDataType sourceArrayType : SOURCE_ARRAY_TYPES) { + Object actualDestValue = destType.convert(SOURCE_ARRAY_VALUES, sourceArrayType); + assertEquals(actualDestValue, expectedDestValue); + } + } + + numDestTypes = DEST_PRIMITIVE_ARRAY_TYPES.length; + for (int i = 0; i < numDestTypes; i++) { + PinotDataType destType = DEST_PRIMITIVE_ARRAY_TYPES[i]; + Object expectedDestValue = EXPECTED_DEST_PRIMITIVE_ARRAY_VALUES[i]; + for (PinotDataType sourceArrayType : SOURCE_ARRAY_TYPES) { + Object actualDestValue = destType.convert(SOURCE_ARRAY_VALUES, sourceArrayType); + assertEquals(actualDestValue, expectedDestValue); + } + } + + try { + INTEGER_ARRAY.convert(new Object[]{"abc"}, LONG_ARRAY); + fail(); + } catch (NumberFormatException e) { + // expected to reach here + } + } + + @Test public void testToString() { int numSourceTypes = SOURCE_TYPES.length; for (int i = 0; i < numSourceTypes; i++) { @@ -97,7 +141,7 @@ public class PinotDataTypeTest { assertEquals(STRING.convert(new byte[]{0, 1}, BYTES), "0001"); assertEquals(BYTES.convert("0001", STRING), new byte[]{0, 1}); assertEquals(BYTES.convert(new byte[]{0, 1}, BYTES), new byte[]{0, 1}); - assertEquals(BYTES.convert("AAE=", JSON), new byte[]{0,1}); + assertEquals(BYTES.convert("AAE=", JSON), new byte[]{0, 1}); assertEquals(BYTES.convert(new Byte[]{0, 1}, BYTE_ARRAY), new byte[]{0, 1}); assertEquals(BYTES.convert(new String[]{"0001"}, STRING_ARRAY), new byte[]{0, 1}); } @@ -116,7 +160,10 @@ public class PinotDataTypeTest { assertEquals(JSON.convert(false, BOOLEAN), "false"); assertEquals(JSON.convert(true, BOOLEAN), "true"); assertEquals(JSON.convert(new byte[]{0, 1}, BYTES), "\"AAE=\""); // Base64 encoding. - assertEquals(JSON.convert("{\"bytes\":\"AAE=\",\"map\":{\"key1\":\"value\",\"key2\":null,\"array\":[-5.4,4,\"2\"]},\"timestamp\":1620324238610}", STRING), "{\"bytes\":\"AAE=\",\"map\":{\"key1\":\"value\",\"key2\":null,\"array\":[-5.4,4,\"2\"]},\"timestamp\":1620324238610}"); + assertEquals(JSON.convert( + "{\"bytes\":\"AAE=\",\"map\":{\"key1\":\"value\",\"key2\":null,\"array\":[-5.4,4,\"2\"]},\"timestamp\":1620324238610}", + STRING), + "{\"bytes\":\"AAE=\",\"map\":{\"key1\":\"value\",\"key2\":null,\"array\":[-5.4,4,\"2\"]},\"timestamp\":1620324238610}"); assertEquals(JSON.convert(new Timestamp(1620324238610l), TIMESTAMP), "1620324238610"); } @@ -129,7 +176,8 @@ public class PinotDataTypeTest { assertTrue(OBJECT.toBoolean(new NumberObject("123"))); assertEquals(OBJECT.toTimestamp(new NumberObject("123")).getTime(), 123L); assertEquals(OBJECT.toString(new NumberObject("123")), "123"); - assertEquals(OBJECT.toJson(getGenericTestObject()), "{\"bytes\":\"AAE=\",\"map\":{\"key1\":\"value\",\"key2\":null,\"array\":[-5.4,4,\"2\"]},\"timestamp\":1620324238610}"); + assertEquals(OBJECT.toJson(getGenericTestObject()), + "{\"bytes\":\"AAE=\",\"map\":{\"key1\":\"value\",\"key2\":null,\"array\":[-5.4,4,\"2\"]},\"timestamp\":1620324238610}"); assertEquals(OBJECT_ARRAY.getSingleValueType(), OBJECT); // Non-zero value is treated as true. assertTrue(OBJECT.toBoolean(1.1d)); @@ -181,13 +229,13 @@ public class PinotDataTypeTest { private static Object getGenericTestObject() { Map<String, Object> map1 = new HashMap<>(); - map1.put("array", Arrays.asList(-5.4,4, "2")); + map1.put("array", Arrays.asList(-5.4, 4, "2")); map1.put("key1", "value"); map1.put("key2", null); Map<String, Object> map2 = new HashMap<>(); map2.put("map", map1); - map2.put("bytes", new byte[]{0,1}); + map2.put("bytes", new byte[]{0, 1}); map2.put("timestamp", new Timestamp(1620324238610l)); return map2; @@ -222,7 +270,6 @@ public class PinotDataTypeTest { } assertInvalidConversion("xyz", STRING, JSON, RuntimeException.class); - } private void assertInvalidConversion(Object value, PinotDataType sourceType, PinotDataType destType, diff --git a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/DataTypeTransformer.java b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/DataTypeTransformer.java index bbc6d7d..fbf18bf 100644 --- a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/DataTypeTransformer.java +++ b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/DataTypeTransformer.java @@ -77,9 +77,17 @@ public class DataTypeTransformer implements RecordTransformer { // Single-value column source = PinotDataType.getSingleValueType(value.getClass()); } - if (source != dest) { - value = dest.convert(value, source); - } + // Skipping conversion when srcType!=destType is speculative, and can be unsafe when + // the array for MV column contains values of mixing types. Mixing types can lead + // to ClassCastException during conversion, often aborting data ingestion jobs. + // + // So now, calling convert() unconditionally for safety. Perf impact is negligible: + // 1. for SV column, when srcType=destType, the conversion is simply pass through. + // 2. for MV column, when srcType=destType, the conversion is simply pass through + // if the source type is not Object[] (but sth like Integer[], Double[]). For Object[], + // the conversion loops through values in the array like before, but can catch the + // ClassCastException if it happens and continue the conversion now. + value = dest.convert(value, source); value = dest.toInternal(value); record.putValue(column, value); diff --git a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/RecordTransformerTest.java b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/RecordTransformerTest.java index 367be7e..e711697 100644 --- a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/RecordTransformerTest.java +++ b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/RecordTransformerTest.java @@ -150,8 +150,10 @@ public class RecordTransformerTest { assertEquals(record.getValue("mvDouble"), new Object[]{123d}); assertEquals(record.getValue("svStringWithNullCharacters"), "1\0002\0003"); assertEquals(record.getValue("svStringWithLengthLimit"), "123"); - // NOTE: We identify the array type by the first element, so data type conversion only applied to 'mvString2' - assertEquals(record.getValue("mvString1"), new Object[]{"123", 123, 123L, 123f, 123.0}); + // NOTE: We used to speculate the array type by the first element, but has changed + // to convert type for values in array based on their real value, making the logic + // safe to handle array of values with mixing types. + assertEquals(record.getValue("mvString1"), new Object[]{"123", "123", "123", "123.0", "123.0"}); assertEquals(record.getValue("mvString2"), new Object[]{"123", "123", "123.0", "123.0", "123"}); assertNull(record.getValue("$virtual")); assertTrue(record.getNullValueFields().isEmpty()); --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org For additional commands, e-mail: commits-h...@pinot.apache.org