This is an automated email from the ASF dual-hosted git repository.

jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 78b08e4  convert types per value for array with mixing types (#7234)
78b08e4 is described below

commit 78b08e4af41adc2e5e6d7387960df0524e2a7e90
Author: Xiaobing <61892277+klsi...@users.noreply.github.com>
AuthorDate: Fri Jul 30 17:31:26 2021 -0700

    convert types per value for array with mixing types (#7234)
    
    1. convert types for individual value in array, handling cases where array 
contains mixing types
    2. for safety, stop skipping type conversion based on the speculated type 
for values in array
    3. cast to Number type for numeric PinotDataTypes to make conversion a bit 
more general
---
 .../apache/pinot/common/utils/PinotDataType.java   | 122 ++++++++++++++-------
 .../pinot/common/utils/PinotDataTypeTest.java      |  59 +++++++++-
 .../recordtransformer/DataTypeTransformer.java     |  14 ++-
 .../recordtransformer/RecordTransformerTest.java   |   6 +-
 4 files changed, 153 insertions(+), 48 deletions(-)

diff --git 
a/pinot-common/src/main/java/org/apache/pinot/common/utils/PinotDataType.java 
b/pinot-common/src/main/java/org/apache/pinot/common/utils/PinotDataType.java
index d6df3e4..cb4b5fa 100644
--- 
a/pinot-common/src/main/java/org/apache/pinot/common/utils/PinotDataType.java
+++ 
b/pinot-common/src/main/java/org/apache/pinot/common/utils/PinotDataType.java
@@ -119,22 +119,22 @@ public enum PinotDataType {
   BYTE {
     @Override
     public int toInt(Object value) {
-      return ((Byte) value).intValue();
+      return ((Number) value).intValue();
     }
 
     @Override
     public long toLong(Object value) {
-      return ((Byte) value).longValue();
+      return ((Number) value).longValue();
     }
 
     @Override
     public float toFloat(Object value) {
-      return ((Byte) value).floatValue();
+      return ((Number) value).floatValue();
     }
 
     @Override
     public double toDouble(Object value) {
-      return ((Byte) value).doubleValue();
+      return ((Number) value).doubleValue();
     }
 
     @Override
@@ -203,22 +203,22 @@ public enum PinotDataType {
   SHORT {
     @Override
     public int toInt(Object value) {
-      return ((Short) value).intValue();
+      return ((Number) value).intValue();
     }
 
     @Override
     public long toLong(Object value) {
-      return ((Short) value).longValue();
+      return ((Number) value).longValue();
     }
 
     @Override
     public float toFloat(Object value) {
-      return ((Short) value).floatValue();
+      return ((Number) value).floatValue();
     }
 
     @Override
     public double toDouble(Object value) {
-      return ((Short) value).doubleValue();
+      return ((Number) value).doubleValue();
     }
 
     @Override
@@ -250,17 +250,17 @@ public enum PinotDataType {
 
     @Override
     public long toLong(Object value) {
-      return ((Integer) value).longValue();
+      return ((Number) value).longValue();
     }
 
     @Override
     public float toFloat(Object value) {
-      return ((Integer) value).floatValue();
+      return ((Number) value).floatValue();
     }
 
     @Override
     public double toDouble(Object value) {
-      return ((Integer) value).doubleValue();
+      return ((Number) value).doubleValue();
     }
 
     @Override
@@ -292,7 +292,7 @@ public enum PinotDataType {
   LONG {
     @Override
     public int toInt(Object value) {
-      return ((Long) value).intValue();
+      return ((Number) value).intValue();
     }
 
     @Override
@@ -302,12 +302,12 @@ public enum PinotDataType {
 
     @Override
     public float toFloat(Object value) {
-      return ((Long) value).floatValue();
+      return ((Number) value).floatValue();
     }
 
     @Override
     public double toDouble(Object value) {
-      return ((Long) value).doubleValue();
+      return ((Number) value).doubleValue();
     }
 
     @Override
@@ -339,12 +339,12 @@ public enum PinotDataType {
   FLOAT {
     @Override
     public int toInt(Object value) {
-      return ((Float) value).intValue();
+      return ((Number) value).intValue();
     }
 
     @Override
     public long toLong(Object value) {
-      return ((Float) value).longValue();
+      return ((Number) value).longValue();
     }
 
     @Override
@@ -354,7 +354,7 @@ public enum PinotDataType {
 
     @Override
     public double toDouble(Object value) {
-      return ((Float) value).doubleValue();
+      return ((Number) value).doubleValue();
     }
 
     @Override
@@ -386,17 +386,17 @@ public enum PinotDataType {
   DOUBLE {
     @Override
     public int toInt(Object value) {
-      return ((Double) value).intValue();
+      return ((Number) value).intValue();
     }
 
     @Override
     public long toLong(Object value) {
-      return ((Double) value).longValue();
+      return ((Number) value).longValue();
     }
 
     @Override
     public float toFloat(Object value) {
-      return ((Double) value).floatValue();
+      return ((Number) value).floatValue();
     }
 
     @Override
@@ -867,7 +867,11 @@ public enum PinotDataType {
       int[] intArray = new int[length];
       PinotDataType singleValueType = getSingleValueType();
       for (int i = 0; i < length; i++) {
-        intArray[i] = singleValueType.toInt(valueArray[i]);
+        try {
+          intArray[i] = singleValueType.toInt(valueArray[i]);
+        } catch (ClassCastException e) {
+          intArray[i] = anyToInt(valueArray[i]);
+        }
       }
       return intArray;
     }
@@ -885,7 +889,11 @@ public enum PinotDataType {
       Integer[] integerArray = new Integer[length];
       PinotDataType singleValueType = getSingleValueType();
       for (int i = 0; i < length; i++) {
-        integerArray[i] = singleValueType.toInt(valueArray[i]);
+        try {
+          integerArray[i] = singleValueType.toInt(valueArray[i]);
+        } catch (ClassCastException e) {
+          integerArray[i] = anyToInt(valueArray[i]);
+        }
       }
       return integerArray;
     }
@@ -903,7 +911,11 @@ public enum PinotDataType {
       long[] longArray = new long[length];
       PinotDataType singleValueType = getSingleValueType();
       for (int i = 0; i < length; i++) {
-        longArray[i] = singleValueType.toLong(valueArray[i]);
+        try {
+          longArray[i] = singleValueType.toLong(valueArray[i]);
+        } catch (ClassCastException e) {
+          longArray[i] = anyToLong(valueArray[i]);
+        }
       }
       return longArray;
     }
@@ -921,7 +933,11 @@ public enum PinotDataType {
       Long[] longArray = new Long[length];
       PinotDataType singleValueType = getSingleValueType();
       for (int i = 0; i < length; i++) {
-        longArray[i] = singleValueType.toLong(valueArray[i]);
+        try {
+          longArray[i] = singleValueType.toLong(valueArray[i]);
+        } catch (ClassCastException e) {
+          longArray[i] = anyToLong(valueArray[i]);
+        }
       }
       return longArray;
     }
@@ -939,7 +955,11 @@ public enum PinotDataType {
       float[] floatArray = new float[length];
       PinotDataType singleValueType = getSingleValueType();
       for (int i = 0; i < length; i++) {
-        floatArray[i] = singleValueType.toFloat(valueArray[i]);
+        try {
+          floatArray[i] = singleValueType.toFloat(valueArray[i]);
+        } catch (ClassCastException e) {
+          floatArray[i] = anyToFloat(valueArray[i]);
+        }
       }
       return floatArray;
     }
@@ -957,7 +977,11 @@ public enum PinotDataType {
       Float[] floatArray = new Float[length];
       PinotDataType singleValueType = getSingleValueType();
       for (int i = 0; i < length; i++) {
-        floatArray[i] = singleValueType.toFloat(valueArray[i]);
+        try {
+          floatArray[i] = singleValueType.toFloat(valueArray[i]);
+        } catch (ClassCastException e) {
+          floatArray[i] = anyToFloat(valueArray[i]);
+        }
       }
       return floatArray;
     }
@@ -975,7 +999,11 @@ public enum PinotDataType {
       double[] doubleArray = new double[length];
       PinotDataType singleValueType = getSingleValueType();
       for (int i = 0; i < length; i++) {
-        doubleArray[i] = singleValueType.toDouble(valueArray[i]);
+        try {
+          doubleArray[i] = singleValueType.toDouble(valueArray[i]);
+        } catch (ClassCastException e) {
+          doubleArray[i] = anyToDouble(valueArray[i]);
+        }
       }
       return doubleArray;
     }
@@ -993,7 +1021,11 @@ public enum PinotDataType {
       Double[] doubleArray = new Double[length];
       PinotDataType singleValueType = getSingleValueType();
       for (int i = 0; i < length; i++) {
-        doubleArray[i] = singleValueType.toDouble(valueArray[i]);
+        try {
+          doubleArray[i] = singleValueType.toDouble(valueArray[i]);
+        } catch (ClassCastException e) {
+          doubleArray[i] = anyToDouble(valueArray[i]);
+        }
       }
       return doubleArray;
     }
@@ -1096,6 +1128,22 @@ public enum PinotDataType {
     return (pdt != null) ? pdt : OBJECT_ARRAY;
   }
 
+  private static int anyToInt(Object val) {
+    return getSingleValueType(val.getClass()).toInt(val);
+  }
+
+  private static long anyToLong(Object val) {
+    return getSingleValueType(val.getClass()).toLong(val);
+  }
+
+  private static float anyToFloat(Object val) {
+    return getSingleValueType(val.getClass()).toFloat(val);
+  }
+
+  private static double anyToDouble(Object val) {
+    return getSingleValueType(val.getClass()).toDouble(val);
+  }
+
   /**
    * Returns the {@link PinotDataType} for the given {@link FieldSpec} for 
data ingestion purpose. Returns object array
    * type for multi-valued types.
@@ -1105,36 +1153,36 @@ public enum PinotDataType {
     DataType dataType = fieldSpec.getDataType();
     switch (dataType) {
       case INT:
-        return fieldSpec.isSingleValueField() ? PinotDataType.INTEGER : 
PinotDataType.INTEGER_ARRAY;
+        return fieldSpec.isSingleValueField() ? INTEGER : INTEGER_ARRAY;
       case LONG:
-        return fieldSpec.isSingleValueField() ? PinotDataType.LONG : 
PinotDataType.LONG_ARRAY;
+        return fieldSpec.isSingleValueField() ? LONG : LONG_ARRAY;
       case FLOAT:
-        return fieldSpec.isSingleValueField() ? PinotDataType.FLOAT : 
PinotDataType.FLOAT_ARRAY;
+        return fieldSpec.isSingleValueField() ? FLOAT : FLOAT_ARRAY;
       case DOUBLE:
-        return fieldSpec.isSingleValueField() ? PinotDataType.DOUBLE : 
PinotDataType.DOUBLE_ARRAY;
+        return fieldSpec.isSingleValueField() ? DOUBLE : DOUBLE_ARRAY;
       case BOOLEAN:
         if (fieldSpec.isSingleValueField()) {
-          return PinotDataType.BOOLEAN;
+          return BOOLEAN;
         } else {
           throw new IllegalStateException("There is no multi-value type for 
BOOLEAN");
         }
       case TIMESTAMP:
         if (fieldSpec.isSingleValueField()) {
-          return PinotDataType.TIMESTAMP;
+          return TIMESTAMP;
         } else {
           throw new IllegalStateException("There is no multi-value type for 
TIMESTAMP");
         }
       case JSON:
         if (fieldSpec.isSingleValueField()) {
-          return PinotDataType.JSON;
+          return JSON;
         } else {
           throw new IllegalStateException("There is no multi-value type for 
JSON");
         }
       case STRING:
-        return fieldSpec.isSingleValueField() ? PinotDataType.STRING : 
PinotDataType.STRING_ARRAY;
+        return fieldSpec.isSingleValueField() ? STRING : STRING_ARRAY;
       case BYTES:
         if (fieldSpec.isSingleValueField()) {
-          return PinotDataType.BYTES;
+          return BYTES;
         } else {
           throw new IllegalStateException("There is no multi-value type for 
BYTES");
         }
diff --git 
a/pinot-common/src/test/java/org/apache/pinot/common/utils/PinotDataTypeTest.java
 
b/pinot-common/src/test/java/org/apache/pinot/common/utils/PinotDataTypeTest.java
index 9ca7d4e..e1e3e39 100644
--- 
a/pinot-common/src/test/java/org/apache/pinot/common/utils/PinotDataTypeTest.java
+++ 
b/pinot-common/src/test/java/org/apache/pinot/common/utils/PinotDataTypeTest.java
@@ -46,6 +46,20 @@ public class PinotDataTypeTest {
           (byte) 123), Character.toString((char) 123), Short.toString((short) 
123), Integer.toString(
           123), Long.toString(123L), Float.toString(123f), 
Double.toString(123d), " 123"};
 
+  // Test cases where array for MV column contains values of mixing types.
+  private static final PinotDataType[] SOURCE_ARRAY_TYPES =
+      {SHORT_ARRAY, INTEGER_ARRAY, LONG_ARRAY, FLOAT_ARRAY, DOUBLE_ARRAY};
+  private static final Object[] SOURCE_ARRAY_VALUES = new Object[]{(short) 
123, 4, 5L, 6f, 7d, "8"};
+
+  private static final PinotDataType[] DEST_ARRAY_TYPES = {INTEGER_ARRAY, 
LONG_ARRAY, FLOAT_ARRAY, DOUBLE_ARRAY};
+  private static final Object[] EXPECTED_DEST_ARRAY_VALUES =
+      {new Object[]{123, 4, 5, 6, 7, 8}, new Object[]{123L, 4L, 5L, 6L, 7L, 
8L}, new Object[]{123f, 4f, 5f, 6f, 7f, 8f}, new Object[]{123d, 4d, 5d, 6d, 7d, 
8d}};
+
+  private static final PinotDataType[] DEST_PRIMITIVE_ARRAY_TYPES =
+      {PRIMITIVE_INT_ARRAY, PRIMITIVE_LONG_ARRAY, PRIMITIVE_FLOAT_ARRAY, 
PRIMITIVE_DOUBLE_ARRAY};
+  private static final Object[] EXPECTED_DEST_PRIMITIVE_ARRAY_VALUES =
+      {new int[]{123, 4, 5, 6, 7, 8}, new long[]{123L, 4L, 5L, 6L, 7L, 8L}, 
new float[]{123f, 4f, 5f, 6f, 7f, 8f}, new double[]{123d, 4d, 5d, 6d, 7d, 8d}};
+
   @Test
   public void testNumberConversion() {
     int numDestTypes = DEST_TYPES.length;
@@ -61,6 +75,36 @@ public class PinotDataTypeTest {
   }
 
   @Test
+  public void testConversionWithMixTypes() {
+    int numDestTypes = DEST_ARRAY_TYPES.length;
+    for (int i = 0; i < numDestTypes; i++) {
+      PinotDataType destType = DEST_ARRAY_TYPES[i];
+      Object expectedDestValue = EXPECTED_DEST_ARRAY_VALUES[i];
+      for (PinotDataType sourceArrayType : SOURCE_ARRAY_TYPES) {
+        Object actualDestValue = destType.convert(SOURCE_ARRAY_VALUES, 
sourceArrayType);
+        assertEquals(actualDestValue, expectedDestValue);
+      }
+    }
+
+    numDestTypes = DEST_PRIMITIVE_ARRAY_TYPES.length;
+    for (int i = 0; i < numDestTypes; i++) {
+      PinotDataType destType = DEST_PRIMITIVE_ARRAY_TYPES[i];
+      Object expectedDestValue = EXPECTED_DEST_PRIMITIVE_ARRAY_VALUES[i];
+      for (PinotDataType sourceArrayType : SOURCE_ARRAY_TYPES) {
+        Object actualDestValue = destType.convert(SOURCE_ARRAY_VALUES, 
sourceArrayType);
+        assertEquals(actualDestValue, expectedDestValue);
+      }
+    }
+
+    try {
+      INTEGER_ARRAY.convert(new Object[]{"abc"}, LONG_ARRAY);
+      fail();
+    } catch (NumberFormatException e) {
+      // expected to reach here
+    }
+  }
+
+  @Test
   public void testToString() {
     int numSourceTypes = SOURCE_TYPES.length;
     for (int i = 0; i < numSourceTypes; i++) {
@@ -97,7 +141,7 @@ public class PinotDataTypeTest {
     assertEquals(STRING.convert(new byte[]{0, 1}, BYTES), "0001");
     assertEquals(BYTES.convert("0001", STRING), new byte[]{0, 1});
     assertEquals(BYTES.convert(new byte[]{0, 1}, BYTES), new byte[]{0, 1});
-    assertEquals(BYTES.convert("AAE=", JSON), new byte[]{0,1});
+    assertEquals(BYTES.convert("AAE=", JSON), new byte[]{0, 1});
     assertEquals(BYTES.convert(new Byte[]{0, 1}, BYTE_ARRAY), new byte[]{0, 
1});
     assertEquals(BYTES.convert(new String[]{"0001"}, STRING_ARRAY), new 
byte[]{0, 1});
   }
@@ -116,7 +160,10 @@ public class PinotDataTypeTest {
     assertEquals(JSON.convert(false, BOOLEAN), "false");
     assertEquals(JSON.convert(true, BOOLEAN), "true");
     assertEquals(JSON.convert(new byte[]{0, 1}, BYTES), "\"AAE=\""); // Base64 
encoding.
-    
assertEquals(JSON.convert("{\"bytes\":\"AAE=\",\"map\":{\"key1\":\"value\",\"key2\":null,\"array\":[-5.4,4,\"2\"]},\"timestamp\":1620324238610}",
 STRING), 
"{\"bytes\":\"AAE=\",\"map\":{\"key1\":\"value\",\"key2\":null,\"array\":[-5.4,4,\"2\"]},\"timestamp\":1620324238610}");
+    assertEquals(JSON.convert(
+        
"{\"bytes\":\"AAE=\",\"map\":{\"key1\":\"value\",\"key2\":null,\"array\":[-5.4,4,\"2\"]},\"timestamp\":1620324238610}",
+        STRING),
+        
"{\"bytes\":\"AAE=\",\"map\":{\"key1\":\"value\",\"key2\":null,\"array\":[-5.4,4,\"2\"]},\"timestamp\":1620324238610}");
     assertEquals(JSON.convert(new Timestamp(1620324238610l), TIMESTAMP), 
"1620324238610");
   }
 
@@ -129,7 +176,8 @@ public class PinotDataTypeTest {
     assertTrue(OBJECT.toBoolean(new NumberObject("123")));
     assertEquals(OBJECT.toTimestamp(new NumberObject("123")).getTime(), 123L);
     assertEquals(OBJECT.toString(new NumberObject("123")), "123");
-    assertEquals(OBJECT.toJson(getGenericTestObject()), 
"{\"bytes\":\"AAE=\",\"map\":{\"key1\":\"value\",\"key2\":null,\"array\":[-5.4,4,\"2\"]},\"timestamp\":1620324238610}");
+    assertEquals(OBJECT.toJson(getGenericTestObject()),
+        
"{\"bytes\":\"AAE=\",\"map\":{\"key1\":\"value\",\"key2\":null,\"array\":[-5.4,4,\"2\"]},\"timestamp\":1620324238610}");
     assertEquals(OBJECT_ARRAY.getSingleValueType(), OBJECT);
     // Non-zero value is treated as true.
     assertTrue(OBJECT.toBoolean(1.1d));
@@ -181,13 +229,13 @@ public class PinotDataTypeTest {
 
   private static Object getGenericTestObject() {
     Map<String, Object> map1 = new HashMap<>();
-    map1.put("array", Arrays.asList(-5.4,4, "2"));
+    map1.put("array", Arrays.asList(-5.4, 4, "2"));
     map1.put("key1", "value");
     map1.put("key2", null);
 
     Map<String, Object> map2 = new HashMap<>();
     map2.put("map", map1);
-    map2.put("bytes", new byte[]{0,1});
+    map2.put("bytes", new byte[]{0, 1});
     map2.put("timestamp", new Timestamp(1620324238610l));
 
     return map2;
@@ -222,7 +270,6 @@ public class PinotDataTypeTest {
     }
 
     assertInvalidConversion("xyz", STRING, JSON, RuntimeException.class);
-
   }
 
   private void assertInvalidConversion(Object value, PinotDataType sourceType, 
PinotDataType destType,
diff --git 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/DataTypeTransformer.java
 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/DataTypeTransformer.java
index bbc6d7d..fbf18bf 100644
--- 
a/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/DataTypeTransformer.java
+++ 
b/pinot-segment-local/src/main/java/org/apache/pinot/segment/local/recordtransformer/DataTypeTransformer.java
@@ -77,9 +77,17 @@ public class DataTypeTransformer implements 
RecordTransformer {
           // Single-value column
           source = PinotDataType.getSingleValueType(value.getClass());
         }
-        if (source != dest) {
-          value = dest.convert(value, source);
-        }
+        // Skipping conversion when srcType!=destType is speculative, and can 
be unsafe when
+        // the array for MV column contains values of mixing types. Mixing 
types can lead
+        // to ClassCastException during conversion, often aborting data 
ingestion jobs.
+        //
+        // So now, calling convert() unconditionally for safety. Perf impact 
is negligible:
+        // 1. for SV column, when srcType=destType, the conversion is simply 
pass through.
+        // 2. for MV column, when srcType=destType, the conversion is simply 
pass through
+        // if the source type is not Object[] (but sth like Integer[], 
Double[]). For Object[],
+        // the conversion loops through values in the array like before, but 
can catch the
+        // ClassCastException if it happens and continue the conversion now.
+        value = dest.convert(value, source);
         value = dest.toInternal(value);
 
         record.putValue(column, value);
diff --git 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/RecordTransformerTest.java
 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/RecordTransformerTest.java
index 367be7e..e711697 100644
--- 
a/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/RecordTransformerTest.java
+++ 
b/pinot-segment-local/src/test/java/org/apache/pinot/segment/local/recordtransformer/RecordTransformerTest.java
@@ -150,8 +150,10 @@ public class RecordTransformerTest {
       assertEquals(record.getValue("mvDouble"), new Object[]{123d});
       assertEquals(record.getValue("svStringWithNullCharacters"), 
"1\0002\0003");
       assertEquals(record.getValue("svStringWithLengthLimit"), "123");
-      // NOTE: We identify the array type by the first element, so data type 
conversion only applied to 'mvString2'
-      assertEquals(record.getValue("mvString1"), new Object[]{"123", 123, 
123L, 123f, 123.0});
+      // NOTE: We used to speculate the array type by the first element, but 
has changed
+      // to convert type for values in array based on their real value, making 
the logic
+      // safe to handle array of values with mixing types.
+      assertEquals(record.getValue("mvString1"), new Object[]{"123", "123", 
"123", "123.0", "123.0"});
       assertEquals(record.getValue("mvString2"), new Object[]{"123", "123", 
"123.0", "123.0", "123"});
       assertNull(record.getValue("$virtual"));
       assertTrue(record.getNullValueFields().isEmpty());

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@pinot.apache.org
For additional commands, e-mail: commits-h...@pinot.apache.org

Reply via email to