aihuaxu commented on code in PR #12512:
URL: https://github.com/apache/iceberg/pull/12512#discussion_r1999346711


##########
core/src/test/java/org/apache/iceberg/variants/TestShreddedArray.java:
##########
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.variants;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.math.BigDecimal;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.List;
+import java.util.Random;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.util.RandomUtil;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
+
+public class TestShreddedArray {
+  private static final VariantMetadata EMPTY_METADATA =
+      Variants.metadata(VariantTestUtil.emptyMetadata());
+
+  private static final List<VariantValue> ELEMENTS =
+      ImmutableList.of(
+          Variants.of(34), Variants.of("iceberg"), Variants.of(new 
BigDecimal("12.21")));
+
+  private final Random random = new Random(871925);
+
+  @Test
+  public void testShreddedFields() {
+    ShreddedArray arr = createShreddedArray(ELEMENTS);
+
+    assertThat(arr.get(0)).isInstanceOf(VariantPrimitive.class);
+    assertThat(arr.get(0).asPrimitive().get()).isEqualTo(34);
+    assertThat(arr.get(1)).isInstanceOf(VariantPrimitive.class);
+    assertThat(arr.get(1).asPrimitive().get()).isEqualTo("iceberg");
+    assertThat(arr.get(2)).isInstanceOf(VariantPrimitive.class);
+    assertThat(arr.get(2).asPrimitive().get()).isEqualTo(new 
BigDecimal("12.21"));
+  }
+
+  @Test
+  public void testSerializationMinimalBuffer() {
+    ShreddedArray arr = createShreddedArray(ELEMENTS);
+
+    VariantValue value = roundTripMinimalBuffer(arr);
+
+    assertThat(value).isInstanceOf(SerializedArray.class);
+    SerializedArray actual = (SerializedArray) value;
+
+    assertThat(actual.numElements()).isEqualTo(3);
+    assertThat(actual.get(0)).isInstanceOf(VariantPrimitive.class);
+    assertThat(actual.get(0).asPrimitive().get()).isEqualTo(34);
+    assertThat(actual.get(1)).isInstanceOf(VariantPrimitive.class);
+    assertThat(actual.get(1).asPrimitive().get()).isEqualTo("iceberg");
+    assertThat(actual.get(2)).isInstanceOf(VariantPrimitive.class);
+    assertThat(actual.get(2).asPrimitive().get()).isEqualTo(new 
BigDecimal("12.21"));
+  }
+
+  @Test
+  public void testSerializationLargeBuffer() {
+    ShreddedArray arr = createShreddedArray(ELEMENTS);
+
+    VariantValue value = roundTripLargeBuffer(arr);
+
+    assertThat(value).isInstanceOf(SerializedArray.class);
+    SerializedArray actual = (SerializedArray) value;
+
+    assertThat(actual.numElements()).isEqualTo(3);
+    assertThat(actual.get(0)).isInstanceOf(VariantPrimitive.class);
+    assertThat(actual.get(0).asPrimitive().get()).isEqualTo(34);
+    assertThat(actual.get(1)).isInstanceOf(VariantPrimitive.class);
+    assertThat(actual.get(1).asPrimitive().get()).isEqualTo("iceberg");
+    assertThat(actual.get(2)).isInstanceOf(VariantPrimitive.class);
+    assertThat(actual.get(2).asPrimitive().get()).isEqualTo(new 
BigDecimal("12.21"));
+  }
+
+  @Test
+  public void testMultiByteElementSize() {
+    // Create large number of elements to use 4 bytes to store element size
+    List<VariantValue> elements = Lists.newArrayList();
+    for (int i = 0; i < 100_000; i += 1) {
+      elements.add(Variants.of(RandomUtil.generateString(10, random)));
+    }
+
+    List<VariantValue> data = Lists.newArrayList();

Review Comment:
   This is dup to testLargeArray(). I can't remember why I did this now. Seems 
not necessary.



##########
parquet/src/main/java/org/apache/iceberg/parquet/ParquetVariantReaders.java:
##########
@@ -332,6 +347,91 @@ public void setPageSource(PageReadStore pageStore) {
     }
   }
 
+  private static class ShreddedArrayReader implements VariantValueReader {
+    private final int valueDL;
+    private final VariantValueReader valueReader;
+    private final int repeatedDL;
+    private final int repeatedRL;
+    private final VariantValueReader elementReader;
+    private final TripleIterator<?> valueColumn;
+    private final TripleIterator<?> elementColumn;
+    private final List<TripleIterator<?>> children;
+
+    private ShreddedArrayReader(
+        int valueDL,
+        VariantValueReader valueReader,
+        int typedDL,
+        int typedRL,
+        VariantValueReader elementReader) {
+      this.valueDL = valueDL;
+      this.valueReader = valueReader;
+      this.repeatedDL = typedDL + 1;
+      this.repeatedRL = typedRL + 1;
+      this.elementReader = elementReader;
+      this.elementColumn = this.elementReader.column();
+      this.valueColumn = valueReader != null ? valueReader.column() : 
elementColumn;
+      this.children =
+          children(Iterables.concat(Arrays.asList(valueReader), 
Arrays.asList(elementReader)));
+    }
+
+    @Override
+    public VariantValue read(VariantMetadata metadata) {
+      // if the current definition level is less to the definition level of 
the repeated
+      // type, i.e. typed_value is null, then it's not an array
+      boolean isArray = elementColumn.currentDefinitionLevel() >= repeatedDL;
+      VariantValue value = ParquetVariantReaders.read(metadata, valueReader, 
valueDL);
+
+      if (isArray) {
+        Preconditions.checkArgument(
+            value == MISSING, "Invalid variant, non-array value: %s", value);

Review Comment:
   You are right. We should check with variant null but the code is updated so 
it's not applicable. 



##########
core/src/test/java/org/apache/iceberg/variants/TestShreddedArray.java:
##########
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.variants;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.math.BigDecimal;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.List;
+import java.util.Random;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.util.RandomUtil;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
+
+public class TestShreddedArray {
+  private static final VariantMetadata EMPTY_METADATA =
+      Variants.metadata(VariantTestUtil.emptyMetadata());
+
+  private static final List<VariantValue> ELEMENTS =
+      ImmutableList.of(
+          Variants.of(34), Variants.of("iceberg"), Variants.of(new 
BigDecimal("12.21")));
+
+  private final Random random = new Random(871925);
+
+  @Test
+  public void testShreddedFields() {
+    ShreddedArray arr = createShreddedArray(ELEMENTS);
+
+    assertThat(arr.get(0)).isInstanceOf(VariantPrimitive.class);
+    assertThat(arr.get(0).asPrimitive().get()).isEqualTo(34);
+    assertThat(arr.get(1)).isInstanceOf(VariantPrimitive.class);
+    assertThat(arr.get(1).asPrimitive().get()).isEqualTo("iceberg");
+    assertThat(arr.get(2)).isInstanceOf(VariantPrimitive.class);
+    assertThat(arr.get(2).asPrimitive().get()).isEqualTo(new 
BigDecimal("12.21"));
+  }
+
+  @Test
+  public void testSerializationMinimalBuffer() {
+    ShreddedArray arr = createShreddedArray(ELEMENTS);
+
+    VariantValue value = roundTripMinimalBuffer(arr);
+
+    assertThat(value).isInstanceOf(SerializedArray.class);
+    SerializedArray actual = (SerializedArray) value;
+
+    assertThat(actual.numElements()).isEqualTo(3);
+    assertThat(actual.get(0)).isInstanceOf(VariantPrimitive.class);
+    assertThat(actual.get(0).asPrimitive().get()).isEqualTo(34);
+    assertThat(actual.get(1)).isInstanceOf(VariantPrimitive.class);
+    assertThat(actual.get(1).asPrimitive().get()).isEqualTo("iceberg");
+    assertThat(actual.get(2)).isInstanceOf(VariantPrimitive.class);
+    assertThat(actual.get(2).asPrimitive().get()).isEqualTo(new 
BigDecimal("12.21"));
+  }
+
+  @Test
+  public void testSerializationLargeBuffer() {
+    ShreddedArray arr = createShreddedArray(ELEMENTS);
+
+    VariantValue value = roundTripLargeBuffer(arr);
+
+    assertThat(value).isInstanceOf(SerializedArray.class);
+    SerializedArray actual = (SerializedArray) value;
+
+    assertThat(actual.numElements()).isEqualTo(3);
+    assertThat(actual.get(0)).isInstanceOf(VariantPrimitive.class);
+    assertThat(actual.get(0).asPrimitive().get()).isEqualTo(34);
+    assertThat(actual.get(1)).isInstanceOf(VariantPrimitive.class);
+    assertThat(actual.get(1).asPrimitive().get()).isEqualTo("iceberg");
+    assertThat(actual.get(2)).isInstanceOf(VariantPrimitive.class);
+    assertThat(actual.get(2).asPrimitive().get()).isEqualTo(new 
BigDecimal("12.21"));
+  }
+
+  @Test
+  public void testMultiByteElementSize() {
+    // Create large number of elements to use 4 bytes to store element size
+    List<VariantValue> elements = Lists.newArrayList();
+    for (int i = 0; i < 100_000; i += 1) {
+      elements.add(Variants.of(RandomUtil.generateString(10, random)));
+    }
+
+    List<VariantValue> data = Lists.newArrayList();
+    data.addAll(elements);
+
+    ShreddedArray shredded = createShreddedArray(data);
+    VariantValue value = roundTripLargeBuffer(shredded);
+
+    assertThat(value.type()).isEqualTo(PhysicalType.ARRAY);
+    SerializedArray arr = (SerializedArray) value;
+    assertThat(arr.numElements()).isEqualTo(100_000);
+    for (int i = 0; i < 100_000; i++) {
+      VariantTestUtil.assertEqual(arr.get(i), elements.get(i));
+    }
+  }
+
+  @ParameterizedTest
+  @ValueSource(ints = {300, 70_000, 16_777_300})
+  public void testMultiByteOffsets(int len) {
+    // Use a string exceeding 255 bytes to test value offset sizes of 2, 3, 
and 4 bytes
+    String randomString = RandomUtil.generateString(len, random);
+    SerializedPrimitive bigString = VariantTestUtil.createString(randomString);

Review Comment:
   Yeah. It's not needed. I adopted the one in TestShreddedObject for array 
test. Let me update there as well.



##########
core/src/main/java/org/apache/iceberg/variants/ShreddedArray.java:
##########
@@ -0,0 +1,130 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.variants;
+
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.List;
+import org.apache.iceberg.relocated.com.google.common.base.Preconditions;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+
+public class ShreddedArray implements VariantArray {

Review Comment:
   You are right that the class itself would be it's no meaning for shredded 
vs. unshredded for arrays and yeah, I had a hard time to figure out a better 
name. I was also thinking of naming it as `VariantValueArray`. `ValueArray` is 
a little generic but at least reflects it holds VariantValue. I will go for  
`ValueArray` for now.



##########
parquet/src/main/java/org/apache/iceberg/parquet/ParquetVariantReaders.java:
##########
@@ -332,6 +347,91 @@ public void setPageSource(PageReadStore pageStore) {
     }
   }
 
+  private static class ShreddedArrayReader implements VariantValueReader {
+    private final int valueDL;
+    private final VariantValueReader valueReader;
+    private final int repeatedDL;
+    private final int repeatedRL;
+    private final VariantValueReader elementReader;
+    private final TripleIterator<?> valueColumn;
+    private final TripleIterator<?> elementColumn;
+    private final List<TripleIterator<?>> children;
+
+    private ShreddedArrayReader(

Review Comment:
   If I understand correctly, basically we need refactoring into 
   
   - ArrayReader => read typed_value
   - ParquetVariantReaders.shredded(value, typed_value) 
   
   so we don't need to combine value and typed_value in ArrayReader explicitly.
   
   With the refactoring, I didn't introduce OptionReader which exists in 
ParquetValueReader to handle optional typed_value to determine if typed_value 
is null or not but have the logic in ArrayReader. 
   
   



##########
core/src/test/java/org/apache/iceberg/variants/TestShreddedArray.java:
##########
@@ -0,0 +1,190 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.variants;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.math.BigDecimal;
+import java.nio.ByteBuffer;
+import java.nio.ByteOrder;
+import java.util.List;
+import java.util.Random;
+import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.apache.iceberg.util.RandomUtil;
+import org.junit.jupiter.api.Test;
+import org.junit.jupiter.params.ParameterizedTest;
+import org.junit.jupiter.params.provider.ValueSource;
+
+public class TestShreddedArray {
+  private static final VariantMetadata EMPTY_METADATA =
+      Variants.metadata(VariantTestUtil.emptyMetadata());
+
+  private static final List<VariantValue> ELEMENTS =
+      ImmutableList.of(
+          Variants.of(34), Variants.of("iceberg"), Variants.of(new 
BigDecimal("12.21")));
+
+  private final Random random = new Random(871925);
+
+  @Test
+  public void testShreddedFields() {

Review Comment:
   You are right. I forgot to rename. 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to