aihuaxu commented on code in PR #12512: URL: https://github.com/apache/iceberg/pull/12512#discussion_r1999346711
########## core/src/test/java/org/apache/iceberg/variants/TestShreddedArray.java: ########## @@ -0,0 +1,190 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.variants; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.List; +import java.util.Random; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.RandomUtil; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +public class TestShreddedArray { + private static final VariantMetadata EMPTY_METADATA = + Variants.metadata(VariantTestUtil.emptyMetadata()); + + private static final List<VariantValue> ELEMENTS = + ImmutableList.of( + Variants.of(34), Variants.of("iceberg"), Variants.of(new BigDecimal("12.21"))); + + private final Random random = new Random(871925); + + @Test + public void testShreddedFields() { + ShreddedArray arr = createShreddedArray(ELEMENTS); + + assertThat(arr.get(0)).isInstanceOf(VariantPrimitive.class); + assertThat(arr.get(0).asPrimitive().get()).isEqualTo(34); + assertThat(arr.get(1)).isInstanceOf(VariantPrimitive.class); + assertThat(arr.get(1).asPrimitive().get()).isEqualTo("iceberg"); + assertThat(arr.get(2)).isInstanceOf(VariantPrimitive.class); + assertThat(arr.get(2).asPrimitive().get()).isEqualTo(new BigDecimal("12.21")); + } + + @Test + public void testSerializationMinimalBuffer() { + ShreddedArray arr = createShreddedArray(ELEMENTS); + + VariantValue value = roundTripMinimalBuffer(arr); + + assertThat(value).isInstanceOf(SerializedArray.class); + SerializedArray actual = (SerializedArray) value; + + assertThat(actual.numElements()).isEqualTo(3); + assertThat(actual.get(0)).isInstanceOf(VariantPrimitive.class); + assertThat(actual.get(0).asPrimitive().get()).isEqualTo(34); + assertThat(actual.get(1)).isInstanceOf(VariantPrimitive.class); + assertThat(actual.get(1).asPrimitive().get()).isEqualTo("iceberg"); + assertThat(actual.get(2)).isInstanceOf(VariantPrimitive.class); + assertThat(actual.get(2).asPrimitive().get()).isEqualTo(new BigDecimal("12.21")); + } + + @Test + public void testSerializationLargeBuffer() { + ShreddedArray arr = createShreddedArray(ELEMENTS); + + VariantValue value = roundTripLargeBuffer(arr); + + assertThat(value).isInstanceOf(SerializedArray.class); + SerializedArray actual = (SerializedArray) value; + + assertThat(actual.numElements()).isEqualTo(3); + assertThat(actual.get(0)).isInstanceOf(VariantPrimitive.class); + assertThat(actual.get(0).asPrimitive().get()).isEqualTo(34); + assertThat(actual.get(1)).isInstanceOf(VariantPrimitive.class); + assertThat(actual.get(1).asPrimitive().get()).isEqualTo("iceberg"); + assertThat(actual.get(2)).isInstanceOf(VariantPrimitive.class); + assertThat(actual.get(2).asPrimitive().get()).isEqualTo(new BigDecimal("12.21")); + } + + @Test + public void testMultiByteElementSize() { + // Create large number of elements to use 4 bytes to store element size + List<VariantValue> elements = Lists.newArrayList(); + for (int i = 0; i < 100_000; i += 1) { + elements.add(Variants.of(RandomUtil.generateString(10, random))); + } + + List<VariantValue> data = Lists.newArrayList(); Review Comment: This is dup to testLargeArray(). I can't remember why I did this now. Seems not necessary. ########## parquet/src/main/java/org/apache/iceberg/parquet/ParquetVariantReaders.java: ########## @@ -332,6 +347,91 @@ public void setPageSource(PageReadStore pageStore) { } } + private static class ShreddedArrayReader implements VariantValueReader { + private final int valueDL; + private final VariantValueReader valueReader; + private final int repeatedDL; + private final int repeatedRL; + private final VariantValueReader elementReader; + private final TripleIterator<?> valueColumn; + private final TripleIterator<?> elementColumn; + private final List<TripleIterator<?>> children; + + private ShreddedArrayReader( + int valueDL, + VariantValueReader valueReader, + int typedDL, + int typedRL, + VariantValueReader elementReader) { + this.valueDL = valueDL; + this.valueReader = valueReader; + this.repeatedDL = typedDL + 1; + this.repeatedRL = typedRL + 1; + this.elementReader = elementReader; + this.elementColumn = this.elementReader.column(); + this.valueColumn = valueReader != null ? valueReader.column() : elementColumn; + this.children = + children(Iterables.concat(Arrays.asList(valueReader), Arrays.asList(elementReader))); + } + + @Override + public VariantValue read(VariantMetadata metadata) { + // if the current definition level is less to the definition level of the repeated + // type, i.e. typed_value is null, then it's not an array + boolean isArray = elementColumn.currentDefinitionLevel() >= repeatedDL; + VariantValue value = ParquetVariantReaders.read(metadata, valueReader, valueDL); + + if (isArray) { + Preconditions.checkArgument( + value == MISSING, "Invalid variant, non-array value: %s", value); Review Comment: You are right. We should check with variant null but the code is updated so it's not applicable. ########## core/src/test/java/org/apache/iceberg/variants/TestShreddedArray.java: ########## @@ -0,0 +1,190 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.variants; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.List; +import java.util.Random; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.RandomUtil; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +public class TestShreddedArray { + private static final VariantMetadata EMPTY_METADATA = + Variants.metadata(VariantTestUtil.emptyMetadata()); + + private static final List<VariantValue> ELEMENTS = + ImmutableList.of( + Variants.of(34), Variants.of("iceberg"), Variants.of(new BigDecimal("12.21"))); + + private final Random random = new Random(871925); + + @Test + public void testShreddedFields() { + ShreddedArray arr = createShreddedArray(ELEMENTS); + + assertThat(arr.get(0)).isInstanceOf(VariantPrimitive.class); + assertThat(arr.get(0).asPrimitive().get()).isEqualTo(34); + assertThat(arr.get(1)).isInstanceOf(VariantPrimitive.class); + assertThat(arr.get(1).asPrimitive().get()).isEqualTo("iceberg"); + assertThat(arr.get(2)).isInstanceOf(VariantPrimitive.class); + assertThat(arr.get(2).asPrimitive().get()).isEqualTo(new BigDecimal("12.21")); + } + + @Test + public void testSerializationMinimalBuffer() { + ShreddedArray arr = createShreddedArray(ELEMENTS); + + VariantValue value = roundTripMinimalBuffer(arr); + + assertThat(value).isInstanceOf(SerializedArray.class); + SerializedArray actual = (SerializedArray) value; + + assertThat(actual.numElements()).isEqualTo(3); + assertThat(actual.get(0)).isInstanceOf(VariantPrimitive.class); + assertThat(actual.get(0).asPrimitive().get()).isEqualTo(34); + assertThat(actual.get(1)).isInstanceOf(VariantPrimitive.class); + assertThat(actual.get(1).asPrimitive().get()).isEqualTo("iceberg"); + assertThat(actual.get(2)).isInstanceOf(VariantPrimitive.class); + assertThat(actual.get(2).asPrimitive().get()).isEqualTo(new BigDecimal("12.21")); + } + + @Test + public void testSerializationLargeBuffer() { + ShreddedArray arr = createShreddedArray(ELEMENTS); + + VariantValue value = roundTripLargeBuffer(arr); + + assertThat(value).isInstanceOf(SerializedArray.class); + SerializedArray actual = (SerializedArray) value; + + assertThat(actual.numElements()).isEqualTo(3); + assertThat(actual.get(0)).isInstanceOf(VariantPrimitive.class); + assertThat(actual.get(0).asPrimitive().get()).isEqualTo(34); + assertThat(actual.get(1)).isInstanceOf(VariantPrimitive.class); + assertThat(actual.get(1).asPrimitive().get()).isEqualTo("iceberg"); + assertThat(actual.get(2)).isInstanceOf(VariantPrimitive.class); + assertThat(actual.get(2).asPrimitive().get()).isEqualTo(new BigDecimal("12.21")); + } + + @Test + public void testMultiByteElementSize() { + // Create large number of elements to use 4 bytes to store element size + List<VariantValue> elements = Lists.newArrayList(); + for (int i = 0; i < 100_000; i += 1) { + elements.add(Variants.of(RandomUtil.generateString(10, random))); + } + + List<VariantValue> data = Lists.newArrayList(); + data.addAll(elements); + + ShreddedArray shredded = createShreddedArray(data); + VariantValue value = roundTripLargeBuffer(shredded); + + assertThat(value.type()).isEqualTo(PhysicalType.ARRAY); + SerializedArray arr = (SerializedArray) value; + assertThat(arr.numElements()).isEqualTo(100_000); + for (int i = 0; i < 100_000; i++) { + VariantTestUtil.assertEqual(arr.get(i), elements.get(i)); + } + } + + @ParameterizedTest + @ValueSource(ints = {300, 70_000, 16_777_300}) + public void testMultiByteOffsets(int len) { + // Use a string exceeding 255 bytes to test value offset sizes of 2, 3, and 4 bytes + String randomString = RandomUtil.generateString(len, random); + SerializedPrimitive bigString = VariantTestUtil.createString(randomString); Review Comment: Yeah. It's not needed. I adopted the one in TestShreddedObject for array test. Let me update there as well. ########## core/src/main/java/org/apache/iceberg/variants/ShreddedArray.java: ########## @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.variants; + +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.List; +import org.apache.iceberg.relocated.com.google.common.base.Preconditions; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; + +public class ShreddedArray implements VariantArray { Review Comment: You are right that the class itself would be it's no meaning for shredded vs. unshredded for arrays and yeah, I had a hard time to figure out a better name. I was also thinking of naming it as `VariantValueArray`. `ValueArray` is a little generic but at least reflects it holds VariantValue. I will go for `ValueArray` for now. ########## parquet/src/main/java/org/apache/iceberg/parquet/ParquetVariantReaders.java: ########## @@ -332,6 +347,91 @@ public void setPageSource(PageReadStore pageStore) { } } + private static class ShreddedArrayReader implements VariantValueReader { + private final int valueDL; + private final VariantValueReader valueReader; + private final int repeatedDL; + private final int repeatedRL; + private final VariantValueReader elementReader; + private final TripleIterator<?> valueColumn; + private final TripleIterator<?> elementColumn; + private final List<TripleIterator<?>> children; + + private ShreddedArrayReader( Review Comment: If I understand correctly, basically we need refactoring into - ArrayReader => read typed_value - ParquetVariantReaders.shredded(value, typed_value) so we don't need to combine value and typed_value in ArrayReader explicitly. With the refactoring, I didn't introduce OptionReader which exists in ParquetValueReader to handle optional typed_value to determine if typed_value is null or not but have the logic in ArrayReader. ########## core/src/test/java/org/apache/iceberg/variants/TestShreddedArray.java: ########## @@ -0,0 +1,190 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.variants; + +import static org.assertj.core.api.Assertions.assertThat; + +import java.math.BigDecimal; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.util.List; +import java.util.Random; +import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.apache.iceberg.util.RandomUtil; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.ValueSource; + +public class TestShreddedArray { + private static final VariantMetadata EMPTY_METADATA = + Variants.metadata(VariantTestUtil.emptyMetadata()); + + private static final List<VariantValue> ELEMENTS = + ImmutableList.of( + Variants.of(34), Variants.of("iceberg"), Variants.of(new BigDecimal("12.21"))); + + private final Random random = new Random(871925); + + @Test + public void testShreddedFields() { Review Comment: You are right. I forgot to rename. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org