amogh-jahagirdar commented on code in PR #10953: URL: https://github.com/apache/iceberg/pull/10953#discussion_r1792822991
########## arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java: ########## @@ -262,6 +265,142 @@ public void testReadColumnFilter2() throws Exception { scan, NUM_ROWS_PER_MONTH, 12 * NUM_ROWS_PER_MONTH, ImmutableList.of("timestamp")); } + @Test + public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { + rowsWritten = Lists.newArrayList(); + tables = new HadoopTables(); + + List<Field> expectedFields = + ImmutableList.of( + new Field("a", new FieldType(false, MinorType.INT.getType(), null), null), + new Field("b", new FieldType(true, MinorType.INT.getType(), null), null), + new Field("z", new FieldType(true, MinorType.NULL.getType(), null), null)); + org.apache.arrow.vector.types.pojo.Schema expectedSchema = + new org.apache.arrow.vector.types.pojo.Schema(expectedFields); + + int batchSize = 1; + int expectedNumRowsPerBatch = 1; + int expectedTotalRows = 1; + + Schema tableSchemaV1 = + new Schema( + Types.NestedField.required(1, "a", Types.IntegerType.get()), + Types.NestedField.optional(2, "b", Types.IntegerType.get())); + + PartitionSpec spec = PartitionSpec.builderFor(tableSchemaV1).build(); + Table table = tables.create(tableSchemaV1, spec, tableLocation); + + // Add one record to the table + GenericRecord rec = GenericRecord.create(tableSchemaV1); + rec.setField("a", 1); + List<GenericRecord> genericRecords = Lists.newArrayList(); + genericRecords.add(rec); + + AppendFiles appendFiles = table.newAppend(); + appendFiles.appendFile(writeParquetFile(table, genericRecords)); + appendFiles.commit(); + + // Alter the table schema by adding a new, optional column. + // Do not add any data for this new column in the one existing row in the table + // and do not insert any new rows into the table. Review Comment: I might be confusing vectorized read paths but I'm curious why this isn't reproducible in Spark vectorized reads? Or is it? 
########## arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java: ########## @@ -262,6 +265,142 @@ public void testReadColumnFilter2() throws Exception { scan, NUM_ROWS_PER_MONTH, 12 * NUM_ROWS_PER_MONTH, ImmutableList.of("timestamp")); } + @Test + public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { + rowsWritten = Lists.newArrayList(); + tables = new HadoopTables(); + + List<Field> expectedFields = + ImmutableList.of( + new Field("a", new FieldType(false, MinorType.INT.getType(), null), null), + new Field("b", new FieldType(true, MinorType.INT.getType(), null), null), + new Field("z", new FieldType(true, MinorType.NULL.getType(), null), null)); + org.apache.arrow.vector.types.pojo.Schema expectedSchema = + new org.apache.arrow.vector.types.pojo.Schema(expectedFields); + + int batchSize = 1; + int expectedNumRowsPerBatch = 1; + int expectedTotalRows = 1; + + Schema tableSchemaV1 = + new Schema( + Types.NestedField.required(1, "a", Types.IntegerType.get()), + Types.NestedField.optional(2, "b", Types.IntegerType.get())); + + PartitionSpec spec = PartitionSpec.builderFor(tableSchemaV1).build(); + Table table = tables.create(tableSchemaV1, spec, tableLocation); + + // Add one record to the table + GenericRecord rec = GenericRecord.create(tableSchemaV1); + rec.setField("a", 1); + List<GenericRecord> genericRecords = Lists.newArrayList(); + genericRecords.add(rec); + + AppendFiles appendFiles = table.newAppend(); + appendFiles.appendFile(writeParquetFile(table, genericRecords)); + appendFiles.commit(); + + // Alter the table schema by adding a new, optional column. + // Do not add any data for this new column in the one existing row in the table + // and do not insert any new rows into the table. Review Comment: I can try and repro in a unit test for Spark and see if it's the case. To be clear I don't want to hold up this PR on that though since it does seem like a legitimate problem based on the test being done here. 
########## arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java: ########## @@ -262,6 +265,142 @@ public void testReadColumnFilter2() throws Exception { scan, NUM_ROWS_PER_MONTH, 12 * NUM_ROWS_PER_MONTH, ImmutableList.of("timestamp")); } + @Test + public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { + rowsWritten = Lists.newArrayList(); + tables = new HadoopTables(); + + List<Field> expectedFields = + ImmutableList.of( + new Field("a", new FieldType(false, MinorType.INT.getType(), null), null), + new Field("b", new FieldType(true, MinorType.INT.getType(), null), null), + new Field("z", new FieldType(true, MinorType.NULL.getType(), null), null)); + org.apache.arrow.vector.types.pojo.Schema expectedSchema = + new org.apache.arrow.vector.types.pojo.Schema(expectedFields); + + int batchSize = 1; + int expectedNumRowsPerBatch = 1; + int expectedTotalRows = 1; + + Schema tableSchemaV1 = Review Comment: Nit: Can just call this schema? updatedSchema below distinguishes that it was updated. ########## arrow/src/main/java/org/apache/iceberg/arrow/vectorized/VectorHolder.java: ########## @@ -140,12 +141,20 @@ public static class ConstantVectorHolder<T> extends VectorHolder { private final int numRows; public ConstantVectorHolder(int numRows) { + super(new NullVector("_dummy_", numRows), null, new NullabilityHolder(numRows)); + nullabilityHolder().setNulls(0, numRows); this.numRows = numRows; this.constantValue = null; } public ConstantVectorHolder(Types.NestedField icebergField, int numRows, T constantValue) { - super(icebergField); + super( + (null == constantValue) ? 
new NullVector(icebergField.name(), numRows) : null, + icebergField, + new NullabilityHolder(numRows)); + if (null == constantValue) { + nullabilityHolder().setNulls(0, numRows); + } Review Comment: Nit: Can we add a newline after this if statement ########## arrow/src/test/java/org/apache/iceberg/arrow/vectorized/VectorHolderTest.java: ########## @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ +package org.apache.iceberg.arrow.vectorized; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.when; + +import org.apache.arrow.vector.FieldVector; +import org.apache.iceberg.types.Types; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.Dictionary; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +class VectorHolderTest { + @Mock ColumnDescriptor columnDescriptor; + @Mock FieldVector vector; + @Mock Dictionary dictionary; + @Mock NullabilityHolder nullabilityHolder; + @Mock Types.NestedField icebergField; + + VectorHolder vectorHolder; + + @BeforeEach + void before() { + MockitoAnnotations.initMocks(this); + vectorHolder = + new VectorHolder( + columnDescriptor, vector, false, dictionary, nullabilityHolder, icebergField); + } + + @Test + void testDescriptor() { + assertThat(vectorHolder.descriptor()).isSameAs(columnDescriptor); + } + + @Test + void testVector() { + assertThat(vectorHolder.vector()).isSameAs(vector); + } + + @Test + void testDictionary() { + assertThat(vectorHolder.dictionary()).isSameAs(dictionary); + } Review Comment: Nit: Some of these tests which are just testing equality of fields don't seem useful. But not opposed if we want to keep them ########## arrow/src/test/java/org/apache/iceberg/arrow/vectorized/VectorHolderTest.java: ########## @@ -0,0 +1,117 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.arrow.vectorized; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.mockito.Mockito.when; + +import org.apache.arrow.vector.FieldVector; +import org.apache.iceberg.types.Types; +import org.apache.parquet.column.ColumnDescriptor; +import org.apache.parquet.column.Dictionary; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mock; +import org.mockito.MockitoAnnotations; + +class VectorHolderTest { Review Comment: Thanks for adding these tests! ########## arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java: ########## @@ -262,6 +265,142 @@ public void testReadColumnFilter2() throws Exception { scan, NUM_ROWS_PER_MONTH, 12 * NUM_ROWS_PER_MONTH, ImmutableList.of("timestamp")); } + @Test + public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception { + rowsWritten = Lists.newArrayList(); + tables = new HadoopTables(); + + List<Field> expectedFields = + ImmutableList.of( + new Field("a", new FieldType(false, MinorType.INT.getType(), null), null), + new Field("b", new FieldType(true, MinorType.INT.getType(), null), null), + new Field("z", new FieldType(true, MinorType.NULL.getType(), null), null)); + org.apache.arrow.vector.types.pojo.Schema expectedSchema = + new org.apache.arrow.vector.types.pojo.Schema(expectedFields); + + int batchSize = 1; + int expectedNumRowsPerBatch = 1; + int expectedTotalRows = 1; Review Comment: Could we move these variables closer to where they're actually read? 
It was a bit hard to follow how these were used -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org For additional commands, e-mail: issues-help@iceberg.apache.org