Re: [PR] Arrow: add support for null vectors [iceberg]

via GitHub Wed, 16 Oct 2024 00:51:53 -0700


nastra commented on code in PR #10953:
URL: https://github.com/apache/iceberg/pull/10953#discussion_r1802544786



##########
arrow/src/test/java/org/apache/iceberg/arrow/vectorized/ArrowReaderTest.java:
##########
@@ -262,6 +265,143 @@ public void testReadColumnFilter2() throws Exception {
         scan, NUM_ROWS_PER_MONTH, 12 * NUM_ROWS_PER_MONTH, 
ImmutableList.of("timestamp"));
   }
 
+  @Test
+  public void testReadColumnThatDoesNotExistInParquetSchema() throws Exception 
{
+    rowsWritten = Lists.newArrayList();
+    tables = new HadoopTables();
+
+    List<Field> expectedFields =
+        ImmutableList.of(
+            new Field("a", new FieldType(false, MinorType.INT.getType(), 
null), null),
+            new Field("b", new FieldType(true, MinorType.INT.getType(), null), 
null),
+            new Field("z", new FieldType(true, MinorType.NULL.getType(), 
null), null));
+    org.apache.arrow.vector.types.pojo.Schema expectedSchema =
+        new org.apache.arrow.vector.types.pojo.Schema(expectedFields);
+
+    Schema schema =
+        new Schema(
+            Types.NestedField.required(1, "a", Types.IntegerType.get()),
+            Types.NestedField.optional(2, "b", Types.IntegerType.get()));
+
+    PartitionSpec spec = PartitionSpec.builderFor(schema).build();
+    Table table = tables.create(schema, spec, tableLocation);
+
+    // Add one record to the table
+    GenericRecord rec = GenericRecord.create(schema);
+    rec.setField("a", 1);
+    List<GenericRecord> genericRecords = Lists.newArrayList();
+    genericRecords.add(rec);
+
+    AppendFiles appendFiles = table.newAppend();
+    appendFiles.appendFile(writeParquetFile(table, genericRecords));
+    appendFiles.commit();
+
+    // Alter the table schema by adding a new, optional column.
+    // Do not add any data for this new column in the one existing row in the 
table
+    // and do not insert any new rows into the table.
+    UpdateSchema updateSchema = table.updateSchema().addColumn("z", 
Types.IntegerType.get());
+    updateSchema.apply();
+    updateSchema.commit();
+
+    // Select all columns, all rows from the table
+    TableScan scan = table.newScan().select("*");
+
+    int batchSize = 1;
+    int expectedNumRowsPerBatch = 1;
+
+    Set<String> columns = ImmutableSet.of("a", "b", "z");
+    // Read the data and verify that the returned ColumnarBatches match 
expected rows.
+    try (VectorizedTableScanIterable itr =
+        new VectorizedTableScanIterable(scan, batchSize, false)) {
+      int rowIndex = 0;
+      for (ColumnarBatch batch : itr) {
+        List<GenericRecord> expectedRows =
+            rowsWritten.subList(rowIndex, rowIndex + expectedNumRowsPerBatch);
+        rowIndex++;
+
+        assertThat(batch.numRows()).isEqualTo(expectedNumRowsPerBatch);
+        assertThat(batch.numCols()).isEqualTo(columns.size());
+
+        checkColumnarArrayValues(
+            expectedNumRowsPerBatch,
+            expectedRows,
+            batch,
+            0,
+            columns,
+            "a",
+            (records, i) -> records.get(i).getField("a"),
+            ColumnVector::getInt);
+        checkColumnarArrayValues(
+            expectedNumRowsPerBatch,
+            expectedRows,
+            batch,
+            1,
+            columns,
+            "b",
+            (records, i) -> records.get(i).getField("b"),
+            (columnVector, i) -> columnVector.isNullAt(i) ? null : 
columnVector.getInt(i));
+        checkColumnarArrayValues(
+            expectedNumRowsPerBatch,
+            expectedRows,
+            batch,
+            2,
+            columns,
+            "z",
+            (records, i) -> records.get(i).getField("z"),
+            (columnVector, i) -> columnVector.isNullAt(i) ? null : 
columnVector.getInt(i));
+      }
+    }
+
+    int expectedTotalRows = 1;
+
+    // Read the data and verify that the returned Arrow VectorSchemaRoots 
match expected rows.
+    try (VectorizedTableScanIterable itr =
+        new VectorizedTableScanIterable(scan, batchSize, false)) {
+      int totalRows = 0;
+      int rowIndex = 0;
+      for (ColumnarBatch batch : itr) {
+        List<GenericRecord> expectedRows =
+            rowsWritten.subList(rowIndex, rowIndex + expectedNumRowsPerBatch);
+        rowIndex++;
+        VectorSchemaRoot root = batch.createVectorSchemaRootFromVectors();
+        assertThat(root.getSchema()).isEqualTo(expectedSchema);
+
+        // check all vector types
+        assertThat(root.getVector("a").getClass()).isEqualTo(IntVector.class);

Review Comment:
   ```suggestion
           assertThat(root.getVector("a")).isInstanceOf(IntVector.class);
   ```
   Please also apply this change (`isInstanceOf(...)` instead of comparing `getClass()` with `isEqualTo(...)`) to all other places in this test class.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Re: [PR] Arrow: add support for null vectors [iceberg]

Reply via email to