wombatu-kun commented on code in PR #18403:
URL: https://github.com/apache/hudi/pull/18403#discussion_r3036842094


##########
hudi-client/hudi-spark-client/src/main/java/org/apache/hudi/io/storage/LanceBatchIterator.java:
##########
@@ -0,0 +1,167 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hudi.io.storage;
+
+import org.apache.hudi.exception.HoodieException;
+import org.apache.hudi.exception.HoodieIOException;
+
+import org.apache.arrow.memory.BufferAllocator;
+import org.apache.arrow.vector.VectorSchemaRoot;
+import org.apache.arrow.vector.ipc.ArrowReader;
+import org.apache.spark.sql.vectorized.ColumnVector;
+import org.apache.spark.sql.vectorized.ColumnarBatch;
+import org.apache.spark.sql.vectorized.LanceArrowColumnVector;
+import org.lance.file.LanceFileReader;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+/**
+ * Iterator that returns {@link ColumnarBatch} directly from Lance files 
without
+ * decomposing to individual rows. Used for vectorized/columnar batch reading
+ * in Spark's COW base-file-only read path.
+ *
+ * <p>Unlike {@link LanceRecordIterator} which extracts rows one by one,
+ * this iterator preserves the columnar format for zero-copy batch processing.
+ *
+ * <p>Manages the lifecycle of:
+ * <ul>
+ *   <li>BufferAllocator - Arrow memory management</li>
+ *   <li>LanceFileReader - Lance file handle</li>
+ *   <li>ArrowReader - Arrow batch reader</li>
+ * </ul>
+ */
+public class LanceBatchIterator implements Iterator<ColumnarBatch>, Closeable {
+  private final BufferAllocator allocator;
+  private final LanceFileReader lanceReader;
+  private final ArrowReader arrowReader;
+  private final String path;
+
+  private ColumnVector[] columnVectors;
+  private ColumnarBatch currentBatch;
+  private boolean nextBatchLoaded = false;
+  private boolean finished = false;
+  private boolean closed = false;
+
+  /**
+   * Creates a new Lance batch iterator.
+   *
+   * @param allocator   Arrow buffer allocator for memory management
+   * @param lanceReader Lance file reader
+   * @param arrowReader Arrow reader for batch reading
+   * @param path        File path (for error messages)
+   */
+  public LanceBatchIterator(BufferAllocator allocator,
+                            LanceFileReader lanceReader,
+                            ArrowReader arrowReader,
+                            String path) {
+    this.allocator = allocator;
+    this.lanceReader = lanceReader;
+    this.arrowReader = arrowReader;
+    this.path = path;
+  }
+
+  @Override
+  public boolean hasNext() {
+    if (finished) {
+      return false;
+    }
+    if (nextBatchLoaded) {
+      return true;
+    }
+
+    try {
+      if (arrowReader.loadNextBatch()) {
+        VectorSchemaRoot root = arrowReader.getVectorSchemaRoot();
+
+        // Create column vector wrappers once and reuse across batches
+        // (ArrowReader reuses the same VectorSchemaRoot)
+        if (columnVectors == null) {
+          columnVectors = root.getFieldVectors().stream()
+              .map(LanceArrowColumnVector::new)
+              .toArray(ColumnVector[]::new);
+        }
+
+        currentBatch = new ColumnarBatch(columnVectors, root.getRowCount());
+        nextBatchLoaded = true;
+        return true;
+      }
+    } catch (IOException e) {
+      throw new HoodieException("Failed to read next batch from Lance file: " 
+ path, e);
+    }
+
+    finished = true;
+    return false;
+  }
+
+  @Override
+  public ColumnarBatch next() {
+    if (!hasNext()) {
+      throw new NoSuchElementException("No more batches available");
+    }
+    nextBatchLoaded = false;
+    return currentBatch;
+  }
+
+  @Override
+  public void close() {
+    if (closed) {
+      return;
+    }
+    closed = true;
+
+    IOException arrowException = null;
+    Exception lanceException = null;
+
+    // Don't close currentBatch here: ColumnarBatch.close() would close the
+    // underlying Arrow FieldVectors through LanceArrowColumnVector, but they
+    // are owned by the ArrowReader (via VectorSchemaRoot) and will be closed
+    // when arrowReader.close() is called below.
+    currentBatch = null;
+
+    if (arrowReader != null) {
+      try {
+        arrowReader.close();
+      } catch (IOException e) {
+        arrowException = e;
+      }
+    }
+
+    if (lanceReader != null) {
+      try {
+        lanceReader.close();
+      } catch (Exception e) {
+        lanceException = e;
+      }
+    }
+

Review Comment:
   Fixed — `allocator.close()` is now wrapped in try-catch. If it throws, prior 
exceptions from `arrowReader`/`lanceReader` are attached as suppressed. The 
propagation order is: allocator exception > arrow exception > lance exception, 
with earlier ones added as suppressed to the one that's thrown.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]

Reply via email to