Re: [PR] Closing the ParquetWriter in the correct spot [iceberg]

via GitHub Wed, 16 Jul 2025 06:32:20 -0700


nastra commented on code in PR #13565:
URL: https://github.com/apache/iceberg/pull/13565#discussion_r2210449156



##########
data/src/test/java/org/apache/iceberg/parquet/TestParquetWriter.java:
##########
@@ -0,0 +1,309 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.parquet;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.IOException;
+import java.io.RandomAccessFile;
+import java.io.UncheckedIOException;
+import java.nio.file.Paths;
+import java.util.Collections;
+import java.util.List;
+import org.apache.iceberg.FileFormat;
+import org.apache.iceberg.Files;
+import org.apache.iceberg.PartitionSpec;
+import org.apache.iceberg.Schema;
+import org.apache.iceberg.Table;
+import org.apache.iceberg.TestTables;
+import org.apache.iceberg.avro.Avro;
+import org.apache.iceberg.data.BaseFileWriterFactory;
+import org.apache.iceberg.data.GenericRecord;
+import org.apache.iceberg.data.Record;
+import org.apache.iceberg.data.parquet.GenericParquetWriter;
+import org.apache.iceberg.exceptions.AlreadyExistsException;
+import org.apache.iceberg.exceptions.RuntimeIOException;
+import org.apache.iceberg.expressions.Expressions;
+import org.apache.iceberg.io.FanoutDataWriter;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.io.FileWriterFactory;
+import org.apache.iceberg.io.InputFile;
+import org.apache.iceberg.io.OutputFile;
+import org.apache.iceberg.io.OutputFileFactory;
+import org.apache.iceberg.io.PositionOutputStream;
+import org.apache.iceberg.io.WriterTestBase;
+import org.apache.iceberg.orc.ORC;
+import org.apache.iceberg.relocated.com.google.common.collect.Lists;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class TestParquetWriter extends WriterTestBase<Record> {
+
+  private static final long TARGET_FILE_SIZE = 128L * 1024 * 1024;
+  private static final Logger LOG = LoggerFactory.getLogger(TestParquetWriter.class);
+
+  private final FileFormat fileFormat = FileFormat.PARQUET;
+
+  private OutputFileFactory fileFactory = null;
+  private static volatile boolean fail = true;
+  private static final List<File> PARQUET_FILE_LIST =
+      Collections.synchronizedList(Lists.newArrayList());
+
+  @BeforeEach
+  void before() {
+    // A lot is already done in TestBase.
+    this.metadataDir = new File(tableDir, "metadata");
+    this.fileFactory =
+        OutputFileFactory.builderFor(table, 1, 1).format(fileFormat).ioSupplier(() -> IO).build();
+  }
+
+  @Test
+  void testParquetWriterWithFailingIO() throws IOException {
+    table.updateSpec().addField(Expressions.ref("data")).commit();
+
+    FileWriterFactory<Record> writerFactory = newWriterFactory(table.schema());
+    FanoutDataWriter<Record> writer =
+        new FanoutDataWriter<>(writerFactory, fileFactory, IO, TARGET_FILE_SIZE);
+
+    PartitionSpec spec = table.spec();
+
+    writer.write(toRow(1, "aaa"), spec, partitionKey(spec, "aaa"));
+    writer.write(toRow(3, "bbb"), spec, partitionKey(spec, "bbb"));
+    writer.write(toRow(2, "aaa"), spec, partitionKey(spec, "aaa"));
+    writer.write(toRow(4, "bbb"), spec, partitionKey(spec, "bbb"));
+    writer.write(toRow(5, "ccc"), spec, partitionKey(spec, "ccc"));
+
+    try {
+      writer.close();
+    } catch (IOException | UncheckedIOException e) {
+      LOG.warn("Error closing writer", e);
+    }
+
+    // The data of the first parquet-file was first written into a byte-buffer and on close should
+    // have been written to file.
+    // But that failed, so the file should not exist:
+    assertThat(PARQUET_FILE_LIST.get(0)).doesNotExist();
+
+    // Simulate that the network is up again:
+    fail = false;
+    // Try again:
+    LOG.info("Trying to close {} again.", writer);
+    try {
+      writer.close();
+    } catch (UncheckedIOException e) {
+      // This error comes from the underlying ParquetFileWriter, which seems to have a similar
+      // problem with setting the internal state too early.
+      if (!"Failed to flush row group".equals(e.getMessage())) {

Review Comment:
   Same as mentioned above: either we expect an exception to happen here or we
   don't. We generally want to avoid try-catch blocks in tests, because they
   make it unclear what exactly is happening and why.
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Re: [PR] Closing the ParquetWriter in the correct spot [iceberg]

Reply via email to