nastra commented on code in PR #13565: URL: https://github.com/apache/iceberg/pull/13565#discussion_r2210449156
########## data/src/test/java/org/apache/iceberg/parquet/TestParquetWriter.java: ########## @@ -0,0 +1,309 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +package org.apache.iceberg.parquet; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatThrownBy; + +import java.io.ByteArrayOutputStream; +import java.io.File; +import java.io.IOException; +import java.io.RandomAccessFile; +import java.io.UncheckedIOException; +import java.nio.file.Paths; +import java.util.Collections; +import java.util.List; +import org.apache.iceberg.FileFormat; +import org.apache.iceberg.Files; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Table; +import org.apache.iceberg.TestTables; +import org.apache.iceberg.avro.Avro; +import org.apache.iceberg.data.BaseFileWriterFactory; +import org.apache.iceberg.data.GenericRecord; +import org.apache.iceberg.data.Record; +import org.apache.iceberg.data.parquet.GenericParquetWriter; +import org.apache.iceberg.exceptions.AlreadyExistsException; +import org.apache.iceberg.exceptions.RuntimeIOException; +import org.apache.iceberg.expressions.Expressions; +import 
org.apache.iceberg.io.FanoutDataWriter; +import org.apache.iceberg.io.FileIO; +import org.apache.iceberg.io.FileWriterFactory; +import org.apache.iceberg.io.InputFile; +import org.apache.iceberg.io.OutputFile; +import org.apache.iceberg.io.OutputFileFactory; +import org.apache.iceberg.io.PositionOutputStream; +import org.apache.iceberg.io.WriterTestBase; +import org.apache.iceberg.orc.ORC; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class TestParquetWriter extends WriterTestBase<Record> { + + private static final long TARGET_FILE_SIZE = 128L * 1024 * 1024; + private static final Logger LOG = LoggerFactory.getLogger(TestParquetWriter.class); + + private final FileFormat fileFormat = FileFormat.PARQUET; + + private OutputFileFactory fileFactory = null; + private static volatile boolean fail = true; + private static final List<File> PARQUET_FILE_LIST = + Collections.synchronizedList(Lists.newArrayList()); + + @BeforeEach + void before() { + // A lot is already done in TestBase. 
+ this.metadataDir = new File(tableDir, "metadata"); + this.fileFactory = + OutputFileFactory.builderFor(table, 1, 1).format(fileFormat).ioSupplier(() -> IO).build(); + } + + @Test + void testParquetWriterWithFailingIO() throws IOException { + table.updateSpec().addField(Expressions.ref("data")).commit(); + + FileWriterFactory<Record> writerFactory = newWriterFactory(table.schema()); + FanoutDataWriter<Record> writer = + new FanoutDataWriter<>(writerFactory, fileFactory, IO, TARGET_FILE_SIZE); + + PartitionSpec spec = table.spec(); + + writer.write(toRow(1, "aaa"), spec, partitionKey(spec, "aaa")); + writer.write(toRow(3, "bbb"), spec, partitionKey(spec, "bbb")); + writer.write(toRow(2, "aaa"), spec, partitionKey(spec, "aaa")); + writer.write(toRow(4, "bbb"), spec, partitionKey(spec, "bbb")); + writer.write(toRow(5, "ccc"), spec, partitionKey(spec, "ccc")); + + try { + writer.close(); + } catch (IOException | UncheckedIOException e) { + LOG.warn("Error closing writer", e); + } + + // The data of the first parquet-file was first written into a byte-buffer and on close should + // have been written to file. + // But that failed, so the file should not exist: + assertThat(PARQUET_FILE_LIST.get(0)).doesNotExist(); + + // Simulate that the network is up again: + fail = false; + // Try again: + LOG.info("Trying to close {} again.", writer); + try { + writer.close(); + } catch (UncheckedIOException e) { + // This error comes from the underlying ParquetFileWriter, which seems to have a similar + // problem with setting the internal state too early. + if (!"Failed to flush row group".equals(e.getMessage())) { Review Comment: Same as mentioned above: either we expect an exception to be thrown here or we don't. We generally want to avoid try-catch blocks in tests, because they make it unclear what exactly is happening and why. -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org