stevenzwu commented on code in PR #10200: URL: https://github.com/apache/iceberg/pull/10200#discussion_r1578676203
########## data/src/test/java/org/apache/iceberg/io/TestTaskEqualityDeltaWriter.java: ########## @@ -409,6 +421,55 @@ public void testUpsertDataWithFullRowSchema() throws IOException { .isEqualTo(ImmutableList.of(posRecord.copy("file_path", dataFile.path(), "pos", 0L))); } + @TestTemplate + public void testDeleteFileGranularity() throws IOException { + withGranularity(DeleteGranularity.FILE); + } + + @TestTemplate + public void testDeletePartitionGranularity() throws IOException { + withGranularity(DeleteGranularity.PARTITION); + } + + private void withGranularity(DeleteGranularity granularity) throws IOException { + List<Integer> eqDeleteFieldIds = Lists.newArrayList(idFieldId, dataFieldId); + Schema eqDeleteRowSchema = table.schema(); + + GenericTaskDeltaWriter deltaWriter = + createTaskWriter(eqDeleteFieldIds, eqDeleteRowSchema, granularity); + + Map<Integer, Record> expected = Maps.newHashMapWithExpectedSize(2000); + // Create enough records, so we have multiple files + for (int i = 0; i < 2000; ++i) { + Record record = createRecord(i, "aaa" + i); + deltaWriter.write(record); + if (i % 5 == 10) { + deltaWriter.delete(record); + } else { + expected.put(i, record); + } + } + + // Add some deletes in the end + for (int i = 0; i < 199; ++i) { + int id = i * 10 + 1; + Record record = createRecord(id, "aaa" + id); + deltaWriter.delete(record); + expected.remove(id); + } + + WriteResult result = deltaWriter.complete(); + assertThat(result.dataFiles()).as("Should have 2 data files.").hasSize(2); Review Comment: > The size is only checked every 1000 records - so 2000 records, max 2 data files I see. That's the default Parquet writer config. It is not obvious when reading the code. Could we add a comment or set the Parquet size check config explicitly in the code? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org