pvary commented on code in PR #8803: URL: https://github.com/apache/iceberg/pull/8803#discussion_r1372831403
########## core/src/main/java/org/apache/iceberg/BaseFile.java: ########## @@ -504,6 +508,27 @@ private static Map<Integer, ByteBuffer> toReadableByteBufferMap(Map<Integer, Byt } } + private static <TypeT> Map<Integer, TypeT> filterColumnsStats( + Map<Integer, TypeT> map, Set<Integer> columnIds) { + if (columnIds == null || columnIds.isEmpty()) { + return SerializableMap.copyOf(map); + } + + if (map == null) { + return null; + } + + Map<Integer, TypeT> filtered = Maps.newHashMapWithExpectedSize(columnIds.size()); + for (Integer columnId : columnIds) { + TypeT value = map.get(columnId); + if (value != null) { Review Comment: I did some testing with Parquet files: - Used `TestParquet` to check what `Metrics` we generate when we have only `null` values: ``` @Test public void testNullMetricsForEmptyColumns() throws IOException { Schema schema = new Schema(optional(1, "longCol", Types.LongType.get()), optional(2, "nullCol", Types.LongType.get())); File file = createTempFile(temp); List<GenericData.Record> records = Lists.newArrayListWithCapacity(1); org.apache.avro.Schema avroSchema = AvroSchemaUtil.convert(schema.asStruct()); GenericData.Record smallRecord = new GenericData.Record(avroSchema); smallRecord.put("longCol", 1L); records.add(smallRecord); write( file, schema, Collections.emptyMap(), ParquetAvroWriter::buildWriter, records.toArray(new GenericData.Record[] {})); InputFile inputFile = Files.localInput(file); Metrics metrics = ParquetUtil.fileMetrics(inputFile, MetricsConfig.getDefault()); assertThat(metrics.nullValueCounts()).hasSize(2); assertThat(metrics.lowerBounds()).hasSize(1).containsKey(1); assertThat(metrics.upperBounds()).hasSize(1).containsKey(1); } ``` Notice that the size of the `lowerBounds` and the `upperBounds` map is 1. There is no `null` value for column '2' - Just to double check, I have tested how we read files with missing column metrics (like the ones created above). 
I have used `TestManifestReaderStats` for this: ``` @Test public void testReadFileWithMissingStatsIncludesFullStats() throws IOException { Metrics missingStats = new Metrics( 3L, null, ImmutableMap.of(3, 3L, 4, 4L), ImmutableMap.of(3, 3L, 4, 0L), ImmutableMap.of(3, 0L, 4, 1L), ImmutableMap.of(4, Conversions.toByteBuffer(Types.StringType.get(), "a")), ImmutableMap.of(4, Conversions.toByteBuffer(Types.StringType.get(), "a"))); DataFile fileWithMissingStats = DataFiles.builder(SPEC) .withPath(FILE_PATH) .withFileSizeInBytes(10) .withPartitionPath("data_bucket=0") // easy way to set partition data for now .withRecordCount(3) .withMetrics(missingStats) .build(); ManifestFile manifest = writeManifest(1000L, fileWithMissingStats); try (ManifestReader<DataFile> reader = ManifestFiles.read(manifest, FILE_IO)) { CloseableIterable<ManifestEntry<DataFile>> entries = reader.entries(); ManifestEntry<DataFile> entry = entries.iterator().next(); DataFile dataFile = entry.file(); Assert.assertEquals(3, dataFile.recordCount()); assertThat(dataFile.lowerBounds()).hasSize(1).containsKey(4); assertThat(dataFile.upperBounds()).hasSize(1).containsKey(4); } } ``` Notice that the size of the `lowerBounds` and the `upperBounds` map is 1. There is no `null` value for column '3' So I think the actual phrasing of the documentation is just not clean enough, as the map does not contain `null` values, it just does not contain the keys. While putting a `null` value into a map is certainly possible, it should be used only in very specific cases, as it is very error-prone (see our discussion above 😄). If our goal were to identify the case where all of the values in the given column are `null`, we can use the other stats to achieve the result. 
########## core/src/main/java/org/apache/iceberg/BaseFile.java: ########## @@ -504,6 +508,27 @@ private static Map<Integer, ByteBuffer> toReadableByteBufferMap(Map<Integer, Byt } } + private static <TypeT> Map<Integer, TypeT> filterColumnsStats( + Map<Integer, TypeT> map, Set<Integer> columnIds) { + if (columnIds == null || columnIds.isEmpty()) { + return SerializableMap.copyOf(map); + } + + if (map == null) { + return null; + } + + Map<Integer, TypeT> filtered = Maps.newHashMapWithExpectedSize(columnIds.size()); + for (Integer columnId : columnIds) { + TypeT value = map.get(columnId); + if (value != null) { Review Comment: I did some testing with Parquet files: - Used `TestParquet` to check what `Metrics` we generate when we have only `null` values: ``` @Test public void testNullMetricsForEmptyColumns() throws IOException { Schema schema = new Schema(optional(1, "longCol", Types.LongType.get()), optional(2, "nullCol", Types.LongType.get())); File file = createTempFile(temp); List<GenericData.Record> records = Lists.newArrayListWithCapacity(1); org.apache.avro.Schema avroSchema = AvroSchemaUtil.convert(schema.asStruct()); GenericData.Record smallRecord = new GenericData.Record(avroSchema); smallRecord.put("longCol", 1L); records.add(smallRecord); write( file, schema, Collections.emptyMap(), ParquetAvroWriter::buildWriter, records.toArray(new GenericData.Record[] {})); InputFile inputFile = Files.localInput(file); Metrics metrics = ParquetUtil.fileMetrics(inputFile, MetricsConfig.getDefault()); assertThat(metrics.nullValueCounts()).hasSize(2); assertThat(metrics.lowerBounds()).hasSize(1).containsKey(1); assertThat(metrics.upperBounds()).hasSize(1).containsKey(1); } ``` Notice that the size of the `lowerBounds` and the `upperBounds` map is 1. There is no `null` value for column '2' - Just to double check, I have tested how we read files with missing column metrics (like the ones created above). 
I have used `TestManifestReaderStats` for this: ``` @Test public void testReadFileWithMissingStatsIncludesFullStats() throws IOException { Metrics missingStats = new Metrics( 3L, null, ImmutableMap.of(3, 3L, 4, 4L), ImmutableMap.of(3, 3L, 4, 0L), ImmutableMap.of(3, 0L, 4, 1L), ImmutableMap.of(4, Conversions.toByteBuffer(Types.StringType.get(), "a")), ImmutableMap.of(4, Conversions.toByteBuffer(Types.StringType.get(), "a"))); DataFile fileWithMissingStats = DataFiles.builder(SPEC) .withPath(FILE_PATH) .withFileSizeInBytes(10) .withPartitionPath("data_bucket=0") // easy way to set partition data for now .withRecordCount(3) .withMetrics(missingStats) .build(); ManifestFile manifest = writeManifest(1000L, fileWithMissingStats); try (ManifestReader<DataFile> reader = ManifestFiles.read(manifest, FILE_IO)) { CloseableIterable<ManifestEntry<DataFile>> entries = reader.entries(); ManifestEntry<DataFile> entry = entries.iterator().next(); DataFile dataFile = entry.file(); Assert.assertEquals(3, dataFile.recordCount()); assertThat(dataFile.lowerBounds()).hasSize(1).containsKey(4); assertThat(dataFile.upperBounds()).hasSize(1).containsKey(4); } } ``` Notice that the size of the `lowerBounds` and the `upperBounds` map is 1. There is no `null` value for column '3' So I think the actual phrasing of the documentation is just not clean enough, as the map does not contain `null` values, it just does not contain the keys. While putting a `null` value into a map is certainly possible, it should be used only in very specific cases, as it is very error-prone (see our discussion above 😄). If our goal were to identify the case where all of the values in the given column are `null`, we can use the other stats to achieve the result. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org