kevinjqliu commented on code in PR #330: URL: https://github.com/apache/iceberg-go/pull/330#discussion_r1997684846
##########
table/arrow_utils.go:
##########
@@ -1250,5 +1656,56 @@ func dataFileStatsFromParquetMetadata(pqmeta *metadata.FileMetaData, statsCols m
 		nullValueCounts: nullValueCounts,
 		nanValueCounts: nanValueCounts,
 		splitOffsets: splitOffsets,
+		colAggs: colAggs,
+	}
+}
+
+func parquetFilesToDataFiles(fileIO iceio.IO, meta *MetadataBuilder, paths iter.Seq[string]) iter.Seq2[iceberg.DataFile, error] {
+	return func(yield func(iceberg.DataFile, error) bool) {
+		defer func() {
+			if r := recover(); r != nil {
+				switch e := r.(type) {
+				case string:
+					yield(nil, fmt.Errorf("error encountered during parquet file conversion: %s", e))
+				case error:
+					yield(nil, fmt.Errorf("error encountered during parquet file conversion: %w", e))
+				}
+			}
+		}()
+
+		for filePath := range paths {
+			inputFile := must(fileIO.Open(filePath))
+			defer inputFile.Close()
+
+			rdr := must(file.NewParquetReader(inputFile))
+			defer rdr.Close()
+
+			arrRdr := must(pqarrow.NewFileReader(rdr, pqarrow.ArrowReadProperties{}, memory.DefaultAllocator))
+			arrSchema := must(arrRdr.Schema())
+
+			if hasIDs := must(VisitArrowSchema(arrSchema, hasIDs{})); hasIDs {
+				yield(nil, fmt.Errorf("%w: cannot add file %s because it has field-ids. add-files only supports the addition of files without field_ids",
+					iceberg.ErrNotImplemented, filePath))
+
+				return
+			}
+
+			if err := checkArrowSchemaCompat(meta.CurrentSchema(), arrSchema, false); err != nil {
+				panic(err)
+			}
+
+			statistics := dataFileStatsFromParquetMetadata(rdr.MetaData(),
+				must(computeStatsPlan(meta.CurrentSchema(), meta.props)),
+				must(parquetPathToIDMapping(meta.CurrentSchema())))
+
+			df, err := statistics.toDataFile(meta.CurrentSpec(), filePath, iceberg.ParquetFile, rdr.MetaData().GetSourceFileSize())

Review Comment:
   `toDataFile` is using a builder so this is not a problem

-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org