kevinjqliu commented on code in PR #330:
URL: https://github.com/apache/iceberg-go/pull/330#discussion_r1997684846


##########
table/arrow_utils.go:
##########
@@ -1250,5 +1656,56 @@ func dataFileStatsFromParquetMetadata(pqmeta 
*metadata.FileMetaData, statsCols m
                nullValueCounts: nullValueCounts,
                nanValueCounts:  nanValueCounts,
                splitOffsets:    splitOffsets,
+               colAggs:         colAggs,
+       }
+}
+
+func parquetFilesToDataFiles(fileIO iceio.IO, meta *MetadataBuilder, paths 
iter.Seq[string]) iter.Seq2[iceberg.DataFile, error] {
+       return func(yield func(iceberg.DataFile, error) bool) {
+               defer func() {
+                       if r := recover(); r != nil {
+                               switch e := r.(type) {
+                               case string:
+                                       yield(nil, fmt.Errorf("error 
encountered during parquet file conversion: %s", e))
+                               case error:
+                                       yield(nil, fmt.Errorf("error 
encountered during parquet file conversion: %w", e))
+                               }
+                       }
+               }()
+
+               for filePath := range paths {
+                       inputFile := must(fileIO.Open(filePath))
+                       defer inputFile.Close()
+
+                       rdr := must(file.NewParquetReader(inputFile))
+                       defer rdr.Close()
+
+                       arrRdr := must(pqarrow.NewFileReader(rdr, 
pqarrow.ArrowReadProperties{}, memory.DefaultAllocator))
+                       arrSchema := must(arrRdr.Schema())
+
+                       if hasIDs := must(VisitArrowSchema(arrSchema, 
hasIDs{})); hasIDs {
+                               yield(nil, fmt.Errorf("%w: cannot add file %s 
because it has field-ids. add-files only supports the addition of files without 
field_ids",
+                                       iceberg.ErrNotImplemented, filePath))
+
+                               return
+                       }
+
+                       if err := checkArrowSchemaCompat(meta.CurrentSchema(), 
arrSchema, false); err != nil {
+                               panic(err)
+                       }
+
+                       statistics := 
dataFileStatsFromParquetMetadata(rdr.MetaData(),
+                               must(computeStatsPlan(meta.CurrentSchema(), 
meta.props)),
+                               
must(parquetPathToIDMapping(meta.CurrentSchema())))
+
+                       df, err := statistics.toDataFile(meta.CurrentSpec(), 
filePath, iceberg.ParquetFile, rdr.MetaData().GetSourceFileSize())

Review Comment:
   `toDataFile` uses a builder, so this is not a problem.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to