zeroshade commented on code in PR #177:
URL: https://github.com/apache/iceberg-go/pull/177#discussion_r1811229871
##########
manifest.go:
##########
@@ -567,6 +569,96 @@ func ReadManifestList(in io.Reader) ([]ManifestFile,
error) {
return out, dec.Error()
}
+// WriteManifestListV2 writes a list of v2 manifest files to an avro file.
+func WriteManifestListV2(out io.Writer, files []ManifestFile) error {
+ return writeManifestList(out, files, 2)
+}
+
+// WriteManifestListV1 writes a list of v1 manifest files to an avro file.
+func WriteManifestListV1(out io.Writer, files []ManifestFile) error {
+ return writeManifestList(out, files, 1)
+}
+
+func writeManifestList(out io.Writer, files []ManifestFile, version int) error
{
+ for _, file := range files {
+ if file.Version() != version {
+ return ErrInvalidArgument
Review Comment:
let's include more info with the error here, return `fmt.Errorf("%w:
ManifestFile '%s' has non-matching version %d instead of %d",
ErrInvalidArgument, file.FilePath(), file.Version(), version)`
##########
manifest.go:
##########
@@ -567,6 +569,96 @@ func ReadManifestList(in io.Reader) ([]ManifestFile,
error) {
return out, dec.Error()
}
+// WriteManifestListV2 writes a list of v2 manifest files to an avro file.
+func WriteManifestListV2(out io.Writer, files []ManifestFile) error {
+ return writeManifestList(out, files, 2)
+}
+
+// WriteManifestListV1 writes a list of v1 manifest files to an avro file.
+func WriteManifestListV1(out io.Writer, files []ManifestFile) error {
+ return writeManifestList(out, files, 1)
+}
+
+func writeManifestList(out io.Writer, files []ManifestFile, version int) error
{
+ for _, file := range files {
+ if file.Version() != version {
+ return ErrInvalidArgument
+ }
+ }
+
+ var key string
+ switch version {
+ case 1:
+ key = internal.ManifestListV1Key
+ case 2:
+ key = internal.ManifestListV2Key
+ default:
+ return ErrInvalidArgument
+ }
+
+ enc, err := ocf.NewEncoder(
+ internal.AvroSchemaCache.Get(key).String(),
+ out, ocf.WithMetadata(map[string][]byte{
+ "format-version": []byte(strconv.Itoa(version)),
+ "avro.codec": []byte("deflate"),
+ }),
+ ocf.WithCodec(ocf.Deflate),
+ )
+ if err != nil {
+ return err
+ }
+
+ for _, file := range files {
+ if err := enc.Encode(file); err != nil {
+ return err
+ }
+ }
+
+ return enc.Close()
+}
+
+// WriteManifestEntriesV2 writes a list of v2 manifest entries to an avro file.
+func WriteManifestEntriesV2(out io.Writer, entries []ManifestEntry) error {
+ return writeManifestEntries(out, entries, 2)
+}
+
+// WriteManifestEntriesV1 writes a list of v1 manifest entries to an avro file.
+func WriteManifestEntriesV1(out io.Writer, entries []ManifestEntry) error {
+ return writeManifestEntries(out, entries, 1)
+}
+
+func writeManifestEntries(out io.Writer, entries []ManifestEntry, version int)
error {
+ var key string
+ switch version {
+ case 1:
+ key = internal.ManifestEntryV1Key
+ case 2:
+ key = internal.ManifestEntryV2Key
+ default:
+ return ErrInvalidArgument
+ }
+
+ enc, err := ocf.NewEncoder(
+ internal.AvroSchemaCache.Get(key).String(),
+ out, ocf.WithMetadata(map[string][]byte{
+ "format-version": []byte(strconv.Itoa(version)),
+ "avro.codec": []byte("deflate"),
+ }),
Review Comment:
same comment as above
##########
manifest.go:
##########
@@ -876,7 +1030,140 @@ func (m *manifestEntryV2) FileSequenceNum() *int64 {
return m.FileSeqNum
}
-func (m *manifestEntryV2) DataFile() DataFile { return &m.Data }
+func (m *manifestEntryV2) DataFile() DataFile { return m.Data }
+
+// DataFileBuilder is a helper for building a data file struct which will
+// conform to the DataFile interface.
+type DataFileBuilder struct {
+ d *dataFile
+}
+
+// NewDataFileBuilder is passed all of the required fields and then allows
+// all of the optional fields to be set by calling the corresponding methods
+// before calling [DataFileBuilder.Build] to construct the object.
+func NewDataFileBuilder(
+ content ManifestEntryContent,
+ path string,
+ format FileFormat,
+ partitionData map[string]any,
+ recordCount int64,
+ fileSize int64,
+) *DataFileBuilder {
+ return &DataFileBuilder{
+ d: &dataFile{
+ Content: content,
+ Path: path,
+ Format: format,
+ PartitionData: partitionData,
+ RecordCount: recordCount,
+ FileSize: fileSize,
+ },
+ }
+}
Review Comment:
we should probably add some validation in this rather than blindly creating
it
##########
manifest.go:
##########
@@ -876,7 +1030,140 @@ func (m *manifestEntryV2) FileSequenceNum() *int64 {
return m.FileSeqNum
}
-func (m *manifestEntryV2) DataFile() DataFile { return &m.Data }
+func (m *manifestEntryV2) DataFile() DataFile { return m.Data }
+
+// DataFileBuilder is a helper for building a data file struct which will
+// conform to the DataFile interface.
+type DataFileBuilder struct {
+ d *dataFile
+}
+
+// NewDataFileBuilder is passed all of the required fields and then allows
+// all of the optional fields to be set by calling the corresponding methods
+// before calling [DataFileBuilder.Build] to construct the object.
+func NewDataFileBuilder(
+ content ManifestEntryContent,
+ path string,
+ format FileFormat,
+ partitionData map[string]any,
+ recordCount int64,
+ fileSize int64,
+) *DataFileBuilder {
+ return &DataFileBuilder{
+ d: &dataFile{
+ Content: content,
+ Path: path,
+ Format: format,
+ PartitionData: partitionData,
+ RecordCount: recordCount,
+ FileSize: fileSize,
+ },
+ }
+}
+
+// BlockSizeInBytes sets the block size in bytes for the data file. Deprecated
in v2.
+func (b *DataFileBuilder) BlockSizeInBytes(size int64) *DataFileBuilder {
+ b.d.BlockSizeInBytes = size
+ return b
+}
+
+// ColumnSizes sets the column sizes for the data file.
+func (b *DataFileBuilder) ColumnSizes(sizes map[int]int64) *DataFileBuilder {
+ colSizes := make([]colMap[int, int64], 0, len(sizes))
+ for k, v := range sizes {
+ colSizes = append(colSizes, colMap[int, int64]{Key: k, Value:
v})
+ }
+ b.d.ColSizes = &colSizes
+ return b
+}
+
+// ValueCounts sets the value counts for the data file.
+func (b *DataFileBuilder) ValueCounts(counts map[int]int64) *DataFileBuilder {
+ vals := make([]colMap[int, int64], 0, len(counts))
+ for k, v := range counts {
+ vals = append(vals, colMap[int, int64]{Key: k, Value: v})
+ }
Review Comment:
could probably also make a helper function to do this instead of repeating
the code for each of the types
##########
manifest.go:
##########
@@ -876,7 +1030,140 @@ func (m *manifestEntryV2) FileSequenceNum() *int64 {
return m.FileSeqNum
}
-func (m *manifestEntryV2) DataFile() DataFile { return &m.Data }
+func (m *manifestEntryV2) DataFile() DataFile { return m.Data }
+
+// DataFileBuilder is a helper for building a data file struct which will
+// conform to the DataFile interface.
+type DataFileBuilder struct {
+ d *dataFile
+}
+
+// NewDataFileBuilder is passed all of the required fields and then allows
+// all of the optional fields to be set by calling the corresponding methods
+// before calling [DataFileBuilder.Build] to construct the object.
+func NewDataFileBuilder(
+ content ManifestEntryContent,
+ path string,
+ format FileFormat,
+ partitionData map[string]any,
+ recordCount int64,
+ fileSize int64,
+) *DataFileBuilder {
+ return &DataFileBuilder{
+ d: &dataFile{
+ Content: content,
+ Path: path,
+ Format: format,
+ PartitionData: partitionData,
+ RecordCount: recordCount,
+ FileSize: fileSize,
+ },
+ }
+}
+
+// BlockSizeInBytes sets the block size in bytes for the data file. Deprecated
in v2.
+func (b *DataFileBuilder) BlockSizeInBytes(size int64) *DataFileBuilder {
+ b.d.BlockSizeInBytes = size
+ return b
+}
+
+// ColumnSizes sets the column sizes for the data file.
+func (b *DataFileBuilder) ColumnSizes(sizes map[int]int64) *DataFileBuilder {
+ colSizes := make([]colMap[int, int64], 0, len(sizes))
+ for k, v := range sizes {
+ colSizes = append(colSizes, colMap[int, int64]{Key: k, Value:
v})
+ }
+ b.d.ColSizes = &colSizes
+ return b
Review Comment:
same as before, we should probably have some validation in here, such as
ensuring the column numbers are valid etc.
##########
manifest.go:
##########
@@ -567,6 +569,96 @@ func ReadManifestList(in io.Reader) ([]ManifestFile,
error) {
return out, dec.Error()
}
+// WriteManifestListV2 writes a list of v2 manifest files to an avro file.
+func WriteManifestListV2(out io.Writer, files []ManifestFile) error {
+ return writeManifestList(out, files, 2)
+}
+
+// WriteManifestListV1 writes a list of v1 manifest files to an avro file.
+func WriteManifestListV1(out io.Writer, files []ManifestFile) error {
+ return writeManifestList(out, files, 1)
+}
+
+func writeManifestList(out io.Writer, files []ManifestFile, version int) error
{
+ for _, file := range files {
+ if file.Version() != version {
+ return ErrInvalidArgument
+ }
+ }
+
+ var key string
+ switch version {
+ case 1:
+ key = internal.ManifestListV1Key
+ case 2:
+ key = internal.ManifestListV2Key
+ default:
+ return ErrInvalidArgument
+ }
+
+ enc, err := ocf.NewEncoder(
+ internal.AvroSchemaCache.Get(key).String(),
+ out, ocf.WithMetadata(map[string][]byte{
+ "format-version": []byte(strconv.Itoa(version)),
+ "avro.codec": []byte("deflate"),
+ }),
+ ocf.WithCodec(ocf.Deflate),
+ )
+ if err != nil {
+ return err
+ }
+
+ for _, file := range files {
+ if err := enc.Encode(file); err != nil {
+ return err
+ }
+ }
+
+ return enc.Close()
+}
+
+// WriteManifestEntriesV2 writes a list of v2 manifest entries to an avro file.
+func WriteManifestEntriesV2(out io.Writer, entries []ManifestEntry) error {
+ return writeManifestEntries(out, entries, 2)
+}
+
+// WriteManifestEntriesV1 writes a list of v1 manifest entries to an avro file.
+func WriteManifestEntriesV1(out io.Writer, entries []ManifestEntry) error {
+ return writeManifestEntries(out, entries, 1)
+}
+
+func writeManifestEntries(out io.Writer, entries []ManifestEntry, version int)
error {
+ var key string
+ switch version {
+ case 1:
+ key = internal.ManifestEntryV1Key
+ case 2:
+ key = internal.ManifestEntryV2Key
+ default:
+ return ErrInvalidArgument
Review Comment:
same comment as above
##########
manifest.go:
##########
@@ -567,6 +569,96 @@ func ReadManifestList(in io.Reader) ([]ManifestFile,
error) {
return out, dec.Error()
}
+// WriteManifestListV2 writes a list of v2 manifest files to an avro file.
+func WriteManifestListV2(out io.Writer, files []ManifestFile) error {
+ return writeManifestList(out, files, 2)
+}
+
+// WriteManifestListV1 writes a list of v1 manifest files to an avro file.
+func WriteManifestListV1(out io.Writer, files []ManifestFile) error {
+ return writeManifestList(out, files, 1)
+}
+
+func writeManifestList(out io.Writer, files []ManifestFile, version int) error
{
+ for _, file := range files {
+ if file.Version() != version {
+ return ErrInvalidArgument
+ }
+ }
+
+ var key string
+ switch version {
+ case 1:
+ key = internal.ManifestListV1Key
+ case 2:
+ key = internal.ManifestListV2Key
+ default:
+ return ErrInvalidArgument
Review Comment:
same as above, include more info with the error, `fmt.Errorf("%w:
non-recognized version %d", ErrInvalidArgument, version)`
##########
manifest.go:
##########
@@ -831,14 +946,53 @@ func (m *manifestEntryV1) FileSequenceNum() *int64 {
return m.FileSeqNum
}
-func (m *manifestEntryV1) DataFile() DataFile { return &m.Data }
+func (m *manifestEntryV1) DataFile() DataFile { return m.Data }
+
+// ManifestEntryV2Builder is a helper for building a V2 manifest entry
+// struct which will conform to the ManifestEntry interface.
+type ManifestEntryV2Builder struct {
+ m *manifestEntryV2
+}
+
+// NewManifestEntryV2Builder is passed all of the required fields and then
allows
+// all of the optional fields to be set by calling the corresponding methods
+// before calling [ManifestEntryV2Builder.Build] to construct the object.
+func NewManifestEntryV2Builder(status ManifestEntryStatus, snapshotID int64,
data DataFile) *ManifestEntryV2Builder {
+ return &ManifestEntryV2Builder{
+ m: &manifestEntryV2{
+ EntryStatus: status,
+ Snapshot: &snapshotID,
+ Data: data,
Review Comment:
since we know that, at least currently, the only implementation of
`DataFile` is the internal `dataFile` type, we could simply use a type assert
`data.(*dataFile)` to avoid having to use `DataFile` in the manifestlist and
builders and instead use the underlying internal type.
##########
manifest.go:
##########
@@ -567,6 +569,96 @@ func ReadManifestList(in io.Reader) ([]ManifestFile,
error) {
return out, dec.Error()
}
+// WriteManifestListV2 writes a list of v2 manifest files to an avro file.
+func WriteManifestListV2(out io.Writer, files []ManifestFile) error {
+ return writeManifestList(out, files, 2)
+}
+
+// WriteManifestListV1 writes a list of v1 manifest files to an avro file.
+func WriteManifestListV1(out io.Writer, files []ManifestFile) error {
+ return writeManifestList(out, files, 1)
+}
+
+func writeManifestList(out io.Writer, files []ManifestFile, version int) error
{
+ for _, file := range files {
+ if file.Version() != version {
+ return ErrInvalidArgument
+ }
+ }
+
+ var key string
+ switch version {
+ case 1:
+ key = internal.ManifestListV1Key
+ case 2:
+ key = internal.ManifestListV2Key
+ default:
+ return ErrInvalidArgument
+ }
+
+ enc, err := ocf.NewEncoder(
+ internal.AvroSchemaCache.Get(key).String(),
Review Comment:
use `NewEncoderWithSchema(internal.AvroSchemaCache.Get(key), out, ...)` and
skip the conversion to string
do we need to manually add `avro.schema` key? or will that be added
automatically by the `ocf` encoder?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]