zeroshade commented on code in PR #307:
URL: https://github.com/apache/iceberg-go/pull/307#discussion_r1958843788


##########
utils.go:
##########
@@ -238,3 +239,68 @@ func avroEncode[T any](key string, version int, vals []T, 
out io.Writer) error {
 
        return enc.Close()
 }
+
+func structTypeToAvroPartitionSchema(st *StructType) (avro.Schema, error) {
+       var aFields []*avro.Field
+
+       for _, field := range st.Fields() {
+               aField, err := nestedFieldToAvroField(field)
+
+               if err != nil {
+                       return nil, err
+               }
+
+               aFields = append(aFields, aField)
+       }
+
+       return avro.NewRecordSchema("r102", "", aFields)
+}
+
+func nestedFieldToAvroField(f NestedField) (*avro.Field, error) {
+       sch, err := nestedFieldToAvroSchema(f)
+
+       if err != nil {
+               return nil, err
+       }
+
+       return avro.NewField(
+               f.Name,
+               sch,
+               avro.WithDoc(f.Doc),
+               avro.WithProps(map[string]any{"field-id": f.ID}),
+       )
+}
+
+func nestedFieldToAvroSchema(f NestedField) (avro.Schema, error) {
+       var sch avro.Schema
+
+       switch f.Type.(type) {
+       case *StringType:
+               sch = avro.NewPrimitiveSchema(avro.String, nil)
+       case *Int32Type:
+               sch = avro.NewPrimitiveSchema(avro.Int, nil)
+       case *Int64Type:
+               sch = avro.NewPrimitiveSchema(avro.Long, nil)
+       case *BinaryType:
+               sch = avro.NewPrimitiveSchema(avro.Bytes, nil)
+       case *BooleanType:
+               sch = avro.NewPrimitiveSchema(avro.Boolean, nil)
+       case *Float32Type:
+               sch = avro.NewPrimitiveSchema(avro.Float, nil)
+       case *Float64Type:
+               sch = avro.NewPrimitiveSchema(avro.Double, nil)
+       case *DateType:
+               sch = avro.NewPrimitiveSchema(avro.Int, 
avro.NewPrimitiveLogicalSchema(avro.Date))
+       default:

Review Comment:
   missing some types here



##########
internal/manifest_list_v1.go:
##########
@@ -0,0 +1,142 @@
+package internal
+
+import (
+       "github.com/hamba/avro/v2"
+)
+
+func MustNewManifestListV1Schema() avro.Schema {
+       return MustNewRecordSchema("manifest_file", "", []*avro.Field{
+               MustNewField(
+                       "manifest_path",
+                       avro.NewPrimitiveSchema(avro.String, nil),
+                       avro.WithDoc("Location URI with FS scheme"),
+                       avro.WithProps(map[string]any{"field-id": 500}),
+               ),
+               MustNewField(
+                       "manifest_length",
+                       avro.NewPrimitiveSchema(avro.Long, nil),
+                       avro.WithDoc("Total file size in bytes"),
+                       avro.WithProps(map[string]any{"field-id": 501}),
+               ),
+               MustNewField(
+                       "partition_spec_id",
+                       avro.NewPrimitiveSchema(avro.Int, nil),
+                       avro.WithDoc("Spec ID used to write"),
+                       avro.WithProps(map[string]any{"field-id": 502}),
+               ),
+               MustNewField(
+                       "added_snapshot_id",
+                       avro.NewPrimitiveSchema(avro.Long, nil),
+                       avro.WithDoc("Snapshot ID that added the manifest"),
+                       avro.WithProps(map[string]any{"field-id": 503}),
+               ),
+               MustNewField(
+                       "added_data_files_count",
+                       MustNewUnionSchema([]avro.Schema{
+                               avro.NewNullSchema(),
+                               avro.NewPrimitiveSchema(avro.Int, nil),
+                       }),
+                       avro.WithDoc("Added entry count"),
+                       avro.WithProps(map[string]any{"field-id": 504}),
+               ),
+               MustNewField(
+                       "existing_data_files_count",
+                       MustNewUnionSchema([]avro.Schema{
+                               avro.NewNullSchema(),
+                               avro.NewPrimitiveSchema(avro.Int, nil),
+                       }),
+                       avro.WithDoc("Existing entry count"),
+                       avro.WithProps(map[string]any{"field-id": 505}),
+               ),
+               MustNewField(
+                       "deleted_data_files_count",
+                       MustNewUnionSchema([]avro.Schema{
+                               avro.NewNullSchema(),
+                               avro.NewPrimitiveSchema(avro.Int, nil),
+                       }),
+                       avro.WithDoc("Deleted entry count"),
+                       avro.WithProps(map[string]any{"field-id": 506}),
+               ),
+               MustNewField(
+                       "partitions",
+                       MustNewUnionSchema([]avro.Schema{
+                               avro.NewNullSchema(),
+                               avro.NewArraySchema(
+                                       MustNewPartitionSchema(),
+                                       
avro.WithProps(map[string]any{"element-id": 508}),
+                               ),
+                       }),
+                       avro.WithDoc("Summary for each partition"),
+                       avro.WithProps(map[string]any{"field-id": 507}),
+               ),
+               MustNewField(
+                       "added_rows_count",
+                       MustNewUnionSchema([]avro.Schema{
+                               avro.NewNullSchema(),
+                               avro.NewPrimitiveSchema(avro.Long, nil),
+                       }),
+                       avro.WithDoc("Added rows count"),
+                       avro.WithProps(map[string]any{"field-id": 512}),
+               ),
+               MustNewField(
+                       "existing_rows_count",
+                       MustNewUnionSchema([]avro.Schema{
+                               avro.NewNullSchema(),
+                               avro.NewPrimitiveSchema(avro.Long, nil),
+                       }),
+                       avro.WithDoc("Existing rows count"),
+                       avro.WithProps(map[string]any{"field-id": 513}),
+               ),
+               MustNewField(
+                       "deleted_rows_count",
+                       MustNewUnionSchema([]avro.Schema{
+                               avro.NewNullSchema(),
+                               avro.NewPrimitiveSchema(avro.Long, nil),
+                       }),
+                       avro.WithDoc("Deleted rows count"),
+                       avro.WithProps(map[string]any{"field-id": 514}),
+               ),
+       })
+}
+
+func MustNewPartitionSchema() avro.Schema {

Review Comment:
   again, can this static schema be cached rather than built on demand



##########
manifest.go:
##########
@@ -292,10 +307,15 @@ func (b *ManifestV2Builder) KeyMetadata(km []byte) 
*ManifestV2Builder {
 // a pointer to the constructed manifest file. Further calls to the modifier
 // methods after calling build would modify the constructed ManifestFile.
 func (b *ManifestV2Builder) Build() ManifestFile {
+       if b.m.partitionType == nil {
+               b.m.partitionType = UnpartitionedSpec.PartitionType(nil)
+       }
+
        return b.m
 }
 
 type manifestFileV2 struct {
+       partitionType      *StructType

Review Comment:
   In either case, we should probably be populating this after reading in a 
manifest file too, right?



##########
manifest.go:
##########
@@ -292,10 +307,15 @@ func (b *ManifestV2Builder) KeyMetadata(km []byte) 
*ManifestV2Builder {
 // a pointer to the constructed manifest file. Further calls to the modifier
 // methods after calling build would modify the constructed ManifestFile.
 func (b *ManifestV2Builder) Build() ManifestFile {
+       if b.m.partitionType == nil {
+               b.m.partitionType = UnpartitionedSpec.PartitionType(nil)
+       }
+
        return b.m
 }
 
 type manifestFileV2 struct {
+       partitionType      *StructType

Review Comment:
   Could this be integrated with the `FieldSummary` instead of being at the top 
level here?



##########
internal/manifest_list_v2.go:
##########
@@ -0,0 +1,100 @@
+package internal
+
+import (
+       "github.com/hamba/avro/v2"
+)
+
+func MustNewManifestListV2Schema() avro.Schema {

Review Comment:
   since this is a static schema, can we cache it rather than building it on 
demand?



##########
internal/manifest-entry-v1.json:
##########
@@ -0,0 +1,182 @@
+{

Review Comment:
   where does this json file get used?



##########
utils.go:
##########
@@ -238,3 +239,68 @@ func avroEncode[T any](key string, version int, vals []T, 
out io.Writer) error {
 
        return enc.Close()
 }
+
+func structTypeToAvroPartitionSchema(st *StructType) (avro.Schema, error) {
+       var aFields []*avro.Field
+
+       for _, field := range st.Fields() {
+               aField, err := nestedFieldToAvroField(field)
+
+               if err != nil {
+                       return nil, err
+               }
+
+               aFields = append(aFields, aField)
+       }
+
+       return avro.NewRecordSchema("r102", "", aFields)

Review Comment:
   should this be "r102"? Or should this be more dynamic?



##########
internal/manifest_list_v1.go:
##########
@@ -0,0 +1,142 @@
+package internal
+
+import (
+       "github.com/hamba/avro/v2"
+)
+
+func MustNewManifestListV1Schema() avro.Schema {

Review Comment:
   same as other comments



##########
manifest.go:
##########
@@ -613,31 +633,37 @@ func WriteManifestList(out io.Writer, files 
[]ManifestFile) error {
                }
        }
 
-       var key string
+       var sch avro.Schema
        switch version {
        case 1:
-               key = internal.ManifestListV1Key
+               sch = internal.MustNewManifestListV1Schema()
        case 2:
-               key = internal.ManifestListV2Key
+               sch = internal.MustNewManifestListV2Schema()
        default:
                return fmt.Errorf("%w: non-recognized version %d", 
ErrInvalidArgument, version)
        }

Review Comment:
   for the ManifestList schema and others that don't need the dynamic part, 
it's better to have a cache of the schemas than to build it from scratch each 
time.



##########
internal/utils.go:
##########
@@ -0,0 +1,35 @@
+package internal
+
+import (
+       "github.com/hamba/avro/v2"
+)
+
+func MustNewUnionSchema(types []avro.Schema, opts ...avro.SchemaOption) 
*avro.UnionSchema {
+       s, err := avro.NewUnionSchema(types, opts...)
+
+       if err != nil {
+               panic(err)
+       }
+
+       return s
+}

Review Comment:
   these can all be simplified with a simple
   
   ```go
   func Must[T any](v T, err error) T {
       if err != nil {
           panic(err)
       }
       return v
   }
   ```
   
   and then just using that generic method around the rest of the calls 
instead. Honestly, we probably can just have that utility function and not 
bother having these at all, just calling `Must(avro.NewUnionSchema(.....))` etc.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to