[GitHub] [iceberg] Fokko commented on a diff in pull request #8122: Go: Schema and Types

via GitHub Sat, 22 Jul 2023 00:14:29 -0700


Fokko commented on code in PR #8122:
URL: https://github.com/apache/iceberg/pull/8122#discussion_r1271255854



##########
go/iceberg/schema.go:
##########
@@ -0,0 +1,849 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package iceberg
+
+import (
+       "encoding/json"
+       "fmt"
+       "strings"
+       "sync/atomic"
+
+       "golang.org/x/exp/maps"
+       "golang.org/x/exp/slices"
+)
+
+// Schema is an Iceberg table schema, represented as a struct with
+// multiple fields. The fields are only exported via accessor methods
+// rather than exposing the slice directly in order to ensure a schema
+// is immutable.
+type Schema struct {
+       ID                 int   `json:"schema-id"`
+       IdentifierFieldIDs []int `json:"identifier-field-ids"`
+
+       fields []NestedField
+
+       // the following maps are lazily populated as needed.
+       // rather than have lock contention with a mutex, we can use
+       // atomic pointers to Store/Load the values.
+       idToName      atomic.Pointer[map[int]string]
+       idToField     atomic.Pointer[map[int]NestedField]
+       nameToID      atomic.Pointer[map[string]int]
+       nameToIDLower atomic.Pointer[map[string]int]
+}
+
+// NewSchema constructs a new schema with the provided ID
+// and list of fields.
+func NewSchema(id int, fields ...NestedField) *Schema {
+       return NewSchemaWithIdentifiers(id, []int{}, fields...)
+}
+
+// NewSchemaWithIdentifiers constructs a new schema with the provided ID
+// and fields, along with a slice of field IDs to be listed as identifier
+// fields.
+func NewSchemaWithIdentifiers(id int, identifierIDs []int, fields 
...NestedField) *Schema {
+       return &Schema{ID: id, fields: fields, IdentifierFieldIDs: 
identifierIDs}
+}
+
+func (s *Schema) lazyNameToID() (map[string]int, error) {
+       index := s.nameToID.Load()
+       if index != nil {
+               return *index, nil
+       }
+
+       idx, err := IndexByName(s)
+       if err != nil {
+               return nil, err
+       }
+
+       s.nameToID.Store(&idx)
+       return idx, nil
+}
+
+func (s *Schema) lazyIDToField() (map[int]NestedField, error) {
+       index := s.idToField.Load()
+       if index != nil {
+               return *index, nil
+       }
+
+       idx, err := IndexByID(s)
+       if err != nil {
+               return nil, err
+       }
+
+       s.idToField.Store(&idx)
+       return idx, nil
+}
+
+func (s *Schema) lazyIDToName() (map[int]string, error) {
+       index := s.idToName.Load()
+       if index != nil {
+               return *index, nil
+       }
+
+       idx, err := IndexNameByID(s)
+       if err != nil {
+               return nil, err
+       }
+
+       s.idToName.Store(&idx)
+       return idx, nil
+}
+
+func (s *Schema) lazyNameToIDLower() (map[string]int, error) {
+       index := s.nameToIDLower.Load()
+       if index != nil {
+               return *index, nil
+       }
+
+       idx, err := s.lazyNameToID()
+       if err != nil {
+               return nil, err
+       }
+
+       out := make(map[string]int)
+       for k, v := range idx {
+               out[strings.ToLower(k)] = v
+       }
+
+       s.nameToIDLower.Store(&out)
+       return out, nil
+}
+
+func (s *Schema) Type() string { return "struct" }
+
+// AsStruct returns a Struct with the same fields as the schema which can
+// then be used as a Type.
+func (s *Schema) AsStruct() StructType    { return StructType{Fields: 
s.fields} }
+func (s *Schema) NumFields() int          { return len(s.fields) }
+func (s *Schema) Field(i int) NestedField { return s.fields[i] }
+func (s *Schema) Children() []NestedField { return slices.Clone(s.fields) }
+
+func (s *Schema) UnmarshalJSON(b []byte) error {
+       type Alias Schema
+       aux := struct {
+               Fields []NestedField `json:"fields"`
+               *Alias
+       }{Alias: (*Alias)(s)}
+
+       if err := json.Unmarshal(b, &aux); err != nil {
+               return err
+       }
+
+       s.fields = aux.Fields
+       if s.IdentifierFieldIDs == nil {
+               s.IdentifierFieldIDs = []int{}
+       }
+       return nil
+}
+
+func (s *Schema) MarshalJSON() ([]byte, error) {
+       if s.IdentifierFieldIDs == nil {
+               s.IdentifierFieldIDs = []int{}
+       }
+
+       type Alias Schema
+       return json.Marshal(struct {
+               Type   string        `json:"type"`
+               Fields []NestedField `json:"fields"`
+               *Alias
+       }{Type: "struct", Fields: s.fields, Alias: (*Alias)(s)})
+}
+
+// FindColumnName returns the name of the column identified by the
+// passed in field id. The second return value reports whether or
+// not the field id was found in the schema.
+func (s *Schema) FindColumnName(fieldID int) (string, bool) {
+       idx, _ := s.lazyIDToName()
+       col, ok := idx[fieldID]
+       return col, ok
+}
+
+// FindFieldByName returns the field identified by the name given,
+// the second return value will be false if no field by this name
+// is found.
+//
+// Note: This search is done in a case sensitive manner. To perform
+// a case insensitive search, use [*Schema.FindFieldByNameCaseInsensitive].
+func (s *Schema) FindFieldByName(name string) (NestedField, bool) {
+       idx, _ := s.lazyNameToID()
+
+       id, ok := idx[name]
+       if !ok {
+               return NestedField{}, false
+       }
+
+       return s.FindFieldByID(id)
+}
+
+// FindFieldByNameCaseInsensitive is like [*Schema.FindFieldByName],
+// but performs a case insensitive search.
+func (s *Schema) FindFieldByNameCaseInsensitive(name string) (NestedField, 
bool) {
+       idx, _ := s.lazyNameToIDLower()
+
+       id, ok := idx[strings.ToLower(name)]
+       if !ok {
+               return NestedField{}, false
+       }
+
+       return s.FindFieldByID(id)
+}
+
+// FindFieldByID is like [*Schema.FindColumnName], but returns the whole
+// field rather than just the field name.
+func (s *Schema) FindFieldByID(id int) (NestedField, bool) {
+       idx, _ := s.lazyIDToField()
+       f, ok := idx[id]
+       return f, ok
+}
+
+// FindTypeByID is like [*Schema.FindFieldByID], but returns only the data
+// type of the field.
+func (s *Schema) FindTypeByID(id int) (Type, bool) {
+       f, ok := s.FindFieldByID(id)
+       if !ok {
+               return nil, false
+       }
+
+       return f.Type, true
+}
+
+// FindTypeByName is a convenience function for calling 
[*Schema.FindFieldByName],
+// and then returning just the type.
+func (s *Schema) FindTypeByName(name string) (Type, bool) {
+       f, ok := s.FindFieldByName(name)
+       if !ok {
+               return nil, false
+       }
+
+       return f.Type, true
+}
+
+// FindTypeByNameCaseInsensitive is like [*Schema.FindTypeByName] but
+// performs a case insensitive search.
+func (s *Schema) FindTypeByNameCaseInsensitive(name string) (Type, bool) {
+       f, ok := s.FindFieldByNameCaseInsensitive(name)
+       if !ok {
+               return nil, false
+       }
+
+       return f.Type, true
+}
+
+// Equals compares the fields and identifierIDs, but does not compare
+// the schema ID itself.
+func (s *Schema) Equals(other *Schema) bool {
+       if other == nil {
+               return false
+       }
+
+       if s == other {
+               return true
+       }
+
+       if len(s.fields) != len(other.fields) {
+               return false
+       }
+
+       if !slices.Equal(s.IdentifierFieldIDs, other.IdentifierFieldIDs) {
+               return false
+       }
+
+       return slices.EqualFunc(s.fields, other.fields, func(a, b NestedField) 
bool {

Review Comment:
   Just curious. In Python we use a generator that will break the loop once it 
encounters the first `False`:
   ```python
   schema_is_equal = all(lhs == rhs for lhs, rhs in zip(self.columns, 
other.columns))
   ```
   Is there something similar in Go?



##########
go/iceberg/schema_test.go:
##########
@@ -0,0 +1,633 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package iceberg_test
+
+import (
+       "encoding/json"
+       "strings"
+       "testing"
+
+       "github.com/apache/iceberg/go/iceberg"
+
+       "github.com/stretchr/testify/assert"
+       "github.com/stretchr/testify/require"
+)
+
+var (
+       tableSchemaNested = iceberg.NewSchemaWithIdentifiers(1,
+               []int{1},
+               iceberg.NestedField{
+                       ID: 1, Name: "foo", Type: 
iceberg.PrimitiveTypes.String, Required: false},
+               iceberg.NestedField{
+                       ID: 2, Name: "bar", Type: iceberg.PrimitiveTypes.Int32, 
Required: true},
+               iceberg.NestedField{
+                       ID: 3, Name: "baz", Type: iceberg.PrimitiveTypes.Bool, 
Required: false},
+               iceberg.NestedField{
+                       ID: 4, Name: "qux", Required: true, Type: 
&iceberg.ListType{
+                               ElementID: 5, Element: 
iceberg.PrimitiveTypes.String, ElementRequired: true}},
+               iceberg.NestedField{
+                       ID: 6, Name: "quux",
+                       Type: &iceberg.MapType{
+                               KeyID:   7,
+                               KeyType: iceberg.PrimitiveTypes.String,
+                               ValueID: 8,
+                               ValueType: &iceberg.MapType{
+                                       KeyID:         9,
+                                       KeyType:       
iceberg.PrimitiveTypes.String,
+                                       ValueID:       10,
+                                       ValueType:     
iceberg.PrimitiveTypes.Int32,
+                                       ValueRequired: true,
+                               },
+                               ValueRequired: true,
+                       },
+                       Required: true},
+               iceberg.NestedField{
+                       ID: 11, Name: "location", Type: &iceberg.ListType{
+                               ElementID: 12, Element: &iceberg.StructType{
+                                       Fields: []iceberg.NestedField{
+                                               {ID: 13, Name: "latitude", 
Type: iceberg.PrimitiveTypes.Float32, Required: false},
+                                               {ID: 14, Name: "longitude", 
Type: iceberg.PrimitiveTypes.Float32, Required: false},
+                                       },
+                               },
+                               ElementRequired: true},
+                       Required: true},
+               iceberg.NestedField{
+                       ID:   15,
+                       Name: "person",
+                       Type: &iceberg.StructType{
+                               Fields: []iceberg.NestedField{
+                                       {ID: 16, Name: "name", Type: 
iceberg.PrimitiveTypes.String, Required: false},
+                                       {ID: 17, Name: "age", Type: 
iceberg.PrimitiveTypes.Int32, Required: true},
+                               },
+                       },
+                       Required: false,
+               },
+       )
+
+       tableSchemaSimple = iceberg.NewSchemaWithIdentifiers(1,
+               []int{2},
+               iceberg.NestedField{ID: 1, Name: "foo", Type: 
iceberg.PrimitiveTypes.String},
+               iceberg.NestedField{ID: 2, Name: "bar", Type: 
iceberg.PrimitiveTypes.Int32, Required: true},
+               iceberg.NestedField{ID: 3, Name: "baz", Type: 
iceberg.PrimitiveTypes.Bool},
+       )
+)
+
+func TestNestedFieldToString(t *testing.T) {
+       tests := []struct {
+               idx      int
+               expected string
+       }{
+               {0, "1: foo: optional string"},
+               {1, "2: bar: required int"},
+               {2, "3: baz: optional boolean"},
+               {3, "4: qux: required list<string>"},
+               {4, "6: quux: required map<string, map<string, int>>"},
+               {5, "11: location: required list<struct<latitude: float, 
longitude: float>>"},
+               {6, "15: person: optional struct<name: string, age: int>"},
+       }
+
+       for _, tt := range tests {
+               assert.Equal(t, tt.expected, 
tableSchemaNested.Field(tt.idx).String())
+       }
+}
+
+func TestSchemaIndexByIDVisitor(t *testing.T) {
+       index, err := iceberg.IndexByID(tableSchemaNested)
+       require.NoError(t, err)
+
+       assert.Equal(t, map[int]iceberg.NestedField{
+               1: tableSchemaNested.Field(0),
+               2: tableSchemaNested.Field(1),
+               3: tableSchemaNested.Field(2),
+               4: tableSchemaNested.Field(3),
+               5: {ID: 5, Name: "element", Type: 
iceberg.PrimitiveTypes.String, Required: true},
+               6: tableSchemaNested.Field(4),
+               7: {ID: 7, Name: "key", Type: iceberg.PrimitiveTypes.String, 
Required: true},
+               8: {ID: 8, Name: "value", Type: &iceberg.MapType{
+                       KeyID:         9,
+                       KeyType:       iceberg.PrimitiveTypes.String,
+                       ValueID:       10,
+                       ValueType:     iceberg.PrimitiveTypes.Int32,
+                       ValueRequired: true,
+               }, Required: true},
+               9:  {ID: 9, Name: "key", Type: iceberg.PrimitiveTypes.String, 
Required: true},
+               10: {ID: 10, Name: "value", Type: iceberg.PrimitiveTypes.Int32, 
Required: true},
+               11: tableSchemaNested.Field(5),

Review Comment:
   👍🏻 



##########
go/iceberg/schema.go:
##########
@@ -0,0 +1,849 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package iceberg
+
+import (
+       "encoding/json"
+       "fmt"
+       "strings"
+       "sync/atomic"
+
+       "golang.org/x/exp/maps"
+       "golang.org/x/exp/slices"
+)
+
+// Schema is an Iceberg table schema, represented as a struct with
+// multiple fields. The fields are only exported via accessor methods
+// rather than exposing the slice directly in order to ensure a schema
+// is immutable.
+type Schema struct {
+       ID                 int   `json:"schema-id"`
+       IdentifierFieldIDs []int `json:"identifier-field-ids"`
+
+       fields []NestedField
+
+       // the following maps are lazily populated as needed.
+       // rather than have lock contention with a mutex, we can use
+       // atomic pointers to Store/Load the values.
+       idToName      atomic.Pointer[map[int]string]
+       idToField     atomic.Pointer[map[int]NestedField]
+       nameToID      atomic.Pointer[map[string]int]
+       nameToIDLower atomic.Pointer[map[string]int]
+}
+
+// NewSchema constructs a new schema with the provided ID
+// and list of fields.
+func NewSchema(id int, fields ...NestedField) *Schema {
+       return NewSchemaWithIdentifiers(id, []int{}, fields...)
+}
+
+// NewSchemaWithIdentifiers constructs a new schema with the provided ID
+// and fields, along with a slice of field IDs to be listed as identifier
+// fields.
+func NewSchemaWithIdentifiers(id int, identifierIDs []int, fields 
...NestedField) *Schema {
+       return &Schema{ID: id, fields: fields, IdentifierFieldIDs: 
identifierIDs}
+}
+
+func (s *Schema) lazyNameToID() (map[string]int, error) {
+       index := s.nameToID.Load()
+       if index != nil {
+               return *index, nil
+       }
+
+       idx, err := IndexByName(s)
+       if err != nil {
+               return nil, err
+       }
+
+       s.nameToID.Store(&idx)
+       return idx, nil
+}
+
+func (s *Schema) lazyIDToField() (map[int]NestedField, error) {
+       index := s.idToField.Load()
+       if index != nil {
+               return *index, nil
+       }
+
+       idx, err := IndexByID(s)
+       if err != nil {
+               return nil, err
+       }
+
+       s.idToField.Store(&idx)
+       return idx, nil
+}
+
+func (s *Schema) lazyIDToName() (map[int]string, error) {
+       index := s.idToName.Load()
+       if index != nil {
+               return *index, nil
+       }
+
+       idx, err := IndexNameByID(s)
+       if err != nil {
+               return nil, err
+       }
+
+       s.idToName.Store(&idx)
+       return idx, nil
+}
+
+func (s *Schema) lazyNameToIDLower() (map[string]int, error) {
+       index := s.nameToIDLower.Load()
+       if index != nil {
+               return *index, nil
+       }
+
+       idx, err := s.lazyNameToID()
+       if err != nil {
+               return nil, err
+       }
+
+       out := make(map[string]int)
+       for k, v := range idx {
+               out[strings.ToLower(k)] = v
+       }
+
+       s.nameToIDLower.Store(&out)
+       return out, nil
+}
+
+func (s *Schema) Type() string { return "struct" }
+
+// AsStruct returns a Struct with the same fields as the schema which can
+// then be used as a Type.
+func (s *Schema) AsStruct() StructType    { return StructType{Fields: 
s.fields} }
+func (s *Schema) NumFields() int          { return len(s.fields) }
+func (s *Schema) Field(i int) NestedField { return s.fields[i] }
+func (s *Schema) Children() []NestedField { return slices.Clone(s.fields) }
+
+func (s *Schema) UnmarshalJSON(b []byte) error {
+       type Alias Schema
+       aux := struct {
+               Fields []NestedField `json:"fields"`
+               *Alias
+       }{Alias: (*Alias)(s)}
+
+       if err := json.Unmarshal(b, &aux); err != nil {
+               return err
+       }
+
+       s.fields = aux.Fields
+       if s.IdentifierFieldIDs == nil {
+               s.IdentifierFieldIDs = []int{}
+       }
+       return nil
+}
+
+func (s *Schema) MarshalJSON() ([]byte, error) {
+       if s.IdentifierFieldIDs == nil {
+               s.IdentifierFieldIDs = []int{}
+       }
+
+       type Alias Schema
+       return json.Marshal(struct {
+               Type   string        `json:"type"`
+               Fields []NestedField `json:"fields"`
+               *Alias
+       }{Type: "struct", Fields: s.fields, Alias: (*Alias)(s)})
+}
+
+// FindColumnName returns the name of the column identified by the
+// passed in field id. The second return value reports whether or
+// not the field id was found in the schema.
+func (s *Schema) FindColumnName(fieldID int) (string, bool) {
+       idx, _ := s.lazyIDToName()
+       col, ok := idx[fieldID]
+       return col, ok
+}
+
+// FindFieldByName returns the field identified by the name given,
+// the second return value will be false if no field by this name
+// is found.
+//
+// Note: This search is done in a case sensitive manner. To perform
+// a case insensitive search, use [*Schema.FindFieldByNameCaseInsensitive].
+func (s *Schema) FindFieldByName(name string) (NestedField, bool) {
+       idx, _ := s.lazyNameToID()
+
+       id, ok := idx[name]
+       if !ok {
+               return NestedField{}, false
+       }
+
+       return s.FindFieldByID(id)
+}
+
+// FindFieldByNameCaseInsensitive is like [*Schema.FindFieldByName],
+// but performs a case insensitive search.
+func (s *Schema) FindFieldByNameCaseInsensitive(name string) (NestedField, 
bool) {
+       idx, _ := s.lazyNameToIDLower()
+
+       id, ok := idx[strings.ToLower(name)]
+       if !ok {
+               return NestedField{}, false
+       }
+
+       return s.FindFieldByID(id)
+}
+
+// FindFieldByID is like [*Schema.FindColumnName], but returns the whole
+// field rather than just the field name.
+func (s *Schema) FindFieldByID(id int) (NestedField, bool) {
+       idx, _ := s.lazyIDToField()
+       f, ok := idx[id]
+       return f, ok
+}
+
+// FindTypeByID is like [*Schema.FindFieldByID], but returns only the data
+// type of the field.
+func (s *Schema) FindTypeByID(id int) (Type, bool) {
+       f, ok := s.FindFieldByID(id)
+       if !ok {
+               return nil, false
+       }
+
+       return f.Type, true
+}
+
+// FindTypeByName is a convenience function for calling 
[*Schema.FindFieldByName],
+// and then returning just the type.
+func (s *Schema) FindTypeByName(name string) (Type, bool) {
+       f, ok := s.FindFieldByName(name)
+       if !ok {
+               return nil, false
+       }
+
+       return f.Type, true
+}
+
+// FindTypeByNameCaseInsensitive is like [*Schema.FindTypeByName] but
+// performs a case insensitive search.
+func (s *Schema) FindTypeByNameCaseInsensitive(name string) (Type, bool) {
+       f, ok := s.FindFieldByNameCaseInsensitive(name)
+       if !ok {
+               return nil, false
+       }
+
+       return f.Type, true
+}
+
+// Equals compares the fields and identifierIDs, but does not compare
+// the schema ID itself.
+func (s *Schema) Equals(other *Schema) bool {
+       if other == nil {
+               return false
+       }
+
+       if s == other {
+               return true
+       }
+
+       if len(s.fields) != len(other.fields) {
+               return false
+       }
+
+       if !slices.Equal(s.IdentifierFieldIDs, other.IdentifierFieldIDs) {
+               return false
+       }
+
+       return slices.EqualFunc(s.fields, other.fields, func(a, b NestedField) 
bool {
+               return a.Equals(b)
+       })
+}
+
+// HighestFieldID returns the value of the numerically highest field ID
+// in this schema.
+func (s *Schema) HighestFieldID() int {
+       id, _ := Visit[int](s, findLastFieldID{})
+       return id
+}
+
+type Void = struct{}
+
+var void = Void{}
+
+// Select creates a new schema with just the fields identified by name
+// passed in the order they are provided. If caseSensitive is false,
+// then fields will be identified by case insensitive search.
+//
+// An error is returned if a requested name cannot be found.
+func (s *Schema) Select(caseSensitive bool, names ...string) (*Schema, error) {
+       ids := make(map[int]Void)
+       if caseSensitive {
+               nameMap, _ := s.lazyNameToID()
+               for _, n := range names {
+                       id, ok := nameMap[n]
+                       if !ok {
+                               return nil, fmt.Errorf("%w: could not find 
column %s", ErrInvalidSchema, n)
+                       }
+                       ids[id] = void
+               }
+       } else {
+               nameMap, _ := s.lazyNameToIDLower()
+               for _, n := range names {
+                       id, ok := nameMap[strings.ToLower(n)]
+                       if !ok {
+                               return nil, fmt.Errorf("%w: could not find 
column %s", ErrInvalidSchema, n)
+                       }
+                       ids[id] = void
+               }
+       }
+
+       return PruneColumns(s, ids, true)
+}
+
+// SchemaVisitor is an interface that can be implemented to allow for
+// easy traversal and processing of a schema.
+//
+// A SchemaVisitor can also optionally implement the Before/After Field,
+// ListElement, MapKey, or MapValue interfaces to allow them to get called
+// at the appropriate points within schema traversal.
+type SchemaVisitor[T any] interface {
+       Schema(schema *Schema, structResult T) T
+       Struct(st StructType, fieldResults []T) T
+       Field(field NestedField, fieldResult T) T
+       List(list ListType, elemResult T) T
+       Map(mapType MapType, keyResult, valueResult T) T
+       Primitive(p PrimitiveType) T
+}
+
+type BeforeFieldVisitor interface {
+       BeforeField(field NestedField)
+}
+
+type AfterFieldVisitor interface {
+       AfterField(field NestedField)
+}
+
+type BeforeListElementVisitor interface {
+       BeforeListElement(elem NestedField)
+}
+
+type AfterListElementVisitor interface {
+       AfterListElement(elem NestedField)
+}
+
+type BeforeMapKeyVisitor interface {
+       BeforeMapKey(key NestedField)
+}
+
+type AfterMapKeyVisitor interface {
+       AfterMapKey(key NestedField)
+}
+
+type BeforeMapValueVisitor interface {
+       BeforeMapValue(value NestedField)
+}
+
+type AfterMapValueVisitor interface {
+       AfterMapValue(value NestedField)
+}
+
+// Visit accepts a visitor and performs a post-order traversal of the given 
schema.
+func Visit[T any](sc *Schema, visitor SchemaVisitor[T]) (res T, err error) {
+       if sc == nil {
+               err = fmt.Errorf("%w: cannot visit nil schema", 
ErrInvalidArgument)
+               return
+       }
+
+       defer func() {
+               if r := recover(); r != nil {
+                       switch e := r.(type) {
+                       case string:
+                               err = fmt.Errorf("error encountered during 
schema visitor: %s", e)
+                       case error:
+                               err = fmt.Errorf("error encountered during 
schema visitor: %w", e)
+                       }
+               }
+       }()
+
+       return visitor.Schema(sc, visitStruct(sc.AsStruct(), visitor)), nil
+}
+
+func visitStruct[T any](obj StructType, visitor SchemaVisitor[T]) T {
+       results := make([]T, len(obj.Fields))
+
+       bf, _ := visitor.(BeforeFieldVisitor)
+       af, _ := visitor.(AfterFieldVisitor)
+
+       for i, f := range obj.Fields {
+               if bf != nil {
+                       bf.BeforeField(f)
+               }
+
+               res := visitField(f, visitor)
+
+               if af != nil {
+                       af.AfterField(f)
+               }
+
+               results[i] = visitor.Field(f, res)
+       }
+
+       return visitor.Struct(obj, results)
+}
+
+func visitList[T any](obj ListType, visitor SchemaVisitor[T]) T {
+       elemField := obj.ElementField()
+
+       if bl, ok := visitor.(BeforeListElementVisitor); ok {
+               bl.BeforeListElement(elemField)
+       } else if bf, ok := visitor.(BeforeFieldVisitor); ok {
+               bf.BeforeField(elemField)

Review Comment:
   I think we only want the first branch of this `if`. The Python implementation
   calls only the list-element hooks, without falling back to the field hooks:
   ```python
   @visit.register(ListType)
   def _(obj: ListType, visitor: SchemaVisitor[T]) -> T:
       """Visit a ListType with a concrete SchemaVisitor."""
       visitor.before_list_element(obj.element_field)
       result = visit(obj.element_type, visitor)
       visitor.after_list_element(obj.element_field)
   
       return visitor.list(obj, result)
   ```



##########
go/iceberg/schema_test.go:
##########
@@ -0,0 +1,633 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package iceberg_test
+
+import (
+       "encoding/json"
+       "strings"
+       "testing"
+
+       "github.com/apache/iceberg/go/iceberg"
+
+       "github.com/stretchr/testify/assert"
+       "github.com/stretchr/testify/require"
+)
+
+var (
+       tableSchemaNested = iceberg.NewSchemaWithIdentifiers(1,
+               []int{1},
+               iceberg.NestedField{
+                       ID: 1, Name: "foo", Type: 
iceberg.PrimitiveTypes.String, Required: false},
+               iceberg.NestedField{
+                       ID: 2, Name: "bar", Type: iceberg.PrimitiveTypes.Int32, 
Required: true},
+               iceberg.NestedField{
+                       ID: 3, Name: "baz", Type: iceberg.PrimitiveTypes.Bool, 
Required: false},
+               iceberg.NestedField{
+                       ID: 4, Name: "qux", Required: true, Type: 
&iceberg.ListType{
+                               ElementID: 5, Element: 
iceberg.PrimitiveTypes.String, ElementRequired: true}},
+               iceberg.NestedField{
+                       ID: 6, Name: "quux",
+                       Type: &iceberg.MapType{
+                               KeyID:   7,
+                               KeyType: iceberg.PrimitiveTypes.String,
+                               ValueID: 8,
+                               ValueType: &iceberg.MapType{
+                                       KeyID:         9,
+                                       KeyType:       
iceberg.PrimitiveTypes.String,
+                                       ValueID:       10,
+                                       ValueType:     
iceberg.PrimitiveTypes.Int32,
+                                       ValueRequired: true,
+                               },
+                               ValueRequired: true,
+                       },
+                       Required: true},
+               iceberg.NestedField{
+                       ID: 11, Name: "location", Type: &iceberg.ListType{
+                               ElementID: 12, Element: &iceberg.StructType{
+                                       Fields: []iceberg.NestedField{
+                                               {ID: 13, Name: "latitude", 
Type: iceberg.PrimitiveTypes.Float32, Required: false},
+                                               {ID: 14, Name: "longitude", 
Type: iceberg.PrimitiveTypes.Float32, Required: false},
+                                       },
+                               },
+                               ElementRequired: true},
+                       Required: true},
+               iceberg.NestedField{
+                       ID:   15,
+                       Name: "person",
+                       Type: &iceberg.StructType{
+                               Fields: []iceberg.NestedField{
+                                       {ID: 16, Name: "name", Type: 
iceberg.PrimitiveTypes.String, Required: false},
+                                       {ID: 17, Name: "age", Type: 
iceberg.PrimitiveTypes.Int32, Required: true},
+                               },
+                       },
+                       Required: false,
+               },
+       )
+
+       tableSchemaSimple = iceberg.NewSchemaWithIdentifiers(1,
+               []int{2},
+               iceberg.NestedField{ID: 1, Name: "foo", Type: 
iceberg.PrimitiveTypes.String},
+               iceberg.NestedField{ID: 2, Name: "bar", Type: 
iceberg.PrimitiveTypes.Int32, Required: true},
+               iceberg.NestedField{ID: 3, Name: "baz", Type: 
iceberg.PrimitiveTypes.Bool},
+       )
+)
+
+func TestNestedFieldToString(t *testing.T) {
+       tests := []struct {
+               idx      int
+               expected string
+       }{
+               {0, "1: foo: optional string"},
+               {1, "2: bar: required int"},
+               {2, "3: baz: optional boolean"},
+               {3, "4: qux: required list<string>"},
+               {4, "6: quux: required map<string, map<string, int>>"},
+               {5, "11: location: required list<struct<latitude: float, 
longitude: float>>"},
+               {6, "15: person: optional struct<name: string, age: int>"},
+       }
+
+       for _, tt := range tests {
+               assert.Equal(t, tt.expected, 
tableSchemaNested.Field(tt.idx).String())
+       }
+}
+
+func TestSchemaIndexByIDVisitor(t *testing.T) {
+       index, err := iceberg.IndexByID(tableSchemaNested)
+       require.NoError(t, err)
+
+       assert.Equal(t, map[int]iceberg.NestedField{
+               1: tableSchemaNested.Field(0),
+               2: tableSchemaNested.Field(1),
+               3: tableSchemaNested.Field(2),
+               4: tableSchemaNested.Field(3),
+               5: {ID: 5, Name: "element", Type: 
iceberg.PrimitiveTypes.String, Required: true},
+               6: tableSchemaNested.Field(4),
+               7: {ID: 7, Name: "key", Type: iceberg.PrimitiveTypes.String, 
Required: true},
+               8: {ID: 8, Name: "value", Type: &iceberg.MapType{
+                       KeyID:         9,
+                       KeyType:       iceberg.PrimitiveTypes.String,
+                       ValueID:       10,
+                       ValueType:     iceberg.PrimitiveTypes.Int32,
+                       ValueRequired: true,
+               }, Required: true},
+               9:  {ID: 9, Name: "key", Type: iceberg.PrimitiveTypes.String, 
Required: true},
+               10: {ID: 10, Name: "value", Type: iceberg.PrimitiveTypes.Int32, 
Required: true},
+               11: tableSchemaNested.Field(5),

Review Comment:
   👍🏻 



##########
go/iceberg/schema_test.go:
##########
@@ -0,0 +1,633 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package iceberg_test
+
+import (
+       "encoding/json"
+       "strings"
+       "testing"
+
+       "github.com/apache/iceberg/go/iceberg"
+
+       "github.com/stretchr/testify/assert"
+       "github.com/stretchr/testify/require"
+)
+
+var (
+       tableSchemaNested = iceberg.NewSchemaWithIdentifiers(1,

Review Comment:
   Nice, thanks for using the same tests :)



##########
go/iceberg/schema.go:
##########
@@ -0,0 +1,849 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+package iceberg
+
+import (
+       "encoding/json"
+       "fmt"
+       "strings"
+       "sync/atomic"
+
+       "golang.org/x/exp/maps"
+       "golang.org/x/exp/slices"
+)
+
+// Schema is an Iceberg table schema, represented as a struct with
+// multiple fields. The fields are only exported via accessor methods
+// rather than exposing the slice directly in order to ensure a schema
+// as immutable.
+type Schema struct {

Review Comment:
   Can you also add the `toString` method for the schema:
   
   ```python
       def __str__(self) -> str:
           """Returns the string representation of the Schema class."""
           return "table {\n" + "\n".join(["  " + str(field) for field in self.columns]) + "\n}"
   ```
   
   I think something like:
   ```go
   func (s *Schema) String() string {
        var b strings.Builder
        b.WriteString("table {\n")
        for i, f := range s.fields {
                if i != 0 {
                        b.WriteString("\n")
                }
                b.WriteString("  ")
                b.WriteString(f.String())
        }
        b.WriteString("\n}")
   
        return b.String()
   }
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [iceberg] Fokko commented on a diff in pull request #8122: Go: Schema and Types

Reply via email to