jhump commented on code in PR #417:
URL: https://github.com/apache/iceberg-go/pull/417#discussion_r2075959498


##########
manifest.go:
##########
@@ -614,6 +559,211 @@ func decodeManifests[I interface {
        return results, dec.Error()
 }
 
+// ManifestReader reads the metadata and data from an avro manifest file.
+type ManifestReader struct {
+       dec           *ocf.Decoder
+       file          ManifestFile
+       formatVersion int
+       isFallback    bool
+       content       ManifestContent
+       fieldNameToID map[string]int
+       fieldIDToType map[int]avro.LogicalType
+
+       // The rest are lazily populated, on demand. Most readers
+       // will likely only try to load the entries.
+       schema              Schema
+       schemaLoaded        bool
+       partitionSpec       PartitionSpec
+       partitionSpecLoaded bool
+}
+
+// NewManifestReader returns a value that can read the contents of an avro 
manifest
+// file. If the caller is interested in the manifest entries in the file, it 
must call
+// [ManifestReader.Entries] before closing the provided reader.
+func NewManifestReader(file ManifestFile, in io.Reader) (*ManifestReader, 
error) {
+       dec, err := ocf.NewDecoder(in, 
ocf.WithDecoderSchemaCache(&avro.SchemaCache{}))
+       if err != nil {
+               return nil, err
+       }
+
+       metadata := dec.Metadata()
+       sc := dec.Schema()
+
+       formatVersion, err := strconv.Atoi(string(metadata["format-version"]))
+       if err != nil {
+               return nil, fmt.Errorf("manifest file's 'format-version' 
metadata is invalid: %w", err)
+       }
+       if formatVersion != file.Version() {
+               return nil, fmt.Errorf("manifest file's 'format-version' 
metadata indicates version %d, but entry from manifest list indicates version 
%d",
+                       formatVersion, file.Version())
+       }
+
+       var content ManifestContent
+       switch contentStr := string(metadata["content"]); contentStr {
+       case "data":
+               content = ManifestContentData
+       case "deletes":
+               content = ManifestContentDeletes
+       default:
+               return nil, fmt.Errorf("manifest file's 'content' metadata is 
invalid, should be \"data\" or \"deletes\" but instead is %q",
+                       contentStr)
+       }
+       if content != file.ManifestContent() {
+               return nil, fmt.Errorf("manifest file's 'content' metadata 
indicates %q, but entry from manifest list indicates %q",
+                       content.String(), file.ManifestContent().String())
+       }
+
+       isFallback := false
+       if formatVersion == 1 {
+               for _, f := range sc.(*avro.RecordSchema).Fields() {
+                       if f.Name() == "snapshot_id" {
+                               if f.Type().Type() != avro.Union {
+                                       isFallback = true
+                               }
+
+                               break
+                       }
+               }
+       }
+       fieldNameToID, fieldIDToType := getFieldIDMap(sc)
+
+       return &ManifestReader{
+               dec:           dec,
+               file:          file,
+               formatVersion: formatVersion,
+               isFallback:    isFallback,
+               content:       content,
+               fieldNameToID: fieldNameToID,
+               fieldIDToType: fieldIDToType,
+       }, nil
+}
+
+// Version returns the file's format version.
+func (c *ManifestReader) Version() int {
+       return c.formatVersion
+}
+
+// ManifestContent returns the type of content in the manifest file.
+func (c *ManifestReader) ManifestContent() ManifestContent {
+       return c.content
+}
+
+// SchemaID returns the schema ID encoded in the avro file's metadata.
+func (c *ManifestReader) SchemaID() (int, error) {
+       id, err := strconv.Atoi(string(c.dec.Metadata()["schema-id"]))
+       if err != nil {
+               return 0, fmt.Errorf("manifest file's 'schema-id' metadata is 
invalid: %w", err)
+       }
+
+       return id, nil
+}
+
+// Schema returns the schema encoded in the avro file's metadata.
+func (c *ManifestReader) Schema() (*Schema, error) {
+       if !c.schemaLoaded {
+               schemaID, err := c.SchemaID()
+               if err != nil {
+                       return nil, err
+               }
+               if err := json.Unmarshal(c.dec.Metadata()["schema"], 
&c.schema); err != nil {
+                       return nil, fmt.Errorf("manifest file's 'schema' 
metadata is invalid: %w", err)
+               }
+               c.schema.ID = schemaID
+               c.schemaLoaded = true
+       }

Review Comment:
   That is a very good question. I am not used to "readers" necessarily being 
thread-safe, so my inclination would be to not change it. But that does suggest 
I should update the docs to state clearly that the reader is not safe for 
concurrent use.
   
   I'm happy to make it thread-safe, if you think that would be a better 
developer experience (DX) and would make it less likely for users to get 
themselves into trouble.
   
   



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to