zeroshade commented on code in PR #417:
URL: https://github.com/apache/iceberg-go/pull/417#discussion_r2075982283


##########
manifest.go:
##########
@@ -614,6 +559,211 @@ func decodeManifests[I interface {
        return results, dec.Error()
 }
 
+// ManifestReader reads the metadata and data from an avro manifest file.
+type ManifestReader struct {
+       dec           *ocf.Decoder
+       file          ManifestFile
+       formatVersion int
+       isFallback    bool
+       content       ManifestContent
+       fieldNameToID map[string]int
+       fieldIDToType map[int]avro.LogicalType
+
+       // The rest are lazily populated, on demand. Most readers
+       // will likely only try to load the entries.
+       schema              Schema
+       schemaLoaded        bool
+       partitionSpec       PartitionSpec
+       partitionSpecLoaded bool
+}
+
+// NewManifestReader returns a value that can read the contents of an avro 
manifest
+// file. If the caller is interested in the manifest entries in the file, it 
must call
+// [ManifestReader.Entries] before closing the provided reader.
+func NewManifestReader(file ManifestFile, in io.Reader) (*ManifestReader, 
error) {
+       dec, err := ocf.NewDecoder(in, 
ocf.WithDecoderSchemaCache(&avro.SchemaCache{}))
+       if err != nil {
+               return nil, err
+       }
+
+       metadata := dec.Metadata()
+       sc := dec.Schema()
+
+       formatVersion, err := strconv.Atoi(string(metadata["format-version"]))
+       if err != nil {
+               return nil, fmt.Errorf("manifest file's 'format-version' 
metadata is invalid: %w", err)
+       }
+       if formatVersion != file.Version() {
+               return nil, fmt.Errorf("manifest file's 'format-version' 
metadata indicates version %d, but entry from manifest list indicates version 
%d",
+                       formatVersion, file.Version())
+       }
+
+       var content ManifestContent
+       switch contentStr := string(metadata["content"]); contentStr {
+       case "data":
+               content = ManifestContentData
+       case "deletes":
+               content = ManifestContentDeletes
+       default:
+               return nil, fmt.Errorf("manifest file's 'content' metadata is 
invalid, should be \"data\" or \"deletes\" but instead is %q",
+                       contentStr)
+       }
+       if content != file.ManifestContent() {
+               return nil, fmt.Errorf("manifest file's 'content' metadata 
indicates %q, but entry from manifest list indicates %q",
+                       content.String(), file.ManifestContent().String())
+       }
+
+       isFallback := false
+       if formatVersion == 1 {
+               for _, f := range sc.(*avro.RecordSchema).Fields() {
+                       if f.Name() == "snapshot_id" {
+                               if f.Type().Type() != avro.Union {
+                                       isFallback = true
+                               }
+
+                               break
+                       }
+               }
+       }
+       fieldNameToID, fieldIDToType := getFieldIDMap(sc)
+
+       return &ManifestReader{
+               dec:           dec,
+               file:          file,
+               formatVersion: formatVersion,
+               isFallback:    isFallback,
+               content:       content,
+               fieldNameToID: fieldNameToID,
+               fieldIDToType: fieldIDToType,
+       }, nil
+}
+
+// Version returns the file's format version.
+func (c *ManifestReader) Version() int {
+       return c.formatVersion
+}
+
+// ManifestContent returns the type of content in the manifest file.
+func (c *ManifestReader) ManifestContent() ManifestContent {
+       return c.content
+}
+
+// SchemaID returns the schema ID encoded in the avro file's metadata.
+func (c *ManifestReader) SchemaID() (int, error) {
+       id, err := strconv.Atoi(string(c.dec.Metadata()["schema-id"]))
+       if err != nil {
+               return 0, fmt.Errorf("manifest file's 'schema-id' metadata is 
invalid: %w", err)
+       }
+
+       return id, nil
+}
+
+// Schema returns the schema encoded in the avro file's metadata.
+func (c *ManifestReader) Schema() (*Schema, error) {
+       if !c.schemaLoaded {
+               schemaID, err := c.SchemaID()
+               if err != nil {
+                       return nil, err
+               }
+               if err := json.Unmarshal(c.dec.Metadata()["schema"], 
&c.schema); err != nil {
+                       return nil, fmt.Errorf("manifest file's 'schema' 
metadata is invalid: %w", err)
+               }
+               c.schema.ID = schemaID
+               c.schemaLoaded = true
+       }

Review Comment:
   I'd be fine with updating the docs to note that the reader isn't 
   goroutine-safe; that's sufficient for now unless someone gets bitten by it.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org

Reply via email to