zeroshade commented on code in PR #417: URL: https://github.com/apache/iceberg-go/pull/417#discussion_r2075982283
########## manifest.go: ########## @@ -614,6 +559,211 @@ func decodeManifests[I interface { return results, dec.Error() } +// ManifestReader reads the metadata and data from an avro manifest file. +type ManifestReader struct { + dec *ocf.Decoder + file ManifestFile + formatVersion int + isFallback bool + content ManifestContent + fieldNameToID map[string]int + fieldIDToType map[int]avro.LogicalType + + // The rest are lazily populated, on demand. Most readers + // will likely only try to load the entries. + schema Schema + schemaLoaded bool + partitionSpec PartitionSpec + partitionSpecLoaded bool +} + +// NewManifestReader returns a value that can read the contents of an avro manifest +// file. If the caller is interested in the manifest entries in the file, it must call +// [ManifestReader.Entries] before closing the provided reader. +func NewManifestReader(file ManifestFile, in io.Reader) (*ManifestReader, error) { + dec, err := ocf.NewDecoder(in, ocf.WithDecoderSchemaCache(&avro.SchemaCache{})) + if err != nil { + return nil, err + } + + metadata := dec.Metadata() + sc := dec.Schema() + + formatVersion, err := strconv.Atoi(string(metadata["format-version"])) + if err != nil { + return nil, fmt.Errorf("manifest file's 'format-version' metadata is invalid: %w", err) + } + if formatVersion != file.Version() { + return nil, fmt.Errorf("manifest file's 'format-version' metadata indicates version %d, but entry from manifest list indicates version %d", + formatVersion, file.Version()) + } + + var content ManifestContent + switch contentStr := string(metadata["content"]); contentStr { + case "data": + content = ManifestContentData + case "deletes": + content = ManifestContentDeletes + default: + return nil, fmt.Errorf("manifest file's 'content' metadata is invalid, should be \"data\" or \"deletes\" but instead is %q", + contentStr) + } + if content != file.ManifestContent() { + return nil, fmt.Errorf("manifest file's 'content' metadata indicates %q, but entry from 
manifest list indicates %q", + content.String(), file.ManifestContent().String()) + } + + isFallback := false + if formatVersion == 1 { + for _, f := range sc.(*avro.RecordSchema).Fields() { + if f.Name() == "snapshot_id" { + if f.Type().Type() != avro.Union { + isFallback = true + } + + break + } + } + } + fieldNameToID, fieldIDToType := getFieldIDMap(sc) + + return &ManifestReader{ + dec: dec, + file: file, + formatVersion: formatVersion, + isFallback: isFallback, + content: content, + fieldNameToID: fieldNameToID, + fieldIDToType: fieldIDToType, + }, nil +} + +// Version returns the file's format version. +func (c *ManifestReader) Version() int { + return c.formatVersion +} + +// ManifestContent returns the type of content in the manifest file. +func (c *ManifestReader) ManifestContent() ManifestContent { + return c.content +} + +// SchemaID returns the schema ID encoded in the avro file's metadata. +func (c *ManifestReader) SchemaID() (int, error) { + id, err := strconv.Atoi(string(c.dec.Metadata()["schema-id"])) + if err != nil { + return 0, fmt.Errorf("manifest file's 'schema-id' metadata is invalid: %w", err) + } + + return id, nil +} + +// Schema returns the schema encoded in the avro file's metadata. +func (c *ManifestReader) Schema() (*Schema, error) { + if !c.schemaLoaded { + schemaID, err := c.SchemaID() + if err != nil { + return nil, err + } + if err := json.Unmarshal(c.dec.Metadata()["schema"], &c.schema); err != nil { + return nil, fmt.Errorf("manifest file's 'schema' metadata is invalid: %w", err) + } + c.schema.ID = schemaID + c.schemaLoaded = true + } Review Comment: I'd be fine with updating the docs to note that the reader isn't goroutine-safe; that's sufficient for now unless someone gets bitten by it. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org