jhump commented on code in PR #417: URL: https://github.com/apache/iceberg-go/pull/417#discussion_r2075959498
########## manifest.go: ########## @@ -614,6 +559,211 @@ func decodeManifests[I interface { return results, dec.Error() } +// ManifestReader reads the metadata and data from an avro manifest file. +type ManifestReader struct { + dec *ocf.Decoder + file ManifestFile + formatVersion int + isFallback bool + content ManifestContent + fieldNameToID map[string]int + fieldIDToType map[int]avro.LogicalType + + // The rest are lazily populated, on demand. Most readers + // will likely only try to load the entries. + schema Schema + schemaLoaded bool + partitionSpec PartitionSpec + partitionSpecLoaded bool +} + +// NewManifestReader returns a value that can read the contents of an avro manifest +// file. If the caller is interested in the manifest entries in the file, it must call +// [ManifestReader.Entries] before closing the provided reader. +func NewManifestReader(file ManifestFile, in io.Reader) (*ManifestReader, error) { + dec, err := ocf.NewDecoder(in, ocf.WithDecoderSchemaCache(&avro.SchemaCache{})) + if err != nil { + return nil, err + } + + metadata := dec.Metadata() + sc := dec.Schema() + + formatVersion, err := strconv.Atoi(string(metadata["format-version"])) + if err != nil { + return nil, fmt.Errorf("manifest file's 'format-version' metadata is invalid: %w", err) + } + if formatVersion != file.Version() { + return nil, fmt.Errorf("manifest file's 'format-version' metadata indicates version %d, but entry from manifest list indicates version %d", + formatVersion, file.Version()) + } + + var content ManifestContent + switch contentStr := string(metadata["content"]); contentStr { + case "data": + content = ManifestContentData + case "deletes": + content = ManifestContentDeletes + default: + return nil, fmt.Errorf("manifest file's 'content' metadata is invalid, should be \"data\" or \"deletes\" but instead is %q", + contentStr) + } + if content != file.ManifestContent() { + return nil, fmt.Errorf("manifest file's 'content' metadata indicates %q, but entry from 
manifest list indicates %q", + content.String(), file.ManifestContent().String()) + } + + isFallback := false + if formatVersion == 1 { + for _, f := range sc.(*avro.RecordSchema).Fields() { + if f.Name() == "snapshot_id" { + if f.Type().Type() != avro.Union { + isFallback = true + } + + break + } + } + } + fieldNameToID, fieldIDToType := getFieldIDMap(sc) + + return &ManifestReader{ + dec: dec, + file: file, + formatVersion: formatVersion, + isFallback: isFallback, + content: content, + fieldNameToID: fieldNameToID, + fieldIDToType: fieldIDToType, + }, nil +} + +// Version returns the file's format version. +func (c *ManifestReader) Version() int { + return c.formatVersion +} + +// ManifestContent returns the type of content in the manifest file. +func (c *ManifestReader) ManifestContent() ManifestContent { + return c.content +} + +// SchemaID returns the schema ID encoded in the avro file's metadata. +func (c *ManifestReader) SchemaID() (int, error) { + id, err := strconv.Atoi(string(c.dec.Metadata()["schema-id"])) + if err != nil { + return 0, fmt.Errorf("manifest file's 'schema-id' metadata is invalid: %w", err) + } + + return id, nil +} + +// Schema returns the schema encoded in the avro file's metadata. +func (c *ManifestReader) Schema() (*Schema, error) { + if !c.schemaLoaded { + schemaID, err := c.SchemaID() + if err != nil { + return nil, err + } + if err := json.Unmarshal(c.dec.Metadata()["schema"], &c.schema); err != nil { + return nil, fmt.Errorf("manifest file's 'schema' metadata is invalid: %w", err) + } + c.schema.ID = schemaID + c.schemaLoaded = true + } Review Comment: That is a very good question. I am not used to "readers" necessarily being thread-safe, so my inclination would be to not change it. But that does suggest I should update the docs to be very clear about that. I'm happy to make it thread-safe, if you think that would be a better developer experience (DX) and would make it less likely for users to get themselves into trouble. 
-- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org For additional commands, e-mail: issues-help@iceberg.apache.org