kevinjqliu commented on code in PR #329: URL: https://github.com/apache/iceberg-go/pull/329#discussion_r1989722553
########## table/arrow_utils.go: ########## @@ -892,3 +899,356 @@ func ToRequestedSchema(ctx context.Context, requested, fileSchema *iceberg.Schem return out, nil } + +func must[T any](v T, err error) T { + if err != nil { + panic(err) + } + + return v +} + +type metricModeType string + +const ( + metricModeTruncate metricModeType = "truncate" + metricModeNone metricModeType = "none" + metricModeCounts metricModeType = "counts" + metricModeFull metricModeType = "full" +) + +type metricsMode struct { + typ metricModeType + len int +} + +var truncationExpr = regexp.MustCompile(`^truncate\((\d+)\)$`) + +func matchMetricsMode(mode string) (metricsMode, error) { + sanitized := strings.ToLower(strings.TrimSpace(mode)) + if strings.HasPrefix(sanitized, string(metricModeTruncate)) { + m := truncationExpr.FindStringSubmatch(sanitized) + if len(m) < 2 { + return metricsMode{}, fmt.Errorf("malformed truncate metrics mode: %s", mode) + } + + truncLen, err := strconv.Atoi(m[1]) + if err != nil { + return metricsMode{}, fmt.Errorf("malformed truncate metrics mode: %s", mode) + } + + if truncLen <= 0 { + return metricsMode{}, fmt.Errorf("invalid truncate length: %d", truncLen) + } + + return metricsMode{typ: metricModeTruncate, len: truncLen}, nil + } + + switch sanitized { + case string(metricModeNone): + return metricsMode{typ: metricModeNone}, nil + case string(metricModeCounts): + return metricsMode{typ: metricModeCounts}, nil + case string(metricModeFull): + return metricsMode{typ: metricModeFull}, nil + default: + return metricsMode{}, fmt.Errorf("unsupported metrics mode: %s", mode) + } +} + +type statisticsCollector struct { + fieldID int + icebergTyp iceberg.PrimitiveType + mode metricsMode + colName string +} + +type arrowStatsCollector struct { + fieldID int + schema *iceberg.Schema + props iceberg.Properties + defaultMode string +} + +func (a *arrowStatsCollector) Schema(_ *iceberg.Schema, results func() []statisticsCollector) []statisticsCollector { + return 
results() +} + +func (a *arrowStatsCollector) Struct(_ iceberg.StructType, results []func() []statisticsCollector) []statisticsCollector { + result := make([]statisticsCollector, 0, len(results)) + for _, res := range results { + result = append(result, res()...) + } + + return result +} + +func (a *arrowStatsCollector) Field(field iceberg.NestedField, fieldRes func() []statisticsCollector) []statisticsCollector { + a.fieldID = field.ID + + return fieldRes() +} + +func (a *arrowStatsCollector) List(list iceberg.ListType, elemResult func() []statisticsCollector) []statisticsCollector { + a.fieldID = list.ElementID + + return elemResult() +} + +func (a *arrowStatsCollector) Map(m iceberg.MapType, keyResult func() []statisticsCollector, valResult func() []statisticsCollector) []statisticsCollector { + a.fieldID = m.KeyID + keyRes := keyResult() + + a.fieldID = m.ValueID + valRes := valResult() + + return append(keyRes, valRes...) +} + +func (a *arrowStatsCollector) Primitive(dt iceberg.PrimitiveType) []statisticsCollector { + colName, ok := a.schema.FindColumnName(a.fieldID) + if !ok { + return []statisticsCollector{} + } + + metMode, err := matchMetricsMode(a.defaultMode) + if err != nil { + panic(err) + } + + colMode, ok := a.props[MetricsModeColumnConfPrefix+"."+colName] + if ok { + metMode, err = matchMetricsMode(colMode) + if err != nil { + panic(err) + } + } + + switch dt.(type) { + case iceberg.StringType: + case iceberg.BinaryType: + default: + if metMode.typ == metricModeTruncate { + metMode = metricsMode{typ: metricModeFull, len: 0} + } + } + + isNested := strings.Contains(colName, ".") Review Comment: This might be an issue for columns that have a `.` in their names: because `isNested` is derived from `strings.Contains(colName, ".")`, a top-level column whose name contains a `.` would be treated as nested. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org