zeroshade commented on code in PR #522:
URL: https://github.com/apache/iceberg-go/pull/522#discussion_r2255187453
##########
table/evaluators.go:
##########
@@ -1195,3 +1195,369 @@ func (m *inclusiveMetricsEval) VisitNotStartsWith(t
iceberg.BoundTerm, lit icebe
return rowsMightMatch
}
+
+func newStrictMetricsEvaluator(s *iceberg.Schema, expr
iceberg.BooleanExpression,
+ caseSensitive bool, includeEmptyFiles bool,
+) (func(iceberg.DataFile) (bool, error), error) {
+ rewritten, err := iceberg.RewriteNotExpr(expr)
+ if err != nil {
+ return nil, err
+ }
+
+ bound, err := iceberg.BindExpr(s, rewritten, caseSensitive)
+ if err != nil {
+ return nil, err
+ }
+
+ return (&strictMetricsEval{
+ st: s.AsStruct(),
+ includeEmptyFiles: includeEmptyFiles,
+ expr: bound,
+ }).Eval, nil
+}
+
+type strictMetricsEval struct {
+ metricsEvaluator
+
+ st iceberg.StructType
+ expr iceberg.BooleanExpression
+ includeEmptyFiles bool
+}
+
+func (m *strictMetricsEval) Eval(file iceberg.DataFile) (bool, error) {
+ if !m.includeEmptyFiles && file.Count() <= 0 {
+ return rowsMustMatch, nil
+ }
+
+ // avoid race condition while maintaining existing state
+ ev := strictMetricsEval{
+ st: m.st,
+ includeEmptyFiles: m.includeEmptyFiles,
+ expr: m.expr,
+ }
+
+ ev.valueCounts, ev.nullCounts = file.ValueCounts(),
file.NullValueCounts()
+ ev.nanCounts = file.NaNValueCounts()
+ ev.lowerBounds, ev.upperBounds = file.LowerBoundValues(),
file.UpperBoundValues()
+
+ return iceberg.VisitExpr(m.expr, &ev)
+}
+
+func (m *strictMetricsEval) VisitUnbound(iceberg.UnboundPredicate) bool {
+ panic("need bound predicate")
+}
+
+func (m *strictMetricsEval) VisitBound(pred iceberg.BoundPredicate) bool {
+ return iceberg.VisitBoundPredicate(pred, m)
+}
+
+func (m *strictMetricsEval) VisitIsNull(t iceberg.BoundTerm) bool {
+ fieldID := t.Ref().Field().ID
+ if m.containsNullsOnly(fieldID) {
+ return rowsMustMatch
+ }
+
+ return rowsMightNotMatch
+}
+
+func (m *strictMetricsEval) VisitNotNull(t iceberg.BoundTerm) bool {
+ fieldID := t.Ref().Field().ID
+ if cnt, exists := m.nullCounts[fieldID]; exists && cnt == 0 {
+ return rowsMustMatch
+ }
+
+ return rowsMightNotMatch
+}
+
+func (m *strictMetricsEval) VisitIsNan(t iceberg.BoundTerm) bool {
+ fieldID := t.Ref().Field().ID
+
+ if m.containsNansOnly(fieldID) {
+ return rowsMustMatch
+ }
+
+ return rowsMightNotMatch
+}
+
+func (m *strictMetricsEval) VisitNotNan(t iceberg.BoundTerm) bool {
+ fieldID := t.Ref().Field().ID
+
+ if cnt, exists := m.nanCounts[fieldID]; exists && cnt == 0 {
+ return rowsMustMatch
+ }
+
+ if m.containsNullsOnly(fieldID) {
+ return rowsMustMatch
+ }
+
+ return rowsMightNotMatch
+}
+
+func (m *strictMetricsEval) VisitLess(t iceberg.BoundTerm, lit
iceberg.Literal) bool {
+ field := t.Ref().Field()
+ fieldID := field.ID
+
+ if m.canContainNulls(fieldID) || m.canContainNans(fieldID) {
+ return rowsMightNotMatch
+ }
+
+ if upperBoundBytes := m.upperBounds[fieldID]; upperBoundBytes != nil {
+ upperBound, err := iceberg.LiteralFromBytes(field.Type,
upperBoundBytes)
+ if err != nil {
+ panic(err)
+ }
+
+ if getCmpLiteral(upperBound)(upperBound, lit) < 0 {
+ return rowsMustMatch
+ }
+ }
+
+ return rowsMightNotMatch
+}
+
+func (m *strictMetricsEval) VisitLessEqual(t iceberg.BoundTerm, lit
iceberg.Literal) bool {
+ field := t.Ref().Field()
+ fieldID := field.ID
+
+ if m.canContainNulls(fieldID) || m.canContainNans(fieldID) {
+ return rowsMightNotMatch
+ }
+
+ if upperBoundBytes := m.upperBounds[fieldID]; upperBoundBytes != nil {
+ upperBound, err := iceberg.LiteralFromBytes(field.Type,
upperBoundBytes)
+ if err != nil {
+ panic(err)
+ }
+
+ if getCmpLiteral(upperBound)(upperBound, lit) <= 0 {
+ return rowsMustMatch
+ }
+ }
+
+ return rowsMightNotMatch
+}
+
+func (m *strictMetricsEval) VisitGreater(t iceberg.BoundTerm, lit
iceberg.Literal) bool {
+ field := t.Ref().Field()
+ fieldID := field.ID
+
+ if m.canContainNulls(fieldID) || m.canContainNans(fieldID) {
+ return rowsMightNotMatch
+ }
+
+ if lowerBoundBytes := m.lowerBounds[fieldID]; lowerBoundBytes != nil {
+ lowerBound, err := iceberg.LiteralFromBytes(field.Type,
lowerBoundBytes)
+ if err != nil {
+ panic(err)
+ }
+
+ if m.isNan(lowerBound) {
+ // NaN indicates unreliable bounds.
+ return rowsMightNotMatch
+ }
+
+ if getCmpLiteral(lowerBound)(lowerBound, lit) > 0 {
+ return rowsMustMatch
+ }
+ }
+
+ return rowsMightNotMatch
+}
+
+func (m *strictMetricsEval) VisitGreaterEqual(t iceberg.BoundTerm, lit
iceberg.Literal) bool {
+ field := t.Ref().Field()
+ fieldID := field.ID
+
+ if m.canContainNulls(fieldID) || m.canContainNans(fieldID) {
+ return rowsMightNotMatch
+ }
+
+ if lowerBoundBytes := m.lowerBounds[fieldID]; lowerBoundBytes != nil {
+ lowerBound, err := iceberg.LiteralFromBytes(field.Type,
lowerBoundBytes)
+ if err != nil {
+ panic(err)
+ }
+
+ if m.isNan(lowerBound) {
+ // NaN indicates unreliable bounds.
+ return rowsMightNotMatch
+ }
+
+ if getCmpLiteral(lowerBound)(lowerBound, lit) >= 0 {
+ return rowsMustMatch
+ }
+ }
+
+ return rowsMightNotMatch
+}
+
+func (m *strictMetricsEval) VisitEqual(t iceberg.BoundTerm, lit
iceberg.Literal) bool {
+ field := t.Ref().Field()
+ fieldID := field.ID
+
+ if m.canContainNulls(fieldID) || m.canContainNans(fieldID) {
+ return rowsMightNotMatch
+ }
+
+ lowerBytes := m.lowerBounds[fieldID]
+ upperBytes := m.upperBounds[fieldID]
+
+ if lowerBytes != nil && upperBytes != nil {
+ lowerBound, err := iceberg.LiteralFromBytes(field.Type,
lowerBytes)
+ if err != nil {
+ panic(err)
+ }
+ upperBound, err := iceberg.LiteralFromBytes(field.Type,
upperBytes)
+ if err != nil {
+ panic(err)
+ }
+ if getCmpLiteral(lowerBound)(lowerBound, lit) != 0 ||
getCmpLiteral(upperBound)(upperBound, lit) != 0 {
+ return rowsMightNotMatch
+ } else {
+ return rowsMustMatch
+ }
+ }
+
+ return rowsMightNotMatch
+}
+
+func (m *strictMetricsEval) VisitNotEqual(t iceberg.BoundTerm, lit
iceberg.Literal) bool {
+ field := t.Ref().Field()
+ fieldID := field.ID
+
+ if m.canContainNulls(fieldID) || m.canContainNans(fieldID) {
+ return rowsMustMatch
+ }
+
+ var cmp func(iceberg.Literal, iceberg.Literal) int
+ if lowerBoundBytes := m.lowerBounds[fieldID]; lowerBoundBytes != nil {
+ lowerBound, err := iceberg.LiteralFromBytes(field.Type,
lowerBoundBytes)
+ if err != nil {
+ panic(err)
+ }
+
+ if m.isNan(lowerBound) {
+ return rowsMightNotMatch
+ }
+
+ cmp = getCmpLiteral(lowerBound)
+ if cmp(lowerBound, lit) == 1 {
+ return rowsMustMatch
+ }
+ }
+
+ if upperBoundBytes := m.upperBounds[fieldID]; upperBoundBytes != nil {
+ upperBound, err := iceberg.LiteralFromBytes(field.Type,
upperBoundBytes)
+ if err != nil {
+ panic(err)
+ }
+
+ if m.isNan(upperBound) {
+ return rowsMightNotMatch
+ }
+
+ if cmp(upperBound, lit) == -1 {
+ return rowsMustMatch
+ }
Review Comment:
If there is no lower bound in the stats, this will panic because `cmp` will
be nil: it is only assigned inside the lower-bound branch, so it never gets
set when the lower bound is missing. We should avoid that :smile:
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]