Re: [PR] feat(evaluator): Add strict metrics evaluator [iceberg-go]

via GitHub Tue, 05 Aug 2025 12:48:30 -0700


zeroshade commented on code in PR #522:
URL: https://github.com/apache/iceberg-go/pull/522#discussion_r2255187453



##########
table/evaluators.go:
##########
@@ -1195,3 +1195,369 @@ func (m *inclusiveMetricsEval) VisitNotStartsWith(t 
iceberg.BoundTerm, lit icebe
 
        return rowsMightMatch
 }
+
+func newStrictMetricsEvaluator(s *iceberg.Schema, expr 
iceberg.BooleanExpression,
+       caseSensitive bool, includeEmptyFiles bool,
+) (func(iceberg.DataFile) (bool, error), error) {
+       rewritten, err := iceberg.RewriteNotExpr(expr)
+       if err != nil {
+               return nil, err
+       }
+
+       bound, err := iceberg.BindExpr(s, rewritten, caseSensitive)
+       if err != nil {
+               return nil, err
+       }
+
+       return (&strictMetricsEval{
+               st:                s.AsStruct(),
+               includeEmptyFiles: includeEmptyFiles,
+               expr:              bound,
+       }).Eval, nil
+}
+
+type strictMetricsEval struct {
+       metricsEvaluator
+
+       st                iceberg.StructType
+       expr              iceberg.BooleanExpression
+       includeEmptyFiles bool
+}
+
+func (m *strictMetricsEval) Eval(file iceberg.DataFile) (bool, error) {
+       if !m.includeEmptyFiles && file.Count() <= 0 {
+               return rowsMustMatch, nil
+       }
+
+       // avoid race condition while maintaining existing state
+       ev := strictMetricsEval{
+               st:                m.st,
+               includeEmptyFiles: m.includeEmptyFiles,
+               expr:              m.expr,
+       }
+
+       ev.valueCounts, ev.nullCounts = file.ValueCounts(), 
file.NullValueCounts()
+       ev.nanCounts = file.NaNValueCounts()
+       ev.lowerBounds, ev.upperBounds = file.LowerBoundValues(), 
file.UpperBoundValues()
+
+       return iceberg.VisitExpr(m.expr, &ev)
+}
+
+func (m *strictMetricsEval) VisitUnbound(iceberg.UnboundPredicate) bool {
+       panic("need bound predicate")
+}
+
+func (m *strictMetricsEval) VisitBound(pred iceberg.BoundPredicate) bool {
+       return iceberg.VisitBoundPredicate(pred, m)
+}
+
+func (m *strictMetricsEval) VisitIsNull(t iceberg.BoundTerm) bool {
+       fieldID := t.Ref().Field().ID
+       if m.containsNullsOnly(fieldID) {
+               return rowsMustMatch
+       }
+
+       return rowsMightNotMatch
+}
+
+func (m *strictMetricsEval) VisitNotNull(t iceberg.BoundTerm) bool {
+       fieldID := t.Ref().Field().ID
+       if cnt, exists := m.nullCounts[fieldID]; exists && cnt == 0 {
+               return rowsMustMatch
+       }
+
+       return rowsMightNotMatch
+}
+
+func (m *strictMetricsEval) VisitIsNan(t iceberg.BoundTerm) bool {
+       fieldID := t.Ref().Field().ID
+
+       if m.containsNansOnly(fieldID) {
+               return rowsMustMatch
+       }
+
+       return rowsMightNotMatch
+}
+
+func (m *strictMetricsEval) VisitNotNan(t iceberg.BoundTerm) bool {
+       fieldID := t.Ref().Field().ID
+
+       if cnt, exists := m.nanCounts[fieldID]; exists && cnt == 0 {
+               return rowsMustMatch
+       }
+
+       if m.containsNullsOnly(fieldID) {
+               return rowsMustMatch
+       }
+
+       return rowsMightNotMatch
+}
+
+func (m *strictMetricsEval) VisitLess(t iceberg.BoundTerm, lit 
iceberg.Literal) bool {
+       field := t.Ref().Field()
+       fieldID := field.ID
+
+       if m.canContainNulls(fieldID) || m.canContainNans(fieldID) {
+               return rowsMightNotMatch
+       }
+
+       if upperBoundBytes := m.upperBounds[fieldID]; upperBoundBytes != nil {
+               upperBound, err := iceberg.LiteralFromBytes(field.Type, 
upperBoundBytes)
+               if err != nil {
+                       panic(err)
+               }
+
+               if getCmpLiteral(upperBound)(upperBound, lit) < 0 {
+                       return rowsMustMatch
+               }
+       }
+
+       return rowsMightNotMatch
+}
+
+func (m *strictMetricsEval) VisitLessEqual(t iceberg.BoundTerm, lit 
iceberg.Literal) bool {
+       field := t.Ref().Field()
+       fieldID := field.ID
+
+       if m.canContainNulls(fieldID) || m.canContainNans(fieldID) {
+               return rowsMightNotMatch
+       }
+
+       if upperBoundBytes := m.upperBounds[fieldID]; upperBoundBytes != nil {
+               upperBound, err := iceberg.LiteralFromBytes(field.Type, 
upperBoundBytes)
+               if err != nil {
+                       panic(err)
+               }
+
+               if getCmpLiteral(upperBound)(upperBound, lit) <= 0 {
+                       return rowsMustMatch
+               }
+       }
+
+       return rowsMightNotMatch
+}
+
+func (m *strictMetricsEval) VisitGreater(t iceberg.BoundTerm, lit 
iceberg.Literal) bool {
+       field := t.Ref().Field()
+       fieldID := field.ID
+
+       if m.canContainNulls(fieldID) || m.canContainNans(fieldID) {
+               return rowsMightNotMatch
+       }
+
+       if lowerBoundBytes := m.lowerBounds[fieldID]; lowerBoundBytes != nil {
+               lowerBound, err := iceberg.LiteralFromBytes(field.Type, 
lowerBoundBytes)
+               if err != nil {
+                       panic(err)
+               }
+
+               if m.isNan(lowerBound) {
+                       // NaN indicates unreliable bounds.
+                       return rowsMightNotMatch
+               }
+
+               if getCmpLiteral(lowerBound)(lowerBound, lit) > 0 {
+                       return rowsMustMatch
+               }
+       }
+
+       return rowsMightNotMatch
+}
+
+func (m *strictMetricsEval) VisitGreaterEqual(t iceberg.BoundTerm, lit 
iceberg.Literal) bool {
+       field := t.Ref().Field()
+       fieldID := field.ID
+
+       if m.canContainNulls(fieldID) || m.canContainNans(fieldID) {
+               return rowsMightNotMatch
+       }
+
+       if lowerBoundBytes := m.lowerBounds[fieldID]; lowerBoundBytes != nil {
+               lowerBound, err := iceberg.LiteralFromBytes(field.Type, 
lowerBoundBytes)
+               if err != nil {
+                       panic(err)
+               }
+
+               if m.isNan(lowerBound) {
+                       // NaN indicates unreliable bounds.
+                       return rowsMightNotMatch
+               }
+
+               if getCmpLiteral(lowerBound)(lowerBound, lit) >= 0 {
+                       return rowsMustMatch
+               }
+       }
+
+       return rowsMightNotMatch
+}
+
+func (m *strictMetricsEval) VisitEqual(t iceberg.BoundTerm, lit 
iceberg.Literal) bool {
+       field := t.Ref().Field()
+       fieldID := field.ID
+
+       if m.canContainNulls(fieldID) || m.canContainNans(fieldID) {
+               return rowsMightNotMatch
+       }
+
+       lowerBytes := m.lowerBounds[fieldID]
+       upperBytes := m.upperBounds[fieldID]
+
+       if lowerBytes != nil && upperBytes != nil {
+               lowerBound, err := iceberg.LiteralFromBytes(field.Type, 
lowerBytes)
+               if err != nil {
+                       panic(err)
+               }
+               upperBound, err := iceberg.LiteralFromBytes(field.Type, 
upperBytes)
+               if err != nil {
+                       panic(err)
+               }
+               if getCmpLiteral(lowerBound)(lowerBound, lit) != 0 || 
getCmpLiteral(upperBound)(upperBound, lit) != 0 {
+                       return rowsMightNotMatch
+               } else {
+                       return rowsMustMatch
+               }
+       }
+
+       return rowsMightNotMatch
+}
+
+func (m *strictMetricsEval) VisitNotEqual(t iceberg.BoundTerm, lit 
iceberg.Literal) bool {
+       field := t.Ref().Field()
+       fieldID := field.ID
+
+       if m.canContainNulls(fieldID) || m.canContainNans(fieldID) {
+               return rowsMustMatch
+       }
+
+       var cmp func(iceberg.Literal, iceberg.Literal) int
+       if lowerBoundBytes := m.lowerBounds[fieldID]; lowerBoundBytes != nil {
+               lowerBound, err := iceberg.LiteralFromBytes(field.Type, 
lowerBoundBytes)
+               if err != nil {
+                       panic(err)
+               }
+
+               if m.isNan(lowerBound) {
+                       return rowsMightNotMatch
+               }
+
+               cmp = getCmpLiteral(lowerBound)
+               if cmp(lowerBound, lit) == 1 {
+                       return rowsMustMatch
+               }
+       }
+
+       if upperBoundBytes := m.upperBounds[fieldID]; upperBoundBytes != nil {
+               upperBound, err := iceberg.LiteralFromBytes(field.Type, 
upperBoundBytes)
+               if err != nil {
+                       panic(err)
+               }
+
+               if m.isNan(upperBound) {
+                       return rowsMightNotMatch
+               }
+
+               if cmp(upperBound, lit) == -1 {
+                       return rowsMustMatch
+               }

Review Comment:
   If there is no lower bound in the stats, this will panic because `cmp` will 
be nil: it is only assigned inside the lower-bound branch above, so it never 
gets set when the lower bound is missing. We should avoid that :smile:



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Re: [PR] feat(evaluator): Add strict metrics evaluator [iceberg-go]

Reply via email to