szehon-ho commented on code in PR #10203: URL: https://github.com/apache/iceberg/pull/10203#discussion_r1640361591
########## core/src/main/java/org/apache/iceberg/BaseEntriesTable.java: ########## @@ -94,6 +105,188 @@ static CloseableIterable<FileScanTask> planFiles( table, manifest, projectedSchema, schemaString, specString, residuals)); } + /** + * Evaluates an {@link Expression} on a {@link ManifestFile} to test whether a given data or + * delete manifests shall be included in the scan + */ + private static class ManifestContentEvaluator { + + private static final Set<Integer> DELETE_CONTENT_SET = + ImmutableSet.of(FileContent.EQUALITY_DELETES.id(), FileContent.POSITION_DELETES.id()); Review Comment: Nit: I feel we can just inline this, if its just used in one place? eg., content == FileContent.EQUALITY_DELETES || content == FileCOntent.POSITION_DELETES ########## core/src/main/java/org/apache/iceberg/BaseEntriesTable.java: ########## @@ -94,6 +105,188 @@ static CloseableIterable<FileScanTask> planFiles( table, manifest, projectedSchema, schemaString, specString, residuals)); } + /** + * Evaluates an {@link Expression} on a {@link ManifestFile} to test whether a given data or + * delete manifests shall be included in the scan + */ + private static class ManifestContentEvaluator { + + private static final Set<Integer> DELETE_CONTENT_SET = + ImmutableSet.of(FileContent.EQUALITY_DELETES.id(), FileContent.POSITION_DELETES.id()); + private final Expression boundExpr; + + private ManifestContentEvaluator( + Expression expr, Types.StructType structType, boolean caseSensitive) { + Expression rewritten = Expressions.rewriteNot(expr); + this.boundExpr = Binder.bind(structType, rewritten, caseSensitive); + } + + private boolean eval(ManifestFile manifest) { + return new ManifestEvalVisitor().eval(manifest); + } + + private class ManifestEvalVisitor extends ExpressionVisitors.BoundExpressionVisitor<Boolean> { + + private int manifestContentId; + + private static final boolean ROWS_MIGHT_MATCH = true; + private static final boolean ROWS_CANNOT_MATCH = false; + + private boolean eval(ManifestFile manifestFile) { + this.manifestContentId = manifestFile.content().id(); + return ExpressionVisitors.visitEvaluator(boundExpr, this); + } + + @Override + public Boolean alwaysTrue() { + return ROWS_MIGHT_MATCH; + } + + @Override + public Boolean alwaysFalse() { + return ROWS_CANNOT_MATCH; + } + + @Override + public Boolean not(Boolean result) { + return !result; + } + + @Override + public Boolean and(Boolean leftResult, Boolean rightResult) { + return leftResult && rightResult; + } + + @Override + public Boolean or(Boolean leftResult, Boolean rightResult) { + return leftResult || rightResult; + } + + @Override + public <T> Boolean isNull(BoundReference<T> ref) { + if (isDateFileContent(ref)) { + return ROWS_CANNOT_MATCH; // date_file.content should not be null + } else { + return ROWS_MIGHT_MATCH; + } + } + + @Override + public <T> Boolean notNull(BoundReference<T> ref) { + return ROWS_MIGHT_MATCH; + } + + @Override + public <T> Boolean isNaN(BoundReference<T> ref) { + if (isDateFileContent(ref)) { + return ROWS_CANNOT_MATCH; // date_file.content should not be nan + } else { + return ROWS_MIGHT_MATCH; + } + } + + @Override + public <T> Boolean notNaN(BoundReference<T> ref) { + return ROWS_MIGHT_MATCH; + } + + @Override + public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) { + return compareDateFileContent(ref, lit, compareResult -> compareResult < 0); + } + + @Override + public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) { + return compareDateFileContent(ref, lit, compareResult -> compareResult <= 0); + } + + @Override + public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) { + return compareDateFileContent(ref, lit, compareResult -> compareResult > 0); + } + + @Override + public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) { + return compareDateFileContent(ref, lit, compareResult -> compareResult >= 0); + } + + @Override + public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) { + return compareDateFileContent(ref, lit, compareResult -> compareResult == 0); + } + + @Override + public <T> Boolean notEq(BoundReference<T> ref, Literal<T> lit) { + return compareDateFileContent(ref, lit, compareResult -> compareResult != 0); + } + + @Override + public <T> Boolean in(BoundReference<T> ref, Set<T> literalSet) { + if (isDateFileContent(ref)) { + if (!literalSet.contains(manifestContentId)) { + return ROWS_CANNOT_MATCH; + } + } + return ROWS_MIGHT_MATCH; + } + + @Override + public <T> Boolean notIn(BoundReference<T> ref, Set<T> literalSet) { + if (isDateFileContent(ref)) { + if (literalSet.contains(manifestContentId)) { + return ROWS_CANNOT_MATCH; + } + } + return ROWS_MIGHT_MATCH; + } + + @Override + public <T> Boolean startsWith(BoundReference<T> ref, Literal<T> lit) { + return ROWS_MIGHT_MATCH; + } + + @Override + public <T> Boolean notStartsWith(BoundReference<T> ref, Literal<T> lit) { + return ROWS_MIGHT_MATCH; + } + + /** + * Comparison of data file content and literal, using integer comparator. + * + * @param ref bound reference, comparison attempted only if reference is for date_file.content + * @param lit literal value to compare with date file content. + * @param desiredResult function to apply to int comparator result, returns true if result is + * as expected. + * @return false if comparator does not achieve desired result, true otherwise + */ + private <T> Boolean compareDateFileContent( Review Comment: Is it overengineering to have desiredResult lambda here, if we support mostly comparing enums. Maybe have this method just return true if match (manifest/file content matches) and false if not match? ########## core/src/main/java/org/apache/iceberg/BaseEntriesTable.java: ########## @@ -94,6 +105,188 @@ static CloseableIterable<FileScanTask> planFiles( table, manifest, projectedSchema, schemaString, specString, residuals)); } + /** + * Evaluates an {@link Expression} on a {@link ManifestFile} to test whether a given data or + * delete manifests shall be included in the scan + */ + private static class ManifestContentEvaluator { + + private static final Set<Integer> DELETE_CONTENT_SET = + ImmutableSet.of(FileContent.EQUALITY_DELETES.id(), FileContent.POSITION_DELETES.id()); + private final Expression boundExpr; + + private ManifestContentEvaluator( + Expression expr, Types.StructType structType, boolean caseSensitive) { + Expression rewritten = Expressions.rewriteNot(expr); + this.boundExpr = Binder.bind(structType, rewritten, caseSensitive); + } + + private boolean eval(ManifestFile manifest) { + return new ManifestEvalVisitor().eval(manifest); + } + + private class ManifestEvalVisitor extends ExpressionVisitors.BoundExpressionVisitor<Boolean> { + + private int manifestContentId; + + private static final boolean ROWS_MIGHT_MATCH = true; + private static final boolean ROWS_CANNOT_MATCH = false; + + private boolean eval(ManifestFile manifestFile) { + this.manifestContentId = manifestFile.content().id(); + return ExpressionVisitors.visitEvaluator(boundExpr, this); + } + + @Override + public Boolean alwaysTrue() { + return ROWS_MIGHT_MATCH; + } + + @Override + public Boolean alwaysFalse() { + return ROWS_CANNOT_MATCH; + } + + @Override + public Boolean not(Boolean result) { + return !result; + } + + @Override + public Boolean and(Boolean leftResult, Boolean rightResult) { + return leftResult && rightResult; + } + + @Override + public Boolean or(Boolean leftResult, Boolean rightResult) { + return leftResult || rightResult; + } + + @Override + public <T> Boolean isNull(BoundReference<T> ref) { + if (isDateFileContent(ref)) { + return ROWS_CANNOT_MATCH; // date_file.content should not be null + } else { + return ROWS_MIGHT_MATCH; + } + } + + @Override + public <T> Boolean notNull(BoundReference<T> ref) { + return ROWS_MIGHT_MATCH; + } + + @Override + public <T> Boolean isNaN(BoundReference<T> ref) { + if (isDateFileContent(ref)) { + return ROWS_CANNOT_MATCH; // date_file.content should not be nan + } else { + return ROWS_MIGHT_MATCH; + } + } + + @Override + public <T> Boolean notNaN(BoundReference<T> ref) { + return ROWS_MIGHT_MATCH; + } + + @Override + public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) { + return compareDateFileContent(ref, lit, compareResult -> compareResult < 0); + } + + @Override + public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) { + return compareDateFileContent(ref, lit, compareResult -> compareResult <= 0); + } + + @Override + public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) { + return compareDateFileContent(ref, lit, compareResult -> compareResult > 0); + } + + @Override + public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) { + return compareDateFileContent(ref, lit, compareResult -> compareResult >= 0); + } + + @Override + public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) { + return compareDateFileContent(ref, lit, compareResult -> compareResult == 0); + } + + @Override + public <T> Boolean notEq(BoundReference<T> ref, Literal<T> lit) { + return compareDateFileContent(ref, lit, compareResult -> compareResult != 0); + } + + @Override + public <T> Boolean in(BoundReference<T> ref, Set<T> literalSet) { + if (isDateFileContent(ref)) { + if (!literalSet.contains(manifestContentId)) { + return ROWS_CANNOT_MATCH; + } + } + return ROWS_MIGHT_MATCH; + } + + @Override + public <T> Boolean notIn(BoundReference<T> ref, Set<T> literalSet) { + if (isDateFileContent(ref)) { + if (literalSet.contains(manifestContentId)) { + return ROWS_CANNOT_MATCH; + } + } + return ROWS_MIGHT_MATCH; + } + + @Override + public <T> Boolean startsWith(BoundReference<T> ref, Literal<T> lit) { + return ROWS_MIGHT_MATCH; + } + + @Override + public <T> Boolean notStartsWith(BoundReference<T> ref, Literal<T> lit) { + return ROWS_MIGHT_MATCH; + } + + /** + * Comparison of data file content and literal, using integer comparator. + * + * @param ref bound reference, comparison attempted only if reference is for date_file.content + * @param lit literal value to compare with date file content. + * @param desiredResult function to apply to int comparator result, returns true if result is + * as expected. + * @return false if comparator does not achieve desired result, true otherwise + */ + private <T> Boolean compareDateFileContent( + BoundReference<T> ref, Literal<T> lit, Function<Integer, Boolean> desiredResult) { + if (isDateFileContent(ref)) { + Literal<Integer> intLit = lit.to(Types.IntegerType.get()); + int literalValueToCompare = toManifestContentId(intLit.value()); + int cmp = intLit.comparator().compare(manifestContentId, literalValueToCompare); + if (!desiredResult.apply(cmp)) { + return ROWS_CANNOT_MATCH; + } + } + return ROWS_MIGHT_MATCH; + } + + private <T> boolean isDateFileContent(BoundReference<T> ref) { Review Comment: wrong spelling 'date'. Let's call this method 'fileContent' I think dataFile content was also a confusing part of Iceberg code (probably because the class was first called DataFile and later Content was added to also let it support DeleteFile...). This also makes much code shorter as this method is used often. ########## core/src/main/java/org/apache/iceberg/BaseEntriesTable.java: ########## @@ -94,6 +105,188 @@ static CloseableIterable<FileScanTask> planFiles( table, manifest, projectedSchema, schemaString, specString, residuals)); } + /** + * Evaluates an {@link Expression} on a {@link ManifestFile} to test whether a given data or + * delete manifests shall be included in the scan + */ + private static class ManifestContentEvaluator { + + private static final Set<Integer> DELETE_CONTENT_SET = + ImmutableSet.of(FileContent.EQUALITY_DELETES.id(), FileContent.POSITION_DELETES.id()); + private final Expression boundExpr; + + private ManifestContentEvaluator( + Expression expr, Types.StructType structType, boolean caseSensitive) { + Expression rewritten = Expressions.rewriteNot(expr); + this.boundExpr = Binder.bind(structType, rewritten, caseSensitive); + } + + private boolean eval(ManifestFile manifest) { + return new ManifestEvalVisitor().eval(manifest); + } + + private class ManifestEvalVisitor extends ExpressionVisitors.BoundExpressionVisitor<Boolean> { + + private int manifestContentId; + + private static final boolean ROWS_MIGHT_MATCH = true; + private static final boolean ROWS_CANNOT_MATCH = false; + + private boolean eval(ManifestFile manifestFile) { + this.manifestContentId = manifestFile.content().id(); + return ExpressionVisitors.visitEvaluator(boundExpr, this); + } + + @Override + public Boolean alwaysTrue() { + return ROWS_MIGHT_MATCH; + } + + @Override + public Boolean alwaysFalse() { + return ROWS_CANNOT_MATCH; + } + + @Override + public Boolean not(Boolean result) { + return !result; + } + + @Override + public Boolean and(Boolean leftResult, Boolean rightResult) { + return leftResult && rightResult; + } + + @Override + public Boolean or(Boolean leftResult, Boolean rightResult) { + return leftResult || rightResult; + } + + @Override + public <T> Boolean isNull(BoundReference<T> ref) { + if (isDateFileContent(ref)) { + return ROWS_CANNOT_MATCH; // date_file.content should not be null + } else { + return ROWS_MIGHT_MATCH; + } + } + + @Override + public <T> Boolean notNull(BoundReference<T> ref) { + return ROWS_MIGHT_MATCH; + } + + @Override + public <T> Boolean isNaN(BoundReference<T> ref) { + if (isDateFileContent(ref)) { + return ROWS_CANNOT_MATCH; // date_file.content should not be nan + } else { + return ROWS_MIGHT_MATCH; + } + } + + @Override + public <T> Boolean notNaN(BoundReference<T> ref) { + return ROWS_MIGHT_MATCH; + } + + @Override + public <T> Boolean lt(BoundReference<T> ref, Literal<T> lit) { + return compareDateFileContent(ref, lit, compareResult -> compareResult < 0); + } + + @Override + public <T> Boolean ltEq(BoundReference<T> ref, Literal<T> lit) { + return compareDateFileContent(ref, lit, compareResult -> compareResult <= 0); + } + + @Override + public <T> Boolean gt(BoundReference<T> ref, Literal<T> lit) { + return compareDateFileContent(ref, lit, compareResult -> compareResult > 0); + } + + @Override + public <T> Boolean gtEq(BoundReference<T> ref, Literal<T> lit) { + return compareDateFileContent(ref, lit, compareResult -> compareResult >= 0); + } + + @Override + public <T> Boolean eq(BoundReference<T> ref, Literal<T> lit) { + return compareDateFileContent(ref, lit, compareResult -> compareResult == 0); + } + + @Override + public <T> Boolean notEq(BoundReference<T> ref, Literal<T> lit) { + return compareDateFileContent(ref, lit, compareResult -> compareResult != 0); + } + + @Override + public <T> Boolean in(BoundReference<T> ref, Set<T> literalSet) { + if (isDateFileContent(ref)) { + if (!literalSet.contains(manifestContentId)) { + return ROWS_CANNOT_MATCH; + } + } + return ROWS_MIGHT_MATCH; + } + + @Override + public <T> Boolean notIn(BoundReference<T> ref, Set<T> literalSet) { + if (isDateFileContent(ref)) { + if (literalSet.contains(manifestContentId)) { + return ROWS_CANNOT_MATCH; + } + } + return ROWS_MIGHT_MATCH; + } + + @Override + public <T> Boolean startsWith(BoundReference<T> ref, Literal<T> lit) { + return ROWS_MIGHT_MATCH; + } + + @Override + public <T> Boolean notStartsWith(BoundReference<T> ref, Literal<T> lit) { + return ROWS_MIGHT_MATCH; + } + + /** + * Comparison of data file content and literal, using integer comparator. + * + * @param ref bound reference, comparison attempted only if reference is for date_file.content + * @param lit literal value to compare with date file content. + * @param desiredResult function to apply to int comparator result, returns true if result is + * as expected. + * @return false if comparator does not achieve desired result, true otherwise + */ + private <T> Boolean compareDateFileContent( + BoundReference<T> ref, Literal<T> lit, Function<Integer, Boolean> desiredResult) { + if (isDateFileContent(ref)) { + Literal<Integer> intLit = lit.to(Types.IntegerType.get()); + int literalValueToCompare = toManifestContentId(intLit.value()); + int cmp = intLit.comparator().compare(manifestContentId, literalValueToCompare); Review Comment: I think this is a bit fragile. We are just depending on the fact that both Manifest content and dataFile content , id of data is less than id of delete. I would suggest not to support gt/lt pushdowns for that reason (you can always express it in terms of eq/neq) Can we not use comparator, and just use more hard coded equality cases? ie, ``` if (fileContentLit == DATA) return manifestContent == DATA else return manifestContent == DELETE ``` -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org