nastra commented on code in PR #9176: URL: https://github.com/apache/iceberg/pull/9176#discussion_r1473941848
########## spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/sql/TestAggregatePushDown.java: ########## @@ -478,6 +480,126 @@ public void testAggregateWithComplexType() { .isFalse(); } + @TestTemplate + public void testAggregationPushdownStructInteger() { + sql("CREATE TABLE %s (id BIGINT, struct_with_int STRUCT<c1:BIGINT>) USING iceberg", tableName); + sql("INSERT INTO TABLE %s VALUES (1, named_struct(\"c1\", NULL))", tableName); + sql("INSERT INTO TABLE %s VALUES (2, named_struct(\"c1\", 2))", tableName); + sql("INSERT INTO TABLE %s VALUES (3, named_struct(\"c1\", 3))", tableName); + + String query = "SELECT COUNT(%s), MAX(%s), MIN(%s) FROM %s"; + String aggField = "struct_with_int.c1"; + assertAggregates(sql(query, aggField, aggField, aggField, tableName), 2L, 3L, 2L); + assertExplainContains( + sql("EXPLAIN " + query, aggField, aggField, aggField, tableName), + "count(struct_with_int.c1)", + "max(struct_with_int.c1)", + "min(struct_with_int.c1)"); + } + + @TestTemplate + public void testAggregationPushdownNestedStruct() { + sql( + "CREATE TABLE %s (id BIGINT, struct_with_int STRUCT<c1:STRUCT<c2:STRUCT<c3:STRUCT<c4:BIGINT>>>>) USING iceberg", + tableName); + sql( + "INSERT INTO TABLE %s VALUES (1, named_struct(\"c1\", named_struct(\"c2\", named_struct(\"c3\", named_struct(\"c4\", NULL)))))", + tableName); + sql( + "INSERT INTO TABLE %s VALUES (2, named_struct(\"c1\", named_struct(\"c2\", named_struct(\"c3\", named_struct(\"c4\", 2)))))", + tableName); + sql( + "INSERT INTO TABLE %s VALUES (3, named_struct(\"c1\", named_struct(\"c2\", named_struct(\"c3\", named_struct(\"c4\", 3)))))", + tableName); + + String query = "SELECT COUNT(%s), MAX(%s), MIN(%s) FROM %s"; + String aggField = "struct_with_int.c1.c2.c3.c4"; + + assertAggregates(sql(query, aggField, aggField, aggField, tableName), 2L, 3L, 2L); + + assertExplainContains( + sql("EXPLAIN " + query, aggField, aggField, aggField, tableName), + "count(struct_with_int.c1.c2.c3.c4)", + "max(struct_with_int.c1.c2.c3.c4)", + "min(struct_with_int.c1.c2.c3.c4)"); + } + + @TestTemplate + public void testAggregationPushdownStructTimestamp() { + sql( + "CREATE TABLE %s (id BIGINT, struct_with_ts STRUCT<c1:TIMESTAMP>) USING iceberg", + tableName); + sql("INSERT INTO TABLE %s VALUES (1, named_struct(\"c1\", NULL))", tableName); + sql( + "INSERT INTO TABLE %s VALUES (2, named_struct(\"c1\", timestamp('2023-01-30T22:22:22Z')))", + tableName); + sql( + "INSERT INTO TABLE %s VALUES (3, named_struct(\"c1\", timestamp('2023-01-30T22:23:23Z')))", + tableName); + + String query = "SELECT COUNT(%s), MAX(%s), MIN(%s) FROM %s"; + String aggField = "struct_with_ts.c1"; + + assertAggregates( + sql(query, aggField, aggField, aggField, tableName), + 2L, + new Timestamp(1675117403000L), + new Timestamp(1675117342000L)); + + assertExplainContains( + sql("EXPLAIN " + query, aggField, aggField, aggField, tableName), + "count(struct_with_ts.c1)", + "max(struct_with_ts.c1)", + "min(struct_with_ts.c1)"); + } + + @TestTemplate + public void testAggregationPushdownOnBucketedColumn() { + sql( + "CREATE TABLE %s (id BIGINT, struct_with_int STRUCT<c1:INT>) USING iceberg PARTITIONED BY (bucket(8, id))", + tableName); + + sql("INSERT INTO TABLE %s VALUES (1, named_struct(\"c1\", NULL))", tableName); + sql("INSERT INTO TABLE %s VALUES (null, named_struct(\"c1\", 2))", tableName); + sql("INSERT INTO TABLE %s VALUES (2, named_struct(\"c1\", 3))", tableName); + + String query = "SELECT COUNT(%s), MAX(%s), MIN(%s) FROM %s"; + String aggField = "id"; + assertAggregates(sql(query, aggField, aggField, aggField, tableName), 2L, 2L, 1L); + assertExplainContains( + sql("EXPLAIN " + query, aggField, aggField, aggField, tableName), + "count(id)", + "max(id)", + "min(id)"); + } + + private void assertAggregates( + List<Object[]> actual, Object expectedCount, Object expectedMax, Object expectedMin) { + Object actualCount = actual.get(0)[0]; + Object actualMax = actual.get(0)[1]; + Object actualMin = actual.get(0)[2]; + + Assertions.assertThat(actualCount) + .as("Expected and actual count should equal") + .isEqualTo(expectedCount); + Assertions.assertThat(actualMax) + .as("Expected and actual max should equal") + .isEqualTo(expectedMax); + Assertions.assertThat(actualMin) + .as("Expected and actual min should equal") + .isEqualTo(expectedMin); + } + + private void assertExplainContains(List<Object[]> explain, String... expectedFragments) { + String explainString = explain.get(0)[0].toString().toLowerCase(Locale.ROOT); + Arrays.stream(expectedFragments) + .forEach( + fragment -> + Assertions.assertThat(explainString.contains(fragment)) Review Comment: this should be `Assertions.assertThat(explainString).as(...).contains(fragment)`. We typically try to avoid usage of `isTrue()` / `isFalse()` on assertions like these because they don't provide any contextual insight when an assertion fails. On the other hand, using `assertThat(explainString).as(...).contains(fragment)` will always show the content of `explainString` and `fragment` in case the assertion fails. Also the `.as()` typically needs to be specified before the final assertion and will be ignored otherwise. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org