rahil-c commented on code in PR #18375:
URL: https://github.com/apache/hudi/pull/18375#discussion_r3069732439
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestSparkSqlCoreFlow.scala:
##########
@@ -47,24 +48,43 @@ class TestSparkSqlCoreFlow extends HoodieSparkSqlTestBase {
val colsToCompare = "timestamp, _row_key, partition_path, rider, driver,
begin_lat, begin_lon, end_lat, end_lon, fare.amount, fare.currency,
_hoodie_is_deleted"
//params for core flow tests
- val params: List[String] = List(
-
"COPY_ON_WRITE|false|org.apache.hudi.keygen.SimpleKeyGenerator|GLOBAL_BLOOM",
-
"COPY_ON_WRITE|true|org.apache.hudi.keygen.SimpleKeyGenerator|GLOBAL_BLOOM",
-
"COPY_ON_WRITE|false|org.apache.hudi.keygen.SimpleKeyGenerator|GLOBAL_SIMPLE",
-
"COPY_ON_WRITE|true|org.apache.hudi.keygen.SimpleKeyGenerator|GLOBAL_SIMPLE",
-
"COPY_ON_WRITE|false|org.apache.hudi.keygen.NonpartitionedKeyGenerator|BLOOM",
-
"COPY_ON_WRITE|true|org.apache.hudi.keygen.NonpartitionedKeyGenerator|BLOOM",
-
"COPY_ON_WRITE|false|org.apache.hudi.keygen.NonpartitionedKeyGenerator|SIMPLE",
-
"COPY_ON_WRITE|true|org.apache.hudi.keygen.NonpartitionedKeyGenerator|SIMPLE",
-
"MERGE_ON_READ|false|org.apache.hudi.keygen.SimpleKeyGenerator|GLOBAL_BLOOM",
-
"MERGE_ON_READ|true|org.apache.hudi.keygen.SimpleKeyGenerator|GLOBAL_BLOOM",
-
"MERGE_ON_READ|false|org.apache.hudi.keygen.SimpleKeyGenerator|GLOBAL_SIMPLE",
-
"MERGE_ON_READ|true|org.apache.hudi.keygen.SimpleKeyGenerator|GLOBAL_SIMPLE",
-
"MERGE_ON_READ|false|org.apache.hudi.keygen.NonpartitionedKeyGenerator|BLOOM",
-
"MERGE_ON_READ|true|org.apache.hudi.keygen.NonpartitionedKeyGenerator|BLOOM",
-
"MERGE_ON_READ|false|org.apache.hudi.keygen.NonpartitionedKeyGenerator|SIMPLE",
-
"MERGE_ON_READ|true|org.apache.hudi.keygen.NonpartitionedKeyGenerator|SIMPLE"
- )
+ val params: List[String] = {
+ val allParams: List[String] = List(
+
"COPY_ON_WRITE|false|org.apache.hudi.keygen.SimpleKeyGenerator|GLOBAL_BLOOM|parquet",
+
"COPY_ON_WRITE|true|org.apache.hudi.keygen.SimpleKeyGenerator|GLOBAL_BLOOM|parquet",
+
"COPY_ON_WRITE|false|org.apache.hudi.keygen.SimpleKeyGenerator|GLOBAL_SIMPLE|parquet",
+
"COPY_ON_WRITE|true|org.apache.hudi.keygen.SimpleKeyGenerator|GLOBAL_SIMPLE|parquet",
+
"COPY_ON_WRITE|false|org.apache.hudi.keygen.NonpartitionedKeyGenerator|BLOOM|parquet",
+
"COPY_ON_WRITE|true|org.apache.hudi.keygen.NonpartitionedKeyGenerator|BLOOM|parquet",
+
"COPY_ON_WRITE|false|org.apache.hudi.keygen.NonpartitionedKeyGenerator|SIMPLE|parquet",
+
"COPY_ON_WRITE|true|org.apache.hudi.keygen.NonpartitionedKeyGenerator|SIMPLE|parquet",
+
"MERGE_ON_READ|false|org.apache.hudi.keygen.SimpleKeyGenerator|GLOBAL_BLOOM|parquet",
Review Comment:
**[Medium]** The test matrix doubled from 16→32 configs for `params` and
32→64 for `paramsForImmutable`, adding ~64 new parameterized test runs. Each
creates tables, writes, reads, and does compaction.
Consider reducing the Lance cross-product to a representative subset (e.g.
one partitioned + one non-partitioned keygen, one index type) rather than the
full cartesian product. Lance-specific edge cases are better covered by
targeted tests like the ones in `TestLanceDataSource`, not by running every
keygen/index permutation.
##########
hudi-spark-datasource/hudi-spark/src/test/scala/org/apache/hudi/functional/TestLanceDataSource.scala:
##########
@@ -21,7 +21,7 @@ import org.apache.hudi.DataSourceWriteOptions._
import org.apache.hudi.DefaultSparkRecordMerger
import org.apache.hudi.common.config.{HoodieCommonConfig, HoodieMetadataConfig}
import org.apache.hudi.common.engine.HoodieLocalEngineContext
-import org.apache.hudi.common.model.HoodieTableType
+import org.apache.hudi.common.model.{HoodieFileFormat, HoodieTableType}
Review Comment:
**[Medium]** The `SparkLanceReaderBase` change fixes `count(*)` returning 0
for Lance (by opening the file to get row count instead of returning
`Iterator.empty`). But there's no test that verifies `SELECT count(*) FROM
lance_table` returns the correct count. A one-liner assertion in
`testBasicSqlInsertOperations` would cover this.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]