nastra commented on code in PR #9902: URL: https://github.com/apache/iceberg/pull/9902#discussion_r1528029346
########## spark/v3.5/spark/src/test/java/org/apache/iceberg/spark/source/TestSparkReaderWithBloomFilter.java: ########## @@ -174,170 +156,62 @@ public static Object[][] parameters() { @BeforeAll public static void startMetastoreAndSpark() { - metastore = new TestHiveMetastore(); - metastore.start(); - HiveConf hiveConf = metastore.hiveConf(); - spark = SparkSession.builder() .master("local[2]") - .config("spark.hadoop." + METASTOREURIS.varname, hiveConf.get(METASTOREURIS.varname)) - .enableHiveSupport() + .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog") + .config("spark.sql.catalog.local.type", "hadoop") + .config("spark.sql.catalog.local.warehouse", temp.toString()) + .config("spark.sql.defaultCatalog", "local") .getOrCreate(); - catalog = - (HiveCatalog) - CatalogUtil.loadCatalog( - HiveCatalog.class.getName(), "hive", ImmutableMap.of(), hiveConf); - - try { - catalog.createNamespace(Namespace.of("default")); - } catch (AlreadyExistsException ignored) { - // the default namespace already exists. ignore the create error - } + spark.sql("CREATE DATABASE IF NOT EXISTS default"); + spark.sql("USE default"); } @AfterAll - public static void stopMetastoreAndSpark() throws Exception { - catalog = null; - metastore.stop(); - metastore = null; + public static void stopMetastoreAndSpark() { spark.stop(); spark = null; } - protected void createTable(String name, Schema schema) { - table = catalog.createTable(TableIdentifier.of("default", name), schema); - TableOperations ops = ((BaseTable) table).operations(); - TableMetadata meta = ops.current(); - ops.commit(meta, meta.upgradeToFormatVersion(2)); + protected void createTable(String name) throws TableAlreadyExistsException { + Dataset<Row> emptyDf = spark.createDataFrame(Lists.newArrayList(), schema); + CreateTableWriter<Row> createTableWriter = emptyDf.writeTo("default." 
+ name); if (useBloomFilter) { - table - .updateProperties() - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "id", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "id_long", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "id_double", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "id_float", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "id_string", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "id_boolean", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "id_date", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "id_int_decimal", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "id_long_decimal", "true") - .set(PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + "id_fixed_decimal", "true") - .commit(); + String[] columns = { + "id", + "id_long", + "id_double", + "id_float", + "id_string", + "id_boolean", + "id_date", + "id_int_decimal", + "id_long_decimal", + "id_fixed_decimal", + "id_nested.nested_id" + }; + for (String column : columns) { + createTableWriter.tableProperty( + PARQUET_BLOOM_FILTER_COLUMN_ENABLED_PREFIX + column, "true"); + } } - table - .updateProperties() - .set(TableProperties.PARQUET_ROW_GROUP_SIZE_BYTES, "100") // to have multiple row groups - .commit(); - if (vectorized) { - table - .updateProperties() - .set(TableProperties.PARQUET_VECTORIZATION_ENABLED, "true") - .set(TableProperties.PARQUET_BATCH_SIZE, "4") - .commit(); - } - } - - protected void dropTable(String name) { - catalog.dropTable(TableIdentifier.of("default", name)); - } + createTableWriter.tableProperty(PARQUET_ROW_GROUP_SIZE_BYTES, "100"); - private DataFile writeDataFile(OutputFile out, StructLike partition, List<Record> rows) Review Comment: this seems like too many changes just to add a single test. This makes it quite difficult to review the diffset -- This is an automated message from the Apache Git Service. 
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org For queries about this service, please contact Infrastructure at: users@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscribe@iceberg.apache.org For additional commands, e-mail: issues-help@iceberg.apache.org