harshith-bolar-rapido commented on issue #8333:
URL: https://github.com/apache/iceberg/issues/8333#issuecomment-1854204500
I noticed that this happens when the input folders are partitioned by date, e.g. `yyyymmdd=20230304`.
Here's a minimal reproducible example. @RussellSpitzer @nastra
```java
/**
 * Minimal reproducible example for the reported issue: a DataFrame read with
 * Spark's {@code basePath} option from a date-partitioned folder
 * (e.g. {@code /tmp/data/yyyymmdd=20220202}) is written to an Iceberg table,
 * then written again via MERGE INTO; the second write triggers the error.
 * NOTE(review): requires a reachable Hive metastore at
 * thrift://metastore-url:9083 — substitute a real metastore URI to run.
 */
public class MRE {
// Builds a local-mode Spark config with the Iceberg SQL extensions enabled and
// two catalogs registered against the same Hive metastore:
//  - "hive":          org.apache.iceberg.spark.SparkCatalog (type=hive)
//  - "spark_catalog": org.apache.iceberg.spark.SparkSessionCatalog (type=hive)
private static SparkConf getSparkConf() {
SparkConf sparkConf = new SparkConf();
// Run Spark locally using all available cores.
sparkConf.set("spark.master", "local[*]");
// Enable Iceberg's SQL extensions (needed for MERGE INTO support).
sparkConf.set("spark.sql.extensions",
"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions");
// Catalog configs
sparkConf.set("hive.metastore.uris", "thrift://metastore-url:9083");
sparkConf.set("spark.sql.catalog.hive",
"org.apache.iceberg.spark.SparkCatalog");
sparkConf.set("spark.sql.catalog.hive.type", "hive");
sparkConf.set("spark.sql.catalog.hive.uri",
"thrift://metastore-url:9083");
sparkConf.set("spark.sql.catalogImplementation", "hive");
sparkConf.set("spark.sql.catalog.spark_catalog",
"org.apache.iceberg.spark.SparkSessionCatalog");
sparkConf.set("spark.sql.catalog.spark_catalog.type", "hive");
return sparkConf;
}
// Creates (or reuses) a SparkSession from the given config.
private static SparkSession getSparkSession(SparkConf conf) {
return SparkSession
.builder()
.config(conf)
.getOrCreate();
}
public static void main(String[] args) {
SparkSession spark = getSparkSession(getSparkConf());
// Read JSON from a date-partitioned folder. The basePath option makes Spark
// treat "yyyymmdd=20220202" as a partition column; removing this option
// makes the reported error disappear (see the note below the example).
Dataset<Row> df = spark
.read()
.option("basePath", "/tmp/")
.json("/tmp/data/yyyymmdd=20220202");
String namespace = "temp";
String tableName = "iceberg_test_table";
// First write: create (or replace) the Iceberg table from the DataFrame.
System.out.println("Creating table and writing..");
df.writeTo("hive." + namespace + "." + tableName)
.tableProperty("location", "/tmp/iceberg/" + tableName)
.tableProperty("write.format.default", "orc")
.createOrReplace();
// Second write: upsert the same data via MERGE INTO — this is the step
// that reproduces the error.
System.out.println("Writing again using MERGE INTO");
df.createOrReplaceTempView("tmp_view");
String mergeIntoQuery = "MERGE INTO hive." + namespace + "." +
tableName + " AS table\n" +
"USING (SELECT * FROM tmp_view) AS updates\n" +
"ON table.keyA = updates.keyA\n" +
"WHEN MATCHED THEN UPDATE SET *\n" +
"WHEN NOT MATCHED THEN INSERT *";
spark.sql(mergeIntoQuery);
// Read back and display the table contents to confirm the result.
Dataset<Row> out = spark.sql("SELECT * FROM hive." + namespace + "."
+ tableName);
out.show();
}
}
```
The input data file contains just this single JSON record:
```json
{"keyA": "valueA", "keyB": "valueB"}
```
put inside `/tmp/data/yyyymmdd=20220202` folder.
Interestingly, the error disappears when we remove the `basePath` option
from the `spark.read` call.
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]