soumilshah1995 commented on issue #10392:
URL: https://github.com/apache/iceberg/issues/10392#issuecomment-3287238779
HI @aihuaxu
```
from pyspark.sql import SparkSession
import os
import sys
os.environ["JAVA_HOME"] =
"/opt/homebrew/Cellar/openjdk@17/17.0.14/libexec/openjdk.jdk/Contents/Home"
SUBMIT_ARGS = (
"--packages org.apache.iceberg:iceberg-spark-runtime-4.0_2.13:1.10.0 "
"--repositories https://repo1.maven.org/maven2 "
"pyspark-shell"
)
os.environ["PYSPARK_SUBMIT_ARGS"] = SUBMIT_ARGS
os.environ["PYSPARK_PYTHON"] = sys.executable
local_warehouse_path = "/Users/soumilshah/Desktop/warehouse"
spark = (
SparkSession.builder
.config("spark.sql.extensions",
"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
.config("spark.sql.catalog.dev", "org.apache.iceberg.spark.SparkCatalog")
.config("spark.sql.catalog.dev.type", "hadoop")
.config("spark.sql.catalog.dev.warehouse", local_warehouse_path)
.config("spark.sql.catalog.dev.format-version", "3")
.config("spark.sql.join.preferSortMergeJoin", "false")
.getOrCreate()
)
spark.sql("""
CREATE TABLE IF NOT EXISTS dev.default.test_table (
id BIGINT,
properties VARIANT
)
USING iceberg
TBLPROPERTIES (
'format-version' = '3'
)
""")
# Complex JSON string data
json_string = '''{
"user": {
"name": "Alice",
"age": 30,
"hobbies": ["reading", "swimming", "hiking"],
"address": {
"city": "Wonderland",
"zip": "12345"
}
},
"status": "active",
"tags": ["admin", "user", "editor"]
}'''
# Create DataFrame to insert
df = spark.createDataFrame([(1, json_string)], ["id", "properties"])
# Cast JSON string column to VARIANT type
df = df.withColumn("properties", col("properties").cast("variant"))
# Append data to Iceberg table
df.writeTo("dev.default.test_table").append()
# Query nested fields in VARIANT column
spark.sql("""
SELECT
id,
variant_get(properties, '$.user.name', 'string') as user_name,
variant_get(properties, '$.user.age', 'int') as user_age,
variant_get(properties, '$.user.address.city', 'string') as city,
variant_get(properties, '$.status', 'string') as status
FROM dev.default.test_table
""").show(truncate=False)
+---+---------+--------+----+------+
|id |user_name|user_age|city|status|
+---+---------+--------+----+------+
|1 |NULL |NULL |NULL|NULL |
|1 |NULL |NULL |NULL|NULL |
+---+---------+--------+----+------+
```
Any pointers on why I am seeing NULL on querying varient types
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]