knowxyz commented on issue #12802:
URL: https://github.com/apache/iceberg/issues/12802#issuecomment-2807400456

   # Databricks notebook source
   from pyspark.sql import SparkSession

   # Azure Storage Account Details
   container_name = "csv"
   mount_point = "/mnt/icebergdata9"  # choose a mount point
   storage_account_name = "test"
   storage_account_key = "zZHIbG0BBgU+AStlGv+3w=="
   file_name = "data.csv"
   catalog = "sparkcatalog"
   store = "default"
   table = "sampleoutput37"
   warehouseName = "warehousenew10"
   icebergtable_name = catalog + "." + store + "." + table
   tempview = "tempviewice14"
   file_path = f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/{file_name}"
   # Unmount first in case the mount point is already in use
   try:
       dbutils.fs.unmount(mount_point)
       print(f"{mount_point} unmounted successfully.")
   except Exception:
       print(f"{mount_point} was not mounted or unmounting failed.")
   try:
       dbutils.fs.mount(
           source=f"wasbs://{container_name}@{storage_account_name}.blob.core.windows.net/",
           mount_point=mount_point,
           extra_configs={f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net": storage_account_key}
       )
       print(f"Mounted {container_name} to {mount_point} using Access Key.")
   except Exception as e:
       print(f"Error mounting using Access Key: {e}")
   print(f"fs.azure.account.key  ready to excute")
   
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net",
 storage_account_key)
   print(f"fs.azure.account.key  Excuted")
   # Initialize the Spark session (on Databricks a session already exists, so
   # builder-time configs may only take effect on a fresh cluster/JVM)
   spark = SparkSession.builder \
       .appName("IcebergSave") \
       .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions") \
       .config("spark.sql.catalog.spark_catalog", "org.apache.iceberg.spark.SparkSessionCatalog") \
       .config("spark.sql.catalog.spark_catalog.type", "hive") \
       .config("spark.sql.catalog.local", "org.apache.iceberg.spark.SparkCatalog") \
       .config("spark.sql.catalog.local.type", "hadoop") \
       .config("spark.sql.catalog.local.warehouse", f"{mount_point}/{warehouseName}") \
       .config("spark.hadoop.fs.abfs.impl", "org.apache.hadoop.fs.azurebfs.AzureBlobFileSystem") \
       .config("spark.hadoop.fs.AbstractFileSystem.abfs.impl", "org.apache.hadoop.fs.azurebfs.Abfs") \
       .config("spark.jars.packages", "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.7.1,org.apache.iceberg:iceberg-azure-bundle:1.7.1") \
       .getOrCreate()
   
spark.conf.set(f"fs.azure.account.key.{storage_account_name}.blob.core.windows.net",
 storage_account_key)
   spark.conf.set("spark.sql.parquet.enableVectorizedReader", "false")
   spark.conf.set("spark.sql.iceberg.vectorization.enabled", "false")
   spark.conf.set("spark.databricks.io.cache.enabled","false")
   spark.conf.set("spark.sql.files.useFsCache", "false")
   spark.conf.set("spark.sql.execution.photon.enabled", "false")
   spark.conf.set("spark.sql.catalog.spark_catalog.io-impl", 
"org.apache.iceberg.azure.adlsv2.ADLSFileIO")
   
   
   
       
   
   
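   # Minimal sanity check of the catalog wiring (a sketch, assuming the
   # session above started cleanly): print the warehouse path the "local"
   # catalog resolved to, and confirm Spark can enumerate its namespaces.
   print(spark.conf.get("spark.sql.catalog.local.warehouse"))
   spark.sql("SHOW NAMESPACES IN local").show()
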
   # COMMAND ----------
   
   # MAGIC %sql
   # MAGIC CREATE TABLE local.db1.table2 (id bigint, data string) USING iceberg
   
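   # COMMAND ----------

   # MAGIC %sql
   # MAGIC -- Optional check (a sketch): confirm the location Iceberg resolved
   # MAGIC -- for the table; a mismatch between the mount path and the catalog
   # MAGIC -- warehouse tends to show up here.
   # MAGIC DESCRIBE EXTENDED local.db1.table2;
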
   # COMMAND ----------
   
   # MAGIC %sql
   # MAGIC INSERT INTO local.db1.table2 VALUES (10, 'a'), (20, 'b'), (30, 'c');
   # MAGIC INSERT INTO local.db1.table2 VALUES (11, 'a'), (12, 'b'), (13, 'c');
   
   # COMMAND ----------
   
   # MAGIC %sql
   # MAGIC SELECT count(1) as count, data
   # MAGIC FROM local.db1.table2
   # MAGIC GROUP BY data;
   
   # COMMAND ----------
   
   df = spark.sql("select * from local.db1.table2")
   
   df.show()
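
   # COMMAND ----------

   # If the failure happens on read, Iceberg's metadata tables may help narrow
   # it down (a sketch, assuming the inserts above succeeded): they show
   # whether snapshots and data files were actually committed.
   spark.sql("SELECT snapshot_id, operation FROM local.db1.table2.snapshots").show(truncate=False)
   spark.sql("SELECT file_path FROM local.db1.table2.files").show(truncate=False)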
   
   Even this code fails with the same error.

