thijsheijden commented on issue #13218: URL: https://github.com/apache/iceberg/issues/13218#issuecomment-2943420426
Sure, here is the entire piece of code:

```
import argparse
import os
import shutil
import pyspark

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-i', '--file_dir') # The data to process (Parquet files directory)
    parser.add_argument('-t', '--table') # The table name to output to
    args = parser.parse_args()

    conf = pyspark.SparkConf()
    conf.setMaster('local[*]')
    conf.set('spark.sql.catalog.iceberg_catalog', 'org.apache.iceberg.spark.SparkCatalog')
    conf.set('spark.sql.catalog.iceberg_catalog.type', 'hadoop')
    conf.set('spark.sql.catalog.iceberg_catalog.warehouse', 'data')
    conf.set('spark.sql.parquet.outputTimestampType', 'TIMESTAMP_MICROS')
    conf.set('spark.driver.memory', '10g')
    conf.set('spark.jars.packages', 'org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.9.1')
    conf.set('spark.sql.extensions', 'org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions')
    spark = pyspark.sql.SparkSession.builder.config(conf=conf).getOrCreate()
    sc = spark.sparkContext
    sc.setLogLevel("ERROR")

    spark.sql("USE iceberg_catalog")
    spark.sql("CREATE NAMESPACE IF NOT EXISTS default")
    spark.sql("USE NAMESPACE default")

    # Load the batches of files to import
    batches = os.listdir(args.file_dir)
    first_file = os.path.join(args.file_dir, "batch_0", os.listdir(os.path.join(args.file_dir, "batch_0"))[0])

    # Create table using schema of the first file
    df = spark.read.parquet(first_file)
    empty_df = spark.createDataFrame([], df.schema)
    empty_df.writeTo(args.table).create()

    batch_idx = 1
    for batch_dir in batches:
        print(f"Adding batch {batch_idx}")
        spark.sql(f"""
            CALL iceberg_catalog.system.add_files(
                table => 'default.{args.table}',
                source_table => '`parquet`.`{os.path.join(args.file_dir, batch_dir)}`',
                parallelism => 6
            )
        """)
        batch_idx += 1

main()
```

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For queries about this service, please contact Infrastructure at: us...@infra.apache.org

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org
For additional commands, e-mail: issues-h...@iceberg.apache.org