singhpk234 commented on code in PR #1402: URL: https://github.com/apache/polaris/pull/1402#discussion_r2056566344
########## plugins/spark/v3.5/regtests/setup.sh: ########## @@ -0,0 +1,157 @@ +#!/bin/bash +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +# Idempotent setup for regression tests. Run manually or let run.sh auto-run. +# +# Warning - first time setup may download large amounts of files +# Warning - may clobber conf/spark-defaults.conf + +set -x + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) + +SPARK_VERSION=3.5.5 +SCALA_VERSION=2.12 +POLARIS_CLIENT_JAR="" +while [[ $# -gt 0 ]]; do + case "$1" in + --sparkVersion) + SPARK_VERSION="$2" + shift # past argument + shift # past value + ;; + --scalaVersion) + SCALA_VERSION="$2" + shift # past argument + shift # past value + ;; + --jar) + POLARIS_CLIENT_JAR="$2" + shift # past argument + shift # past value + ;; + --) shift; + break + ;; + esac +done + +echo "SET UP FOR SPARK_VERSION=${SPARK_VERSION} SCALA_VERSION=${SCALA_VERSION} POLARIS_CLIENT_JAR=${POLARIS_CLIENT_JAR}" + +if [ "$SCALA_VERSION" == "2.12" ]; then + SPARK_DISTRIBUTION=spark-${SPARK_VERSION}-bin-hadoop3 +else + SPARK_DISTRIBUTION=spark-${SPARK_VERSION}-bin-hadoop3-scala${SCALA_VERSION} +fi + +echo "Getting spark distribution ${SPARK_DISTRIBUTION}" + +if [ -z "${SPARK_HOME}" ]; then + SPARK_HOME=$(realpath ~/${SPARK_DISTRIBUTION}) +fi +SPARK_CONF="${SPARK_HOME}/conf/spark-defaults.conf" +DERBY_HOME="/tmp/derby" + +# Ensure binaries are downloaded locally +echo 'Verifying Spark binaries...' +if ! [ -f ${SPARK_HOME}/bin/spark-sql ]; then + echo 'Setting up Spark...' + if [ -z "${SPARK_VERSION}" ] || [ -z "${SPARK_DISTRIBUTION}" ]; then + echo 'SPARK_VERSION or SPARK_DISTRIBUTION not set. Please set SPARK_VERSION and SPARK_DISTRIBUTION to the desired version.' + exit 1 + fi + if ! [ -f ~/${SPARK_DISTRIBUTION}.tgz ]; then + echo 'Downloading spark distro...' + wget -O ~/${SPARK_DISTRIBUTION}.tgz https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_DISTRIBUTION}.tgz + if ! [ -f ~/${SPARK_DISTRIBUTION}.tgz ]; then + if [[ "${OSTYPE}" == "darwin"* ]]; then + echo "Detected OS: mac. Running 'brew install wget' to try again." + brew install wget + wget -O ~/${SPARK_DISTRIBUTION}.tgz https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_DISTRIBUTION}.tgz + fi + fi + else + echo 'Found existing Spark tarball' + fi + # check if the download was successful + if ! [ -f ~/${SPARK_DISTRIBUTION}.tgz ]; then + echo 'Failed to download Spark distribution. Please check the logs.' + exit 1 + fi + tar xzvf ~/${SPARK_DISTRIBUTION}.tgz -C ~/${TEST_ROOT_DIR} + if [ $? -ne 0 ]; then + echo 'Failed to extract Spark distribution. Please check the logs.' + exit 1 + else + echo 'Extracted Spark distribution.' + rm ~/${SPARK_DISTRIBUTION}.tgz + fi + SPARK_HOME=$(realpath ~/${SPARK_DISTRIBUTION}) + SPARK_CONF="${SPARK_HOME}/conf/spark-defaults.conf" +else + echo 'Verified Spark distro already installed.' +fi + +echo "SPARK_HOME=${SPARK_HOME}" +echo "SPARK_CONF=${SPARK_CONF}" + +# Ensure Spark boilerplate conf is set +echo 'Verifying Spark conf...' +if grep 'POLARIS_TESTCONF_V5' ${SPARK_CONF} 2>/dev/null; then + echo 'Verified spark conf' +else + echo 'Setting spark conf...' + # Instead of clobbering existing spark conf, just comment it all out in case it was customized carefully. + sed -i 's/^/# /' ${SPARK_CONF} +cat << EOF >> ${SPARK_CONF} + +# POLARIS Spark client test conf +spark.jars $POLARIS_CLIENT_JAR +spark.jars.packages org.apache.hadoop:hadoop-aws:3.4.0,io.delta:delta-spark_${SCALA_VERSION}:3.2.1 +spark.hadoop.fs.s3.impl org.apache.hadoop.fs.s3a.S3AFileSystem +spark.hadoop.fs.AbstractFileSystem.s3.impl org.apache.hadoop.fs.s3a.S3A +spark.sql.variable.substitute true + +spark.driver.extraJavaOptions -Dderby.system.home=${DERBY_HOME} + +spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,io.delta.sql.DeltaSparkSessionExtension +spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog +spark.sql.catalog.polaris=org.apache.polaris.spark.SparkCatalog +spark.sql.catalog.polaris.type=rest +spark.sql.catalog.polaris.uri=http://${POLARIS_HOST:-localhost}:8181/api/catalog +spark.sql.catalog.polaris.header.X-Iceberg-Access-Delegation=vended-credentials +spark.sql.catalog.polaris.client.region=us-west-2 +spark.sql.sources.useV1SourceList='' Review Comment: > i don't think this is change parquet to v2, it just returns DeltaTableV2 It does, please ref default value of this config : https://github.com/apache/spark/blob/master/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala#L3882 > If I recall correctly, even with DeltaTableV2, some oeperations might still fall back to V1 Thats true, what do we do in that case fail ? Otherwise it can cause correctness issue. > there is no such utility for DataSourceV1 yet, if we run into problems in the near future for other table types like csv Sure, just we need to make sure we document DSv1 not supported and then let customer decide, Delta is trying to move to DSv2 but it's atleast a year from now. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected]
