This is an automated email from the ASF dual-hosted git repository.
maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new eff46ea77e9 [SPARK-45340][SQL] Remove the SQL config `spark.sql.hive.verifyPartitionPath`
eff46ea77e9 is described below
commit eff46ea77e9bebe7777f3076277bef1e086833dd
Author: Max Gekk <[email protected]>
AuthorDate: Wed Sep 27 08:28:45 2023 +0300
[SPARK-45340][SQL] Remove the SQL config `spark.sql.hive.verifyPartitionPath`
### What changes were proposed in this pull request?
In the PR, I propose to remove the already deprecated SQL config `spark.sql.hive.verifyPartitionPath` and the code guarded by it. The config has been deprecated since Spark 3.0.
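For reference, a minimal migration sketch for users who still set the removed config. The replacement key `spark.sql.files.ignoreMissingFiles` comes from the deprecation message and the updated `QueryPartitionSuite` test; the table and partition names below are hypothetical:
```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("ignore-missing-files-migration")
  .enableHiveSupport()
  .getOrCreate()

// Old (removed): spark.conf.set("spark.sql.hive.verifyPartitionPath", "true")
// New: skip files/partitions whose paths no longer exist instead of failing the read.
spark.conf.set("spark.sql.files.ignoreMissingFiles", "true")

// Hypothetical partitioned Hive table; queries tolerate dropped partition directories.
spark.sql("SELECT * FROM partitioned_table WHERE part_col = 'p1'").show()
```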
### Why are the changes needed?
To improve code maintainability by removing unused code.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
By running the modified test suites:
```
$ build/sbt "test:testOnly *SQLConfSuite"
$ build/sbt "test:testOnly *QueryPartitionSuite"
```
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #43130 from MaxGekk/remove-verifyPartitionPath.
Authored-by: Max Gekk <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
---
.../org/apache/spark/sql/internal/SQLConf.scala | 17 ++-------
.../apache/spark/sql/internal/SQLConfSuite.scala | 4 +--
.../org/apache/spark/sql/hive/TableReader.scala | 41 +---------------------
.../spark/sql/hive/QueryPartitionSuite.scala | 12 ++-----
4 files changed, 8 insertions(+), 66 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 43eb0756d8d..aeef531dbcd 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -34,7 +34,6 @@ import org.apache.hadoop.fs.Path
import org.apache.spark.{ErrorMessageFormat, SparkConf, SparkContext, TaskContext}
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config._
-import org.apache.spark.internal.config.{IGNORE_MISSING_FILES => SPARK_IGNORE_MISSING_FILES}
import org.apache.spark.network.util.ByteUnit
import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.sql.catalyst.analysis.{HintErrorLogger, Resolver}
@@ -1261,14 +1260,6 @@ object SQLConf {
.booleanConf
.createWithDefault(false)
-  val HIVE_VERIFY_PARTITION_PATH = buildConf("spark.sql.hive.verifyPartitionPath")
-    .doc("When true, check all the partition paths under the table\'s root directory " +
-      "when reading data stored in HDFS. This configuration will be deprecated in the future " +
-      s"releases and replaced by ${SPARK_IGNORE_MISSING_FILES.key}.")
-    .version("1.4.0")
-    .booleanConf
-    .createWithDefault(false)
-
val HIVE_METASTORE_DROP_PARTITION_BY_NAME =
buildConf("spark.sql.hive.dropPartitionByName.enabled")
.doc("When true, Spark will get partition name rather than partition
object " +
@@ -4472,8 +4463,6 @@ object SQLConf {
PANDAS_GROUPED_MAP_ASSIGN_COLUMNS_BY_NAME.key, "2.4",
"The config allows to switch to the behaviour before Spark 2.4 " +
"and will be removed in the future releases."),
- DeprecatedConfig(HIVE_VERIFY_PARTITION_PATH.key, "3.0",
- s"This config is replaced by '${SPARK_IGNORE_MISSING_FILES.key}'."),
DeprecatedConfig(ARROW_EXECUTION_ENABLED.key, "3.0",
s"Use '${ARROW_PYSPARK_EXECUTION_ENABLED.key}' instead of it."),
DeprecatedConfig(ARROW_FALLBACK_ENABLED.key, "3.0",
@@ -4552,7 +4541,9 @@ object SQLConf {
RemovedConfig("spark.sql.ansi.strictIndexOperator", "3.4.0", "true",
"This was an internal configuration. It is not needed anymore since
Spark SQL always " +
"returns null when getting a map value with a non-existing key. See
SPARK-40066 " +
- "for more details.")
+ "for more details."),
+ RemovedConfig("spark.sql.hive.verifyPartitionPath", "4.0.0", "false",
+ s"This config was replaced by '${IGNORE_MISSING_FILES.key}'.")
)
Map(configs.map { cfg => cfg.key -> cfg } : _*)
@@ -4766,8 +4757,6 @@ class SQLConf extends Serializable with Logging with SqlApiConf {
def isOrcSchemaMergingEnabled: Boolean = getConf(ORC_SCHEMA_MERGING_ENABLED)
- def verifyPartitionPath: Boolean = getConf(HIVE_VERIFY_PARTITION_PATH)
-
  def metastoreDropPartitionsByName: Boolean = getConf(HIVE_METASTORE_DROP_PARTITION_BY_NAME)
  def metastorePartitionPruning: Boolean = getConf(HIVE_METASTORE_PARTITION_PRUNING)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
index 0a0bee2eabd..822c0642f2b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
@@ -422,9 +422,9 @@ class SQLConfSuite extends QueryTest with SharedSparkSession {
e.getMessage.getFormattedMessage.contains(config)))
}
- val config1 = SQLConf.HIVE_VERIFY_PARTITION_PATH.key
+ val config1 = SQLConf.COALESCE_PARTITIONS_MIN_PARTITION_NUM.key
withLogAppender(logAppender) {
- spark.conf.set(config1, true)
+ spark.conf.set(config1, 1)
}
check(config1)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
index 5bb982624b0..a61040b2fab 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
@@ -160,46 +160,7 @@ class HadoopTableReader(
def makeRDDForPartitionedTable(
partitionToDeserializer: Map[HivePartition, Class[_ <: Deserializer]],
filterOpt: Option[PathFilter]): RDD[InternalRow] = {
-
-    // SPARK-5068:get FileStatus and do the filtering locally when the path is not exists
- def verifyPartitionPath(
-        partitionToDeserializer: Map[HivePartition, Class[_ <: Deserializer]]):
-        Map[HivePartition, Class[_ <: Deserializer]] = {
- if (!conf.verifyPartitionPath) {
- partitionToDeserializer
- } else {
- val existPathSet = collection.mutable.Set[String]()
- val pathPatternSet = collection.mutable.Set[String]()
- partitionToDeserializer.filter {
- case (partition, partDeserializer) =>
-            def updateExistPathSetByPathPattern(pathPatternStr: String): Unit = {
- val pathPattern = new Path(pathPatternStr)
- val fs = pathPattern.getFileSystem(hadoopConf)
- val matches = fs.globStatus(pathPattern)
- matches.foreach(fileStatus => existPathSet +=
fileStatus.getPath.toString)
- }
- // convert /demo/data/year/month/day to /demo/data/*/*/*/
- def getPathPatternByPath(parNum: Int, tempPath: Path): String = {
- var path = tempPath
- for (i <- (1 to parNum)) path = path.getParent
- val tails = (1 to parNum).map(_ => "*").mkString("/", "/", "/")
- path.toString + tails
- }
-
- val partPath = partition.getDataLocation
-            val partNum = Utilities.getPartitionDesc(partition).getPartSpec.size()
- val pathPatternStr = getPathPatternByPath(partNum, partPath)
- if (!pathPatternSet.contains(pathPatternStr)) {
- pathPatternSet += pathPatternStr
- updateExistPathSetByPathPattern(pathPatternStr)
- }
- existPathSet.contains(partPath.toString)
- }
- }
- }
-
- val hivePartitionRDDs = verifyPartitionPath(partitionToDeserializer)
- .map { case (partition, partDeserializer) =>
+    val hivePartitionRDDs = partitionToDeserializer.map { case (partition, partDeserializer) =>
      val partDesc = Utilities.getPartitionDescFromTableDesc(tableDesc, partition, true)
val partPath = partition.getDataLocation
val inputPathStr = applyFilterIfNeeded(partPath, filterOpt)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
index f4fb18119fa..c1be1cee005 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
@@ -65,16 +65,8 @@ class QueryPartitionSuite extends QueryTest with SQLTestUtils with TestHiveSingl
}
}
- test("SPARK-5068: query data when path doesn't exist") {
- withSQLConf(SQLConf.HIVE_VERIFY_PARTITION_PATH.key -> "true") {
- queryWhenPathNotExist()
- }
- }
-
- test("Replace spark.sql.hive.verifyPartitionPath by
spark.files.ignoreMissingFiles") {
- withSQLConf(
- SQLConf.HIVE_VERIFY_PARTITION_PATH.key -> "false",
- SQLConf.IGNORE_MISSING_FILES.key -> "true") {
+ test("Replace spark.sql.hive.verifyPartitionPath by
spark.sql.files.ignoreMissingFiles") {
+ withSQLConf(SQLConf.IGNORE_MISSING_FILES.key -> "true") {
queryWhenPathNotExist()
}
}