This is an automated email from the ASF dual-hosted git repository.
maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new eff46ea77e9 [SPARK-45340][SQL] Remove the SQL config `spark.sql.hive.verifyPartitionPath`
eff46ea77e9 is described below
commit eff46ea77e9bebe7777f3076277bef1e086833dd
Author: Max Gekk <[email protected]>
AuthorDate: Wed Sep 27 08:28:45 2023 +0300
[SPARK-45340][SQL] Remove the SQL config `spark.sql.hive.verifyPartitionPath`
### What changes were proposed in this pull request?
In the PR, I propose to remove the already deprecated SQL config `spark.sql.hive.verifyPartitionPath` and the code guarded by it. The config has been deprecated since Spark 3.0.
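For reference, a minimal migration sketch for users who still set the removed config. The replacement key `spark.sql.files.ignoreMissingFiles` comes from the deprecation message and the updated `QueryPartitionSuite` test; the table and partition names below are hypothetical:
```scala
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder()
  .appName("ignore-missing-files-migration")
  .enableHiveSupport()
  .getOrCreate()

// Old (removed): spark.conf.set("spark.sql.hive.verifyPartitionPath", "true")
// New: skip files/partitions whose paths no longer exist instead of failing the read.
spark.conf.set("spark.sql.files.ignoreMissingFiles", "true")

// Hypothetical partitioned Hive table; queries tolerate dropped partition directories.
spark.sql("SELECT * FROM partitioned_table WHERE part_col = 'p1'").show()
```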
### Why are the changes needed?
To improve code maintainability by removing unused code.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
By running the modified test suites:
```
$ build/sbt "test:testOnly *SQLConfSuite"
$ build/sbt "test:testOnly *QueryPartitionSuite"
```
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #43130 from MaxGekk/remove-verifyPartitionPath.
Authored-by: Max Gekk <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
---
.../org/apache/spark/sql/internal/SQLConf.scala | 17 ++-------
.../apache/spark/sql/internal/SQLConfSuite.scala | 4 +--
.../org/apache/spark/sql/hive/TableReader.scala | 41 +---------------------
.../spark/sql/hive/QueryPartitionSuite.scala | 12 ++-----
4 files changed, 8 insertions(+), 66 deletions(-)
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
index 43eb0756d8d..aeef531dbcd 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
@@ -34,7 +34,6 @@ import org.apache.hadoop.fs.Path
import org.apache.spark.{ErrorMessageFormat, SparkConf, SparkContext, TaskContext}
import org.apache.spark.internal.Logging
import org.apache.spark.internal.config._
-import org.apache.spark.internal.config.{IGNORE_MISSING_FILES => SPARK_IGNORE_MISSING_FILES}
import org.apache.spark.network.util.ByteUnit
import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.sql.catalyst.analysis.{HintErrorLogger, Resolver}
@@ -1261,14 +1260,6 @@ object SQLConf {
.booleanConf
.createWithDefault(false)
-  val HIVE_VERIFY_PARTITION_PATH = buildConf("spark.sql.hive.verifyPartitionPath")
-    .doc("When true, check all the partition paths under the table\'s root directory " +
-      "when reading data stored in HDFS. This configuration will be deprecated in the future " +
-      s"releases and replaced by ${SPARK_IGNORE_MISSING_FILES.key}.")
-    .version("1.4.0")
-    .booleanConf
-    .createWithDefault(false)
-
val HIVE_METASTORE_DROP_PARTITION_BY_NAME =
buildConf("spark.sql.hive.dropPartitionByName.enabled")
.doc("When true, Spark will get partition name rather than partition
object " +
@@ -4472,8 +4463,6 @@ object SQLConf {
PANDAS_GROUPED_MAP_ASSIGN_COLUMNS_BY_NAME.key, "2.4",
"The config allows to switch to the behaviour before Spark 2.4 " +
"and will be removed in the future releases."),
- DeprecatedConfig(HIVE_VERIFY_PARTITION_PATH.key, "3.0",
- s"This config is replaced by '${SPARK_IGNORE_MISSING_FILES.key}'."),
DeprecatedConfig(ARROW_EXECUTION_ENABLED.key, "3.0",
s"Use '${ARROW_PYSPARK_EXECUTION_ENABLED.key}' instead of it."),
DeprecatedConfig(ARROW_FALLBACK_ENABLED.key, "3.0",
@@ -4552,7 +4541,9 @@ object SQLConf {
RemovedConfig("spark.sql.ansi.strictIndexOperator", "3.4.0", "true",
"This was an internal configuration. It is not needed anymore since
Spark SQL always " +
"returns null when getting a map value with a non-existing key. See
SPARK-40066 " +
- "for more details.")
+ "for more details."),
+ RemovedConfig("spark.sql.hive.verifyPartitionPath", "4.0.0", "false",
+ s"This config was replaced by '${IGNORE_MISSING_FILES.key}'.")
)
Map(configs.map { cfg => cfg.key -> cfg } : _*)
@@ -4766,8 +4757,6 @@ class SQLConf extends Serializable with Logging with SqlApiConf {
def isOrcSchemaMergingEnabled: Boolean = getConf(ORC_SCHEMA_MERGING_ENABLED)
- def verifyPartitionPath: Boolean = getConf(HIVE_VERIFY_PARTITION_PATH)
-
  def metastoreDropPartitionsByName: Boolean = getConf(HIVE_METASTORE_DROP_PARTITION_BY_NAME)
  def metastorePartitionPruning: Boolean = getConf(HIVE_METASTORE_PARTITION_PRUNING)
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
index 0a0bee2eabd..822c0642f2b 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/internal/SQLConfSuite.scala
@@ -422,9 +422,9 @@ class SQLConfSuite extends QueryTest with SharedSparkSession {
e.getMessage.getFormattedMessage.contains(config)))
}
- val config1 = SQLConf.HIVE_VERIFY_PARTITION_PATH.key
+ val config1 = SQLConf.COALESCE_PARTITIONS_MIN_PARTITION_NUM.key
withLogAppender(logAppender) {
- spark.conf.set(config1, true)
+ spark.conf.set(config1, 1)
}
check(config1)
diff --git a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
index 5bb982624b0..a61040b2fab 100644
--- a/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
+++ b/sql/hive/src/main/scala/org/apache/spark/sql/hive/TableReader.scala
@@ -160,46 +160,7 @@ class HadoopTableReader(
def makeRDDForPartitionedTable(
partitionToDeserializer: Map[HivePartition, Class[_ <: Deserializer]],
filterOpt: Option[PathFilter]): RDD[InternalRow] = {
-
-    // SPARK-5068:get FileStatus and do the filtering locally when the path is not exists
- def verifyPartitionPath(
-        partitionToDeserializer: Map[HivePartition, Class[_ <: Deserializer]]):
-        Map[HivePartition, Class[_ <: Deserializer]] = {
- if (!conf.verifyPartitionPath) {
- partitionToDeserializer
- } else {
- val existPathSet = collection.mutable.Set[String]()
- val pathPatternSet = collection.mutable.Set[String]()
- partitionToDeserializer.filter {
- case (partition, partDeserializer) =>
-            def updateExistPathSetByPathPattern(pathPatternStr: String): Unit = {
- val pathPattern = new Path(pathPatternStr)
- val fs = pathPattern.getFileSystem(hadoopConf)
- val matches = fs.globStatus(pathPattern)
- matches.foreach(fileStatus => existPathSet +=
fileStatus.getPath.toString)
- }
- // convert /demo/data/year/month/day to /demo/data/*/*/*/
- def getPathPatternByPath(parNum: Int, tempPath: Path): String = {
- var path = tempPath
- for (i <- (1 to parNum)) path = path.getParent
- val tails = (1 to parNum).map(_ => "*").mkString("/", "/", "/")
- path.toString + tails
- }
-
- val partPath = partition.getDataLocation
-            val partNum = Utilities.getPartitionDesc(partition).getPartSpec.size()
- val pathPatternStr = getPathPatternByPath(partNum, partPath)
- if (!pathPatternSet.contains(pathPatternStr)) {
- pathPatternSet += pathPatternStr
- updateExistPathSetByPathPattern(pathPatternStr)
- }
- existPathSet.contains(partPath.toString)
- }
- }
- }
-
- val hivePartitionRDDs = verifyPartitionPath(partitionToDeserializer)
- .map { case (partition, partDeserializer) =>
+    val hivePartitionRDDs = partitionToDeserializer.map { case (partition, partDeserializer) =>
      val partDesc = Utilities.getPartitionDescFromTableDesc(tableDesc, partition, true)
val partPath = partition.getDataLocation
val inputPathStr = applyFilterIfNeeded(partPath, filterOpt)
diff --git a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
index f4fb18119fa..c1be1cee005 100644
--- a/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
+++ b/sql/hive/src/test/scala/org/apache/spark/sql/hive/QueryPartitionSuite.scala
@@ -65,16 +65,8 @@ class QueryPartitionSuite extends QueryTest with SQLTestUtils with TestHiveSingl
}
}
- test("SPARK-5068: query data when path doesn't exist") {
- withSQLConf(SQLConf.HIVE_VERIFY_PARTITION_PATH.key -> "true") {
- queryWhenPathNotExist()
- }
- }
-
- test("Replace spark.sql.hive.verifyPartitionPath by
spark.files.ignoreMissingFiles") {
- withSQLConf(
- SQLConf.HIVE_VERIFY_PARTITION_PATH.key -> "false",
- SQLConf.IGNORE_MISSING_FILES.key -> "true") {
+ test("Replace spark.sql.hive.verifyPartitionPath by
spark.sql.files.ignoreMissingFiles") {
+ withSQLConf(SQLConf.IGNORE_MISSING_FILES.key -> "true") {
queryWhenPathNotExist()
}
}