This is an automated email from the ASF dual-hosted git repository.
ashrigondekar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new 23c006c3abdb [SPARK-54156][PROTOBUF] Classify errors for ProtobufOptions casting failure
23c006c3abdb is described below
commit 23c006c3abdbefcaed8952304728f298e94d7c2d
Author: nyaapa <[email protected]>
AuthorDate: Wed Nov 5 09:53:05 2025 -0800
[SPARK-54156][PROTOBUF] Classify errors for ProtobufOptions casting failure
### What changes were proposed in this pull request?
When a ProtobufOptions option requires a boolean/int value but the supplied value cannot be cast to boolean/int, classify the error.
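For example (a rough sketch, not part of this patch: the descriptor path, message name, and input DataFrame below are placeholders), a malformed option value now surfaces as a classified AnalysisException instead of a raw cast exception:

```scala
// Hypothetical reproduction; only the option value and the resulting error matter.
import scala.jdk.CollectionConverters._
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.protobuf.functions.from_protobuf

val options = Map("recursive.fields.max.depth" -> "not_an_int").asJava
df.select(from_protobuf(col("value"), "Person", "/tmp/person.desc", options))
// Before: java.lang.NumberFormatException: For input string: "not_an_int"
// After:  org.apache.spark.sql.AnalysisException with error class
//         STDS_INVALID_OPTION_VALUE.WITH_MESSAGE: Cannot cast value 'not_an_int' to Int.
```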
### Why are the changes needed?
The error should be classified to provide a better user experience.
### Does this PR introduce any user-facing change?
No.
### How was this patch tested?
Added unit tests.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #52862 from nyaapa/SPARK-54156.
Authored-by: nyaapa <[email protected]>
Signed-off-by: Anish Shrigondekar <[email protected]>
---
.../spark/sql/protobuf/utils/ProtobufOptions.scala | 50 ++++++++++++++++++----
.../sql/protobuf/ProtobufFunctionsSuite.scala | 37 ++++++++++++++++
2 files changed, 78 insertions(+), 9 deletions(-)
diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufOptions.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufOptions.scala
index e85097a272f2..9a54b06a9d7f 100644
--- a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufOptions.scala
+++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufOptions.scala
@@ -19,7 +19,7 @@ package org.apache.spark.sql.protobuf.utils
import org.apache.hadoop.conf.Configuration
import org.apache.spark.internal.Logging
-import org.apache.spark.sql.SparkSession
+import org.apache.spark.sql.{AnalysisException, SparkSession}
import org.apache.spark.sql.catalyst.FileSourceOptions
import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, FailFastMode, ParseMode}
@@ -34,6 +34,39 @@ private[sql] class ProtobufOptions(
import ProtobufOptions._
+ private def parseOption[T](
+ optionName: String,
+ value: String,
+ converter: String => T,
+ typeName: String): T = {
+ try {
+ converter(value)
+ } catch {
+ case _: IllegalArgumentException | _: NumberFormatException =>
+ throw new AnalysisException(
+ errorClass = "STDS_INVALID_OPTION_VALUE.WITH_MESSAGE",
+ messageParameters = Map(
+ "optionName" -> optionName,
+ "message" -> s"Cannot cast value '$value' to $typeName."
+ )
+ )
+ }
+ }
+
+ private def getBoolean(optionName: String, defaultValue: => Boolean): Boolean = {
+ parameters
+ .get(optionName)
+ .map(v => parseOption(optionName, v, _.toBoolean, "Boolean"))
+ .getOrElse(defaultValue)
+ }
+
+ private def getInt(optionName: String, defaultValue: => Int): Int = {
+ parameters
+ .get(optionName)
+ .map(v => parseOption(optionName, v, _.toInt, "Int"))
+ .getOrElse(defaultValue)
+ }
+
def this(parameters: Map[String, String], conf: Configuration) = {
this(CaseInsensitiveMap(parameters), conf)
}
@@ -56,7 +89,7 @@ private[sql] class ProtobufOptions(
* 3: `struct<name: string, friend: struct<name: string, friend: struct<name: string>>>`
* and so on.
*/
- val recursiveFieldMaxDepth: Int = parameters.getOrElse("recursive.fields.max.depth", "-1").toInt
+ val recursiveFieldMaxDepth: Int = getInt("recursive.fields.max.depth", defaultValue = -1)
/**
* This option ("convert.any.fields.to.json") enables converting Protobuf
'Any' fields to JSON.
@@ -104,7 +137,7 @@ private[sql] class ProtobufOptions(
* In addition schema safety is also reduced making downstream processing error prone.
*/
val convertAnyFieldsToJson: Boolean =
- parameters.getOrElse(CONVERT_ANY_FIELDS_TO_JSON_CONFIG, "false").toBoolean
+ getBoolean(CONVERT_ANY_FIELDS_TO_JSON_CONFIG, defaultValue = false)
// Whether to render fields with zero values when deserializing Protobuf to a Spark struct.
// When a field is empty in the serialized Protobuf, this library will deserialize them as
@@ -139,8 +172,7 @@ private[sql] class ProtobufOptions(
// type-specific defaults.
// Ref: https://protobuf.dev/programming-guides/field_presence/ for information about
// what information is available in a serialized proto.
- val emitDefaultValues: Boolean =
- parameters.getOrElse("emit.default.values", false.toString).toBoolean
+ val emitDefaultValues: Boolean = getBoolean("emit.default.values", defaultValue = false)
// Whether to render enum fields as their integer values.
//
@@ -167,7 +199,7 @@ private[sql] class ProtobufOptions(
// Please note the output struct type will now contain an int column
// instead of string, so use caution if changing existing parsing logic.
val enumsAsInts: Boolean =
- parameters.getOrElse("enums.as.ints", false.toString).toBoolean
+ getBoolean("enums.as.ints", defaultValue = false)
// Protobuf supports unsigned integer types uint32 and uint64. By default this library
// will serialize them as the signed IntegerType and LongType respectively. For very
@@ -179,7 +211,7 @@ private[sql] class ProtobufOptions(
// i.e. LongType for uint32 and Decimal(20, 0) for uint64 so their representation
// can contain large unsigned values without overflow.
val upcastUnsignedInts: Boolean =
- parameters.getOrElse("upcast.unsigned.ints", false.toString).toBoolean
+ getBoolean("upcast.unsigned.ints", defaultValue = false)
// Whether to unwrap the struct representation for well known primitive wrapper types when
// deserializing. By default, the wrapper types for primitives (i.e. google.protobuf.Int32Value,
@@ -206,7 +238,7 @@ private[sql] class ProtobufOptions(
// Concretely, the behavior with emit defaults and this option set is:
// nil => nil, Int32Value(0) => 0, Int32Value(100) => 100.
val unwrapWellKnownTypes: Boolean =
- parameters.getOrElse("unwrap.primitive.wrapper.types",
false.toString).toBoolean
+ getBoolean("unwrap.primitive.wrapper.types", defaultValue = false)
// Since Spark doesn't allow writing empty StructType, empty proto message type will be
// dropped by default. Setting this option to true will insert a dummy column to empty proto
@@ -223,7 +255,7 @@ private[sql] class ProtobufOptions(
// If retain.empty.message.types=true, field a will be retained by inserting a dummy column.
// b struct<a: struct<__dummy_field_in_empty_struct: string>, name: string>
val retainEmptyMessage: Boolean =
- parameters.getOrElse("retain.empty.message.types",
false.toString).toBoolean
+ getBoolean("retain.empty.message.types", defaultValue = false)
}
private[sql] object ProtobufOptions {
diff --git a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufFunctionsSuite.scala b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufFunctionsSuite.scala
index c90ca239c1d4..e46fcb1a1735 100644
--- a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufFunctionsSuite.scala
+++ b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufFunctionsSuite.scala
@@ -2232,6 +2232,43 @@ class ProtobufFunctionsSuite extends QueryTest with SharedSparkSession with Prot
}
}
+ test("SPARK-54156: boolean Protobuf options reject non-boolean values") {
+ Seq(
+ "emit.default.values",
+ "enums.as.ints",
+ "upcast.unsigned.ints",
+ "unwrap.primitive.wrapper.types",
+ "retain.empty.message.types",
+ "convert.any.fields.to.json"
+ ).foreach { opt =>
+ val e = intercept[AnalysisException] {
+ ProtobufOptions(Map(opt -> "not_a_bool"))
+ }
+ checkError(
+ exception = e,
+ condition = "STDS_INVALID_OPTION_VALUE.WITH_MESSAGE",
+ parameters = Map(
+ "optionName" -> opt,
+ "message" -> "Cannot cast value 'not_a_bool' to Boolean."
+ )
+ )
+ }
+ }
+
+ test("SPARK-54156: integer Protobuf options reject non-integer values") {
+ val e = intercept[AnalysisException] {
+ ProtobufOptions(Map("recursive.fields.max.depth" -> "not_an_int"))
+ }
+ checkError(
+ exception = e,
+ condition = "STDS_INVALID_OPTION_VALUE.WITH_MESSAGE",
+ parameters = Map(
+ "optionName" -> "recursive.fields.max.depth",
+ "message" -> "Cannot cast value 'not_an_int' to Int."
+ )
+ )
+ }
+
def testFromProtobufWithOptions(
df: DataFrame,
expectedDf: DataFrame,
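The classification in this change boils down to wrapping the cast and rethrowing a structured error. A self-contained sketch of that pattern (plain Scala; the error type here is a stand-in for Spark's AnalysisException with the STDS_INVALID_OPTION_VALUE.WITH_MESSAGE error class):

```scala
object OptionCastSketch {
  // Stand-in for the classified error; the actual change throws AnalysisException.
  final case class InvalidOptionValue(optionName: String, message: String)
      extends RuntimeException(s"[$optionName] $message")

  // NumberFormatException extends IllegalArgumentException, so a single catch
  // covers both a failed .toBoolean and a failed .toInt conversion.
  def parseOption[T](optionName: String, value: String, converter: String => T, typeName: String): T =
    try converter(value)
    catch {
      case _: IllegalArgumentException =>
        throw InvalidOptionValue(optionName, s"Cannot cast value '$value' to $typeName.")
    }

  def main(args: Array[String]): Unit = {
    println(parseOption("recursive.fields.max.depth", "3", _.toInt, "Int"))        // prints 3
    println(parseOption("enums.as.ints", "not_a_bool", _.toBoolean, "Boolean"))    // throws InvalidOptionValue
  }
}
```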
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]