This is an automated email from the ASF dual-hosted git repository.
wenchen pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new ffaeeb47633b [SPARK-50953][FOLLOW-UP] Improve path parsing in
variant_get
ffaeeb47633b is described below
commit ffaeeb47633b08fa45603ceef3256917579a90f7
Author: Harsh Motwani <[email protected]>
AuthorDate: Fri Mar 21 14:19:12 2025 +0800
[SPARK-50953][FOLLOW-UP] Improve path parsing in variant_get
### What changes were proposed in this pull request?
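The variant_get paths currently do not allow '?' in keys or empty strings
as keys. This change relaxes the bracketed key patterns in `VariantPathParser`
so that `['...']` and `["..."]` accept any characters other than the enclosing
quote character, including an empty key.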
**Note:** Even after this fix, some paths are still prohibited -
particularly those whose keys contain both single and double quotes. For
example, there is no way to extract 1 from `{"\"sample string's\"" : 1}`. If
you use the double-quote notation, double quotes are prohibited inside the key.
If you use the single-quote notation, single quotes are prohibited inside the
key. I tried to add support for escaped double quotes in the path, but that
does not behave the same way because parse_json gets rid of the `\`.
### Why are the changes needed?
JSON strings with empty keys and keys containing '?' can be parsed as a
variant using `parse_json`. However, there is currently no way to extract
the values corresponding to these keys using variant_get. For example,
`variant_get(parse_json('{"" : 1}'), '$[""]')` and
`variant_get(parse_json('{"?" : 1}'), '$["?"]')` fail.
### Does this PR introduce _any_ user-facing change?
Yes, users would now be able to extract a more diverse set of paths from a
variant.
### How was this patch tested?
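Unit tests in `VariantExpressionSuite`: new `variant_get` cases for the empty
key and the '?' key using both the single-quote and double-quote bracket
notations, plus invalid-path checks for keys containing a double quote inside
the double-quote notation.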
### Was this patch authored or co-authored using generative AI tooling?
No
Closes #50342 from harshmotw-db/harsh-motwani_data/variant_get_fix.
Authored-by: Harsh Motwani <[email protected]>
Signed-off-by: Wenchen Fan <[email protected]>
(cherry picked from commit 59b0fca69d31ee57b5c3b8b0df7757a7c165b4c9)
Signed-off-by: Wenchen Fan <[email protected]>
---
.../sql/catalyst/expressions/variant/variantExpressions.scala | 4 ++--
.../catalyst/expressions/variant/VariantExpressionSuite.scala | 9 ++++++++-
2 files changed, 10 insertions(+), 3 deletions(-)
diff --git
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala
index fcd760561f90..027acc64d73f 100644
---
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala
+++
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala
@@ -206,8 +206,8 @@ object VariantPathParser extends RegexParsers {
// Parse key segment like `.name`, `['name']`, or `["name"]`.
private def key: Parser[VariantPathSegment] =
for {
- key <- '.' ~> "[^\\.\\[]+".r | "['" ~> "[^\\'\\?]+".r <~ "']" |
- "[\"" ~> "[^\\\"\\?]+".r <~ "\"]"
+ key <- '.' ~> "[^\\.\\[]+".r | "['" ~> "[^']*".r <~ "']" |
+ "[\"" ~> """[^"]*""".r <~ "\"]"
} yield {
ObjectExtraction(key)
}
diff --git
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionSuite.scala
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionSuite.scala
index 117436a02393..df816bd8165f 100644
---
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionSuite.scala
+++
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionSuite.scala
@@ -455,7 +455,7 @@ class VariantExpressionSuite extends SparkFunSuite with
ExpressionEvalHelper {
|"category":"fiction","reader":[{"age":25,"name":"bob"},{"age":26,"name":"jack"}],
|"price":22.99,"isbn":"0-395-19395-8"}],"bicycle":{"price":19.95,"color":"red"}},
|"email":"amy@only_for_json_udf_test.net","owner":"amy","zip
code":"94025",
- |"fb:testid":"1234"}
+ |"fb:testid":"1234","":"empty string","?":"Question Mark?"}
|""".stripMargin
testVariantGet(json, "$.store.bicycle", StringType,
"""{"color":"red","price":19.95}""")
checkEvaluation(
@@ -469,6 +469,10 @@ class VariantExpressionSuite extends SparkFunSuite with
ExpressionEvalHelper {
)
testVariantGet(json, "$.store.bicycle.color", StringType, "red")
testVariantGet(json, "$.store.bicycle.price", DoubleType, 19.95)
+ testVariantGet(json, "$[\"\"]", StringType, "empty string")
+ testVariantGet(json, "$['']", StringType, "empty string")
+ testVariantGet(json, "$[\"?\"]", StringType, "Question Mark?")
+ testVariantGet(json, "$['?']", StringType, "Question Mark?")
testVariantGet(
json,
"$.store.book",
@@ -678,6 +682,9 @@ class VariantExpressionSuite extends SparkFunSuite with
ExpressionEvalHelper {
checkInvalidPath("$1")
checkInvalidPath("$[-1]")
checkInvalidPath("""$['"]""")
+
+ checkInvalidPath("$[\"\"\"]")
+ checkInvalidPath("$[\"\\\"\"]")
}
test("cast from variant") {
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]