This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/branch-4.0 by this push:
     new ffaeeb47633b [SPARK-50953][FOLLOW-UP] Improve path parsing in 
variant_get
ffaeeb47633b is described below

commit ffaeeb47633b08fa45603ceef3256917579a90f7
Author: Harsh Motwani <[email protected]>
AuthorDate: Fri Mar 21 14:19:12 2025 +0800

    [SPARK-50953][FOLLOW-UP] Improve path parsing in variant_get
    
    ### What changes were proposed in this pull request?
    
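    The variant_get paths currently do not allow '?' in keys or empty strings
    as keys. This PR relaxes the key patterns in `VariantPathParser` so that
    both are accepted in the `['...']` and `["..."]` notations.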
    
    **Note:** Even after this fix, some paths are still prohibited,
    particularly those whose keys contain both single and double quotes. For
    example, there is no way to extract 1 from `{"\"sample string's\"" : 1}`:
    the double-quote notation prohibits double quotes in the key, and the
    single-quote notation prohibits single quotes. I tried to add support for
    escaped double quotes in the path, but it is not the same because
    parse_json gets rid of the `\`.
    
    ### Why are the changes needed?
    
    JSON strings with empty keys and keys containing '?' can be parsed as
    variant using `parse_json`. However, there is currently no way to extract
    the values corresponding to these keys using variant_get. For example,
    `variant_get(parse_json('{"" : 1}'), '$[""]')` and
    `variant_get(parse_json('{"?" : 1}'), '$["?"]')` both fail.
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, users can now extract values from a variant using a wider set of
    paths.
    
    ### How was this patch tested?
    
    Unit tests
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No
    
    Closes #50342 from harshmotw-db/harsh-motwani_data/variant_get_fix.
    
    Authored-by: Harsh Motwani <[email protected]>
    Signed-off-by: Wenchen Fan <[email protected]>
    (cherry picked from commit 59b0fca69d31ee57b5c3b8b0df7757a7c165b4c9)
    Signed-off-by: Wenchen Fan <[email protected]>
---
 .../sql/catalyst/expressions/variant/variantExpressions.scala    | 4 ++--
 .../catalyst/expressions/variant/VariantExpressionSuite.scala    | 9 ++++++++-
 2 files changed, 10 insertions(+), 3 deletions(-)

diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala
index fcd760561f90..027acc64d73f 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/variant/variantExpressions.scala
@@ -206,8 +206,8 @@ object VariantPathParser extends RegexParsers {
   // Parse key segment like `.name`, `['name']`, or `["name"]`.
   private def key: Parser[VariantPathSegment] =
     for {
-      key <- '.' ~> "[^\\.\\[]+".r | "['" ~> "[^\\'\\?]+".r <~ "']" |
-        "[\"" ~> "[^\\\"\\?]+".r <~ "\"]"
+      key <- '.' ~> "[^\\.\\[]+".r | "['" ~> "[^']*".r <~ "']" |
+        "[\"" ~> """[^"]*""".r <~ "\"]"
     } yield {
       ObjectExtraction(key)
     }
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionSuite.scala
index 117436a02393..df816bd8165f 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/variant/VariantExpressionSuite.scala
@@ -455,7 +455,7 @@ class VariantExpressionSuite extends SparkFunSuite with 
ExpressionEvalHelper {
         
|"category":"fiction","reader":[{"age":25,"name":"bob"},{"age":26,"name":"jack"}],
         
|"price":22.99,"isbn":"0-395-19395-8"}],"bicycle":{"price":19.95,"color":"red"}},
         |"email":"amy@only_for_json_udf_test.net","owner":"amy","zip 
code":"94025",
-        |"fb:testid":"1234"}
+        |"fb:testid":"1234","":"empty string","?":"Question Mark?"}
         |""".stripMargin
     testVariantGet(json, "$.store.bicycle", StringType, 
"""{"color":"red","price":19.95}""")
     checkEvaluation(
@@ -469,6 +469,10 @@ class VariantExpressionSuite extends SparkFunSuite with 
ExpressionEvalHelper {
     )
     testVariantGet(json, "$.store.bicycle.color", StringType, "red")
     testVariantGet(json, "$.store.bicycle.price", DoubleType, 19.95)
+    testVariantGet(json, "$[\"\"]", StringType, "empty string")
+    testVariantGet(json, "$['']", StringType, "empty string")
+    testVariantGet(json, "$[\"?\"]", StringType, "Question Mark?")
+    testVariantGet(json, "$['?']", StringType, "Question Mark?")
     testVariantGet(
       json,
       "$.store.book",
@@ -678,6 +682,9 @@ class VariantExpressionSuite extends SparkFunSuite with 
ExpressionEvalHelper {
     checkInvalidPath("$1")
     checkInvalidPath("$[-1]")
     checkInvalidPath("""$['"]""")
+
+    checkInvalidPath("$[\"\"\"]")
+    checkInvalidPath("$[\"\\\"\"]")
   }
 
   test("cast from variant") {


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to