This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git


The following commit(s) were added to refs/heads/master by this push:
     new 9c675099c9bb [SPARK-54634][SQL] Add clear error message for empty IN 
predicate
9c675099c9bb is described below

commit 9c675099c9bbe1076fed6564738486bb7742ebe6
Author: sahilkumarsingh <[email protected]>
AuthorDate: Thu Jan 8 15:50:13 2026 +0800

    [SPARK-54634][SQL] Add clear error message for empty IN predicate
    
    ### What changes were proposed in this pull request?
    
    This PR will address the issue 
[SPARK-54634](https://issues.apache.org/jira/browse/SPARK-54634).
    
    With this, I am adding a user-friendly error message when users write SQL 
queries with an empty IN clause, like: _SELECT * FROM table WHERE col IN ()_
    
    ### Why are the changes needed?
    
    When users write SQL with an empty IN clause, Spark currently produces a 
syntax error of subclass [PARSE_SYNTAX_ERROR], which leads the user to believe 
that their syntax is incorrect, whereas the actual issue is due to the absence 
of values for the IN clause. Hence, the current error message does not 
communicate the right intention to the user.
    
    This change provides a clear, actionable error message that explains the 
actual problem
    and suggests alternatives.
    
    Example - Before:
    ```
    org.apache.spark.sql.catalyst.parser.ParseException:
    [PARSE_SYNTAX_ERROR] Syntax error at or near 'IN'. SQLSTATE: 42601 (line 1, 
pos 33)
    ```
    
    Example - After:
    ```
    org.apache.spark.sql.catalyst.parser.ParseException:
    [INVALID_SQL_SYNTAX.EMPTY_IN_PREDICATE] Invalid SQL syntax: IN predicate 
requires at least one value. Empty IN clauses like 'IN ()' are not allowed. 
Consider using 'WHERE FALSE' if you need an always-false condition, or provide 
at least one value in the IN list. SQLSTATE: 42000
    ```
    
    ### Does this PR introduce _any_ user-facing change?
    
    Yes, users will now see a better error message.
    
    Code executed: `spark.sql("SELECT * FROM range(10) WHERE id IN ()").show()`
    
    Before output:
    <img width="1080" height="309" alt="image" 
src="https://github.com/user-attachments/assets/1c6503e5-9e9c-4380-a9c0-87c9122417e6";
 />
    
    After output:
    <img width="1080" height="342" alt="image" 
src="https://github.com/user-attachments/assets/36169839-1ff4-4743-8666-cf87cfa622b5";
 />
    
    ### How was this patch tested?
    
    - I have added unit tests in QueryParsingErrorsSuite.scala and SQL golden 
tests added in predicate-functions.sql
    - I have also tested the build locally by running the query in spark-shell
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    Generated-by: Claude (Anthropic) - used for code assistance, test 
generation, and documentation.
    
    Closes #53390 from sahilkumarsingh/SPARK-54634-empty-in-predicate-error.
    
    Authored-by: sahilkumarsingh <[email protected]>
    Signed-off-by: Wenchen Fan <[email protected]>
---
 .../src/main/resources/error/error-conditions.json |  5 +++
 .../spark/sql/catalyst/parser/SqlBaseParser.g4     |  2 +-
 .../spark/sql/errors/QueryParsingErrors.scala      |  7 ++++
 .../spark/sql/catalyst/parser/AstBuilder.scala     |  4 +++
 .../sql/catalyst/parser/PlanParserSuite.scala      |  2 +-
 .../analyzer-results/predicate-functions.sql.out   | 34 +++++++++++++++++++
 .../sql-tests/inputs/predicate-functions.sql       |  4 +++
 .../sql-tests/results/predicate-functions.sql.out  | 36 ++++++++++++++++++++
 .../spark/sql/errors/QueryParsingErrorsSuite.scala | 38 ++++++++++++++++++++++
 9 files changed, 130 insertions(+), 2 deletions(-)

diff --git a/common/utils/src/main/resources/error/error-conditions.json 
b/common/utils/src/main/resources/error/error-conditions.json
index a6cf280fe5f9..968d3678008f 100644
--- a/common/utils/src/main/resources/error/error-conditions.json
+++ b/common/utils/src/main/resources/error/error-conditions.json
@@ -3920,6 +3920,11 @@
           "CREATE TEMPORARY TABLE ... USING ... is a deprecated syntax. To 
overcome the issue, please use CREATE TEMPORARY VIEW instead."
         ]
       },
+      "EMPTY_IN_PREDICATE" : {
+        "message" : [
+          "IN predicate requires at least one value. Empty IN clauses like 'IN 
()' are not allowed. Consider using 'WHERE FALSE' if you need an always-false 
condition, or provide at least one value in the IN list."
+        ]
+      },
       "EMPTY_PARTITION_VALUE" : {
         "message" : [
           "Partition key <partKey> must set value."
diff --git 
a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4 
b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
index 21b8781f43ce..05662cd0beef 100644
--- 
a/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
+++ 
b/sql/api/src/main/antlr4/org/apache/spark/sql/catalyst/parser/SqlBaseParser.g4
@@ -1214,7 +1214,7 @@ booleanExpression
 
 predicate
     : errorCapturingNot? kind=BETWEEN lower=valueExpression AND 
upper=valueExpression
-    | errorCapturingNot? kind=IN LEFT_PAREN expression (COMMA expression)* 
RIGHT_PAREN
+    | errorCapturingNot? kind=IN (LEFT_PAREN RIGHT_PAREN | LEFT_PAREN 
expression (COMMA expression)* RIGHT_PAREN)
     | errorCapturingNot? kind=IN LEFT_PAREN query RIGHT_PAREN
     | errorCapturingNot? kind=RLIKE pattern=valueExpression
     | errorCapturingNot? kind=(LIKE | ILIKE) quantifier=(ANY | SOME | ALL) 
(LEFT_PAREN RIGHT_PAREN | LEFT_PAREN expression (COMMA expression)* RIGHT_PAREN)
diff --git 
a/sql/api/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala 
b/sql/api/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala
index cd302ef6fcca..e370010564b1 100644
--- 
a/sql/api/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala
+++ 
b/sql/api/src/main/scala/org/apache/spark/sql/errors/QueryParsingErrors.scala
@@ -830,6 +830,13 @@ private[sql] object QueryParsingErrors extends 
DataTypeErrorsBase {
       ctx)
   }
 
+  def emptyInPredicateError(ctx: ParserRuleContext): Throwable = {
+    new ParseException(
+      errorClass = "INVALID_SQL_SYNTAX.EMPTY_IN_PREDICATE",
+      messageParameters = Map.empty,
+      ctx)
+  }
+
   /**
    * Throws an internal error for unexpected parameter markers found during 
AST building. This
    * should be unreachable in normal operation due to grammar-level blocking.
diff --git 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
index 578f0fffe72e..780a06060341 100644
--- 
a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
+++ 
b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/parser/AstBuilder.scala
@@ -2841,6 +2841,10 @@ class AstBuilder extends DataTypeAstBuilder
       case SqlBaseParser.IN if ctx.query != null =>
         invertIfNotDefined(InSubquery(getValueExpressions(e), 
ListQuery(plan(ctx.query))))
       case SqlBaseParser.IN =>
+        // Validate that IN clause is not empty
+        if (ctx.expression.isEmpty) {
+          throw QueryParsingErrors.emptyInPredicateError(ctx)
+        }
         invertIfNotDefined(In(e, ctx.expression.asScala.map(expression).toSeq))
       case SqlBaseParser.LIKE | SqlBaseParser.ILIKE =>
         Option(ctx.quantifier).map(_.getType) match {
diff --git 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
index 59205f308471..0128a3f090c5 100644
--- 
a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
+++ 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/parser/PlanParserSuite.scala
@@ -1395,7 +1395,7 @@ class PlanParserSuite extends AnalysisTest {
     checkError(
       exception = parseException(sql2),
       condition = "PARSE_SYNTAX_ERROR",
-      parameters = Map("error" -> "'IN'", "hint" -> ""))
+      parameters = Map("error" -> "'INTO'", "hint" -> ""))
   }
 
   test("relation in v2 catalog") {
diff --git 
a/sql/core/src/test/resources/sql-tests/analyzer-results/predicate-functions.sql.out
 
b/sql/core/src/test/resources/sql-tests/analyzer-results/predicate-functions.sql.out
index 55822a10041f..d894ce26af49 100644
--- 
a/sql/core/src/test/resources/sql-tests/analyzer-results/predicate-functions.sql.out
+++ 
b/sql/core/src/test/resources/sql-tests/analyzer-results/predicate-functions.sql.out
@@ -517,6 +517,40 @@ Project [NOT cast(null as int) IN (cast(1 as int),cast(2 
as int),cast(null as in
 +- OneRowRelation
 
 
+-- !query
+select 1 in ()
+-- !query analysis
+org.apache.spark.sql.catalyst.parser.ParseException
+{
+  "errorClass" : "INVALID_SQL_SYNTAX.EMPTY_IN_PREDICATE",
+  "sqlState" : "42000",
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 10,
+    "stopIndex" : 14,
+    "fragment" : "in ()"
+  } ]
+}
+
+
+-- !query
+select 1 not in ()
+-- !query analysis
+org.apache.spark.sql.catalyst.parser.ParseException
+{
+  "errorClass" : "INVALID_SQL_SYNTAX.EMPTY_IN_PREDICATE",
+  "sqlState" : "42000",
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 10,
+    "stopIndex" : 18,
+    "fragment" : "not in ()"
+  } ]
+}
+
+
 -- !query
 select 1 between 0 and 2
 -- !query analysis
diff --git 
a/sql/core/src/test/resources/sql-tests/inputs/predicate-functions.sql 
b/sql/core/src/test/resources/sql-tests/inputs/predicate-functions.sql
index 195db17a3a1f..314ac205b7dd 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/predicate-functions.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/predicate-functions.sql
@@ -94,6 +94,10 @@ select 1 not in ('2', '3', '4', null);
 select null not in (1, 2, 3);
 select null not in (1, 2, null);
 
+-- Empty IN clause (negative case - should error)
+select 1 in ();
+select 1 not in ();
+
 -- Between
 select 1 between 0 and 2;
 select 0.5 between 0 and 1;
diff --git 
a/sql/core/src/test/resources/sql-tests/results/predicate-functions.sql.out 
b/sql/core/src/test/resources/sql-tests/results/predicate-functions.sql.out
index 10ff268a5606..dd87b536d1d6 100644
--- a/sql/core/src/test/resources/sql-tests/results/predicate-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/predicate-functions.sql.out
@@ -743,6 +743,42 @@ struct<(NOT (NULL IN (1, 2, NULL))):boolean>
 -- !query output
 NULL
 
+-- !query
+select 1 in ()
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+{
+  "errorClass" : "INVALID_SQL_SYNTAX.EMPTY_IN_PREDICATE",
+  "sqlState" : "42000",
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 10,
+    "stopIndex" : 14,
+    "fragment" : "in ()"
+  } ]
+}
+
+
+-- !query
+select 1 not in ()
+-- !query schema
+struct<>
+-- !query output
+org.apache.spark.sql.catalyst.parser.ParseException
+{
+  "errorClass" : "INVALID_SQL_SYNTAX.EMPTY_IN_PREDICATE",
+  "sqlState" : "42000",
+  "queryContext" : [ {
+    "objectType" : "",
+    "objectName" : "",
+    "startIndex" : 10,
+    "stopIndex" : 18,
+    "fragment" : "not in ()"
+  } ]
+}
 
 -- !query
 select 1 between 0 and 2
diff --git 
a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryParsingErrorsSuite.scala
 
b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryParsingErrorsSuite.scala
index 629d85f19b0a..a58e2d9a8a50 100644
--- 
a/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryParsingErrorsSuite.scala
+++ 
b/sql/core/src/test/scala/org/apache/spark/sql/errors/QueryParsingErrorsSuite.scala
@@ -726,4 +726,42 @@ class QueryParsingErrorsSuite extends QueryTest with 
SharedSparkSession with SQL
         start = 32,
         stop = 58))
   }
+
+  test("INVALID_SQL_SYNTAX.EMPTY_IN_PREDICATE: Empty IN clause") {
+    // Test with single column IN ()
+    // PredicateContext captures "IN ()" starting at position 33
+    checkError(
+      exception = parseException("SELECT * FROM range(10) WHERE id IN ()"),
+      condition = "INVALID_SQL_SYNTAX.EMPTY_IN_PREDICATE",
+      sqlState = "42000",
+      parameters = Map.empty,
+      context = ExpectedContext(
+        fragment = "IN ()",
+        start = 33,
+        stop = 37))
+
+    // Test with expression IN ()
+    // PredicateContext captures "IN ()" starting at position 39
+    checkError(
+      exception = parseException("SELECT * FROM range(10) WHERE (id + 1) IN 
()"),
+      condition = "INVALID_SQL_SYNTAX.EMPTY_IN_PREDICATE",
+      sqlState = "42000",
+      parameters = Map.empty,
+      context = ExpectedContext(
+        fragment = "IN ()",
+        start = 39,
+        stop = 43))
+
+    // Test with NOT IN ()
+    // PredicateContext captures "NOT IN ()" starting at position 33
+    checkError(
+      exception = parseException("SELECT * FROM range(10) WHERE id NOT IN ()"),
+      condition = "INVALID_SQL_SYNTAX.EMPTY_IN_PREDICATE",
+      sqlState = "42000",
+      parameters = Map.empty,
+      context = ExpectedContext(
+        fragment = "NOT IN ()",
+        start = 33,
+        stop = 41))
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to