This is an automated email from the ASF dual-hosted git repository.
maxgekk pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new c0d9ca3be14c [SPARK-45400][SQL][DOCS] Refer to the unescaping rules from expression descriptions
c0d9ca3be14c is described below
commit c0d9ca3be14cb0ec8d8f9920d3ecc4aac3cf5adc
Author: Max Gekk <[email protected]>
AuthorDate: Thu Oct 5 22:22:29 2023 +0300
[SPARK-45400][SQL][DOCS] Refer to the unescaping rules from expression descriptions
### What changes were proposed in this pull request?
In this PR, I propose to refer to the unescaping rules added by https://github.com/apache/spark/pull/43152 from expression descriptions such as `Like`'s; see the screenshot:
<img width="1057" alt="Screenshot 2023-10-05 at 19 15 17" src="https://github.com/apache/spark/assets/1580697/6a332b50-f2c8-4549-848a-61519c9f964e">
### Why are the changes needed?
To improve the user experience with Spark SQL.
### Does this PR introduce _any_ user-facing change?
No.
### How was this patch tested?
Manually generated the docs and checked them visually.
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #43203 from MaxGekk/link-to-escape-doc.
Authored-by: Max Gekk <[email protected]>
Signed-off-by: Max Gekk <[email protected]>
---
docs/sql-ref-literals.md | 2 +
.../catalyst/expressions/regexpExpressions.scala | 70 ++++++++++++++--------
2 files changed, 47 insertions(+), 25 deletions(-)
diff --git a/docs/sql-ref-literals.md b/docs/sql-ref-literals.md
index e9447af71c54..2a02a22bd6f0 100644
--- a/docs/sql-ref-literals.md
+++ b/docs/sql-ref-literals.md
@@ -62,6 +62,8 @@ The following escape sequences are recognized in regular string literals (withou
- `\_` -> `\_`;
- `\<other char>` -> `<other char>`, skip the slash and leave the character as is.
+The unescaping rules above can be turned off by setting the SQL config `spark.sql.parser.escapedStringLiterals` to `true`.
+
#### Examples
```sql
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
index 69d90296d7ff..87ea8b5a102a 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
@@ -77,7 +77,7 @@ abstract class StringRegexExpression extends BinaryExpression
}
}
-// scalastyle:off line.contains.tab
+// scalastyle:off line.contains.tab line.size.limit
/**
* Simple RegEx pattern matching function
*/
@@ -92,11 +92,14 @@ abstract class StringRegexExpression extends BinaryExpression
_ matches any one character in the input (similar to . in posix regular expressions)\
% matches zero or more characters in the input (similar to .* in posix regular expressions)<br><br>
- Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order
- to match "\abc", the pattern should be "\\abc".<br><br>
+ Since Spark 2.0, string literals are unescaped in our SQL parser; see the unescaping
+ rules at <a href="https://spark.apache.org/docs/latest/sql-ref-literals.html#string-literal">String Literal</a>.
+ For example, in order to match "\abc", the pattern should be "\\abc".<br><br>
When SQL config 'spark.sql.parser.escapedStringLiterals' is enabled, it falls back
to Spark 1.6 behavior regarding string literal parsing. For example, if the config is
- enabled, the pattern to match "\abc" should be "\abc".
+ enabled, the pattern to match "\abc" should be "\abc".<br><br>
+ It's recommended to use a raw string literal (with the `r` prefix) to avoid escaping
+ special characters in the pattern string, if any exist.
* escape - a character added since Spark 3.0. The default escape character is the '\'.
If an escape character precedes a special symbol or another escape character, the
following character is matched literally. It is invalid to escape any other character.
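As an aside, a sketch of what the raw-string recommendation buys for `LIKE` (these mirror examples from the function's own documentation; default configs assumed):

```sql
-- Default mode: backslashes in the pattern must be doubled (once for the
-- parser, once for LIKE's escape character).
SELECT '%SystemDrive%\\Users\\John' LIKE '\%SystemDrive\%\\\\Users%';  -- true
-- A raw string literal (r prefix) skips parser unescaping entirely.
SELECT r'%SystemDrive%\Users\John' LIKE r'%SystemDrive%\\Users%';      -- true
```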
@@ -121,7 +124,7 @@ abstract class StringRegexExpression extends BinaryExpression
""",
since = "1.0.0",
group = "predicate_funcs")
-// scalastyle:on line.contains.tab
+// scalastyle:on line.contains.tab line.size.limit
case class Like(left: Expression, right: Expression, escapeChar: Char)
extends StringRegexExpression {
@@ -207,11 +210,14 @@ case class Like(left: Expression, right: Expression, escapeChar: Char)
_ matches any one character in the input (similar to . in posix regular expressions)<br><br>
% matches zero or more characters in the input (similar to .* in posix regular expressions)<br><br>
- Since Spark 2.0, string literals are unescaped in our SQL parser. For example, in order
- to match "\abc", the pattern should be "\\abc".<br><br>
+ Since Spark 2.0, string literals are unescaped in our SQL parser; see the unescaping
+ rules at <a href="https://spark.apache.org/docs/latest/sql-ref-literals.html#string-literal">String Literal</a>.
+ For example, in order to match "\abc", the pattern should be "\\abc".<br><br>
When SQL config 'spark.sql.parser.escapedStringLiterals' is enabled, it falls back
to Spark 1.6 behavior regarding string literal parsing. For example, if the config is
- enabled, the pattern to match "\abc" should be "\abc".
+ enabled, the pattern to match "\abc" should be "\abc".<br><br>
+ It's recommended to use a raw string literal (with the `r` prefix) to avoid escaping
+ special characters in the pattern string, if any exist.
* escape - a character added since Spark 3.0. The default escape character is the '\'.
If an escape character precedes a special symbol or another escape character, the
following character is matched literally. It is invalid to escape any other character.
@@ -412,7 +418,7 @@ case class NotLikeAny(child: Expression, patterns: Seq[UTF8String]) extends Like
copy(child = newChild)
}
-// scalastyle:off line.contains.tab
+// scalastyle:off line.contains.tab line.size.limit
@ExpressionDescription(
usage = "_FUNC_(str, regexp) - Returns true if `str` matches `regexp`, or
false otherwise.",
arguments = """
@@ -421,12 +427,14 @@ case class NotLikeAny(child: Expression, patterns: Seq[UTF8String]) extends Like
* regexp - a string expression. The regex string should be a Java regular expression.
Since Spark 2.0, string literals (including regex patterns) are unescaped in our SQL
- parser. For example, to match "\abc", a regular expression for `regexp` can be
- "^\\abc$".
+ parser; see the unescaping rules at <a href="https://spark.apache.org/docs/latest/sql-ref-literals.html#string-literal">String Literal</a>.
+ For example, to match "\abc", a regular expression for `regexp` can be "^\\abc$".
There is a SQL config 'spark.sql.parser.escapedStringLiterals' that can be used to
fall back to the Spark 1.6 behavior regarding string literal parsing. For example,
- if the config is enabled, the `regexp` that can match "\abc" is "^\abc$".
+ if the config is enabled, the `regexp` that can match "\abc" is "^\abc$".<br><br>
+ It's recommended to use a raw string literal (with the `r` prefix) to avoid escaping
+ special characters in the pattern string, if any exist.
""",
examples = """
Examples:
@@ -444,7 +452,7 @@ case class NotLikeAny(child: Expression, patterns: Seq[UTF8String]) extends Like
""",
since = "1.0.0",
group = "predicate_funcs")
-// scalastyle:on line.contains.tab
+// scalastyle:on line.contains.tab line.size.limit
case class RLike(left: Expression, right: Expression) extends StringRegexExpression {
override def escape(v: String): String = v
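A quick sketch of the same raw-string advice for `RLIKE` (mirrors an example from the function's documentation; default configs assumed):

```sql
-- In a raw string, the regex escape \\ (match one backslash) is written as typed.
SELECT r'%SystemDrive%\Users\John' RLIKE r'%SystemDrive%\\Users.*';  -- true
```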
@@ -573,11 +581,13 @@ case class StringSplit(str: Expression, regex: Expression, limit: Expression)
* regexp - a string representing a regular expression. The regex string should be a
Java regular expression.<br><br>
Since Spark 2.0, string literals (including regex patterns) are unescaped in our SQL
- parser. For example, to match "\abc", a regular expression for `regexp` can be
- "^\\abc$".<br><br>
+ parser; see the unescaping rules at <a href="https://spark.apache.org/docs/latest/sql-ref-literals.html#string-literal">String Literal</a>.
+ For example, to match "\abc", a regular expression for `regexp` can be "^\\abc$".<br><br>
There is a SQL config 'spark.sql.parser.escapedStringLiterals' that can be used to
fall back to the Spark 1.6 behavior regarding string literal parsing. For example,
- if the config is enabled, the `regexp` that can match "\abc" is "^\abc$".
+ if the config is enabled, the `regexp` that can match "\abc" is "^\abc$".<br><br>
+ It's recommended to use a raw string literal (with the `r` prefix) to avoid escaping
+ special characters in the pattern string, if any exist.
* rep - a string expression to replace matched substrings.
* position - a positive integer literal that indicates the position within `str` to begin searching.
The default is 1. If position is greater than the number of characters in `str`, the result is `str`.
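The same advice applied to `regexp_replace`, as a sketch (the output matches the function's documented example; r'(\d+)' is equivalent to '(\\d+)' under the default unescaping rules):

```sql
-- Replace every run of digits with the word 'num'.
SELECT regexp_replace('100-200', r'(\d+)', 'num');  -- num-num
```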
@@ -774,6 +784,7 @@ abstract class RegExpExtractBase
*
* NOTE: this expression is not THREAD-SAFE, as it has some internal mutable status.
*/
+// scalastyle:off line.size.limit
@ExpressionDescription(
usage = """
_FUNC_(str, regexp[, idx]) - Extract the first string in the `str` that matches the `regexp`
@@ -785,11 +796,13 @@ abstract class RegExpExtractBase
* regexp - a string representing a regular expression. The regex string should be a
Java regular expression.<br><br>
Since Spark 2.0, string literals (including regex patterns) are unescaped in our SQL
- parser. For example, to match "\abc", a regular expression for `regexp` can be
- "^\\abc$".<br><br>
+ parser; see the unescaping rules at <a href="https://spark.apache.org/docs/latest/sql-ref-literals.html#string-literal">String Literal</a>.
+ For example, to match "\abc", a regular expression for `regexp` can be "^\\abc$".<br><br>
There is a SQL config 'spark.sql.parser.escapedStringLiterals' that can be used to
fall back to the Spark 1.6 behavior regarding string literal parsing. For example,
- if the config is enabled, the `regexp` that can match "\abc" is "^\abc$".
+ if the config is enabled, the `regexp` that can match "\abc" is "^\abc$".<br><br>
+ It's recommended to use a raw string literal (with the `r` prefix) to avoid escaping
+ special characters in the pattern string, if any exist.
* idx - an integer expression that represents the group index. The regex may contain
multiple groups. `idx` indicates which regex group to extract. The group index should
be non-negative. The minimum value of `idx` is 0, which means matching the entire
@@ -803,6 +816,7 @@ abstract class RegExpExtractBase
""",
since = "1.5.0",
group = "string_funcs")
+// scalastyle:on line.size.limit
case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expression)
extends RegExpExtractBase {
def this(s: Expression, r: Expression) = this(s, r, Literal(1))
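For `regexp_extract`, a one-line sketch of the raw-string form (the doubled-backslash version '(\\d+)-(\\d+)' appears in the function's own documented examples):

```sql
-- Extract the first capture group of the first match.
SELECT regexp_extract('100-200', r'(\d+)-(\d+)', 1);  -- 100
```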
@@ -866,6 +880,7 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
*
* NOTE: this expression is not THREAD-SAFE, as it has some internal mutable status.
*/
+// scalastyle:off line.size.limit
@ExpressionDescription(
usage = """
_FUNC_(str, regexp[, idx]) - Extract all strings in the `str` that match the `regexp`
@@ -877,11 +892,13 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
* regexp - a string representing a regular expression. The regex string should be a
Java regular expression.<br><br>
Since Spark 2.0, string literals (including regex patterns) are unescaped in our SQL
- parser. For example, to match "\abc", a regular expression for `regexp` can be
- "^\\abc$".<br><br>
+ parser; see the unescaping rules at <a href="https://spark.apache.org/docs/latest/sql-ref-literals.html#string-literal">String Literal</a>.
+ For example, to match "\abc", a regular expression for `regexp` can be "^\\abc$".<br><br>
There is a SQL config 'spark.sql.parser.escapedStringLiterals' that can be used to
fall back to the Spark 1.6 behavior regarding string literal parsing. For example,
- if the config is enabled, the `regexp` that can match "\abc" is "^\abc$".
+ if the config is enabled, the `regexp` that can match "\abc" is "^\abc$".<br><br>
+ It's recommended to use a raw string literal (with the `r` prefix) to avoid escaping
+ special characters in the pattern string, if any exist.
* idx - an integer expression that represents the group index. The regex may contain
multiple groups. `idx` indicates which regex group to extract. The group index should
be non-negative. The minimum value of `idx` is 0, which means matching the entire
@@ -895,6 +912,7 @@ case class RegExpExtract(subject: Expression, regexp: Expression, idx: Expressio
""",
since = "3.1.0",
group = "string_funcs")
+// scalastyle:on line.size.limit
case class RegExpExtractAll(subject: Expression, regexp: Expression, idx: Expression)
extends RegExpExtractBase {
def this(s: Expression, r: Expression) = this(s, r, Literal(1))
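And for `regexp_extract_all`, a sketch along the same lines (the expected output follows the documented example for the doubled-backslash pattern):

```sql
-- Collect group 1 from every match.
SELECT regexp_extract_all('100-200, 300-400', r'(\d+)-(\d+)', 1);  -- ["100","300"]
```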
@@ -1047,11 +1065,13 @@ case class RegExpSubStr(left: Expression, right: Expression)
* regexp - a string representing a regular expression. The regex string should be a
Java regular expression.<br><br>
Since Spark 2.0, string literals (including regex patterns) are unescaped in our SQL
- parser. For example, to match "\abc", a regular expression for `regexp` can be
- "^\\abc$".<br><br>
+ parser; see the unescaping rules at <a href="https://spark.apache.org/docs/latest/sql-ref-literals.html#string-literal">String Literal</a>.
+ For example, to match "\abc", a regular expression for `regexp` can be "^\\abc$".<br><br>
There is a SQL config 'spark.sql.parser.escapedStringLiterals' that can be used to
fall back to the Spark 1.6 behavior regarding string literal parsing. For example,
- if the config is enabled, the `regexp` that can match "\abc" is "^\abc$".
+ if the config is enabled, the `regexp` that can match "\abc" is "^\abc$".<br><br>
+ It's recommended to use a raw string literal (with the `r` prefix) to avoid escaping
+ special characters in the pattern string, if any exist.
""",
examples = """
Examples:
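Finally, a sketch of the legacy-parser path that every description above references (the RLIKE example mirrors the one documented for the config):

```sql
-- Legacy parsing: the parser does not unescape literals, so the regex
-- escape \\ (one backslash) is written directly in the pattern.
SET spark.sql.parser.escapedStringLiterals=true;
SELECT '%SystemDrive%\Users\John' RLIKE '%SystemDrive%\\Users.*';  -- true
```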
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]