This is an automated email from the ASF dual-hosted git repository.

wenchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git

The following commit(s) were added to refs/heads/master by this push:
     new 30585c1d69dd [SPARK-49968][SQL] The split function produces incorrect results with an empty regex and a limit
30585c1d69dd is described below

commit 30585c1d69dde925a77fa4c0d822f99b381390c4
Author: DenineLu <denin...@163.com>
AuthorDate: Fri Jul 25 17:51:42 2025 +0800

[SPARK-49968][SQL] The split function produces incorrect results with an empty regex and a limit

### What changes were proposed in this pull request?

After [SPARK-40194](https://github.com/apache/spark/pull/37631), the current behavior of the split function is as follows:

```
select split('hello', 'h', 1)     // result is ["hello"]
select split('hello', '-', 1)     // result is ["hello"]
select split('hello', '', 1)      // result is ["h"]
select split('1A2A3A4', 'A', 3)   // result is ["1","2","3A4"]
select split('1A2A3A4', '', 3)    // result is ["1","A","2"]
```

However, according to the function's description, when the limit is greater than zero, the last element of the split result should contain the remaining part of the input string:

```
Arguments:
  * str - a string expression to split.
  * regex - a string representing a regular expression. The regex string should be
    a Java regular expression.
  * limit - an integer expression which controls the number of times the regex is applied.
      * limit > 0: The resulting array's length will not be more than `limit`, and the
        resulting array's last entry will contain all input beyond the last matched regex.
      * limit <= 0: `regex` will be applied as many times as possible, and the resulting
        array can be of any size.
```

So the split function produces incorrect results with an empty regex and a positive limit. The correct results should be:

```
select split('hello', '', 1)      // result is ["hello"]
select split('1A2A3A4', '', 3)    // result is ["1","A","2A3A4"]
```
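
For intuition, a minimal Scala sketch of the intended semantics for an empty regex with a limit (a hypothetical model on plain `String` characters; the actual fix lives in `UTF8String.split` and works on UTF-8 bytes):

```
// Hypothetical helper mirroring the corrected semantics; not the patch itself.
def splitEmptyRegex(s: String, limit: Int): Seq[String] =
  if (s.isEmpty) Seq("")
  else {
    val n = if (limit <= 0 || limit > s.length) s.length else limit
    // First n - 1 entries are single characters; the last keeps the remainder.
    (0 until (n - 1)).map(i => s.substring(i, i + 1)) :+ s.substring(n - 1)
  }

splitEmptyRegex("hello", 1)    // Seq("hello")
splitEmptyRegex("1A2A3A4", 3)  // Seq("1", "A", "2A3A4")
```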
### Why are the changes needed?

Fix a correctness issue.

### Does this PR introduce _any_ user-facing change?

Yes. When an empty regex is provided along with a limit greater than 0, the output of the split function changes.

Before this patch:

```
select split('hello', '', 1)      // result is ["h"]
select split('1A2A3A4', '', 3)    // result is ["1","A","2"]
```

After this patch:

```
select split('hello', '', 1)      // result is ["hello"]
select split('1A2A3A4', '', 3)    // result is ["1","A","2A3A4"]
```

### How was this patch tested?

Unit tests.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #48470 from DenineLu/fix_split_on_empty_regex.

Authored-by: DenineLu <denin...@163.com>
Signed-off-by: Wenchen Fan <wenc...@databricks.com>
---
 .../catalyst/util/CollationAwareUTF8String.java    |  5 +-
 .../spark/sql/catalyst/util/CollationSupport.java  |  6 +-
 .../org/apache/spark/unsafe/types/UTF8String.java  | 19 +++++++
 .../apache/spark/unsafe/types/UTF8StringSuite.java |  2 +-
 .../catalyst/expressions/complexTypeCreator.scala  | 13 +++--
 .../catalyst/expressions/regexpExpressions.scala   | 25 +++++++--
 .../org/apache/spark/sql/internal/SQLConf.scala    | 15 +++++
 .../CollationRegexpExpressionsSuite.scala          | 38 ++++++++-----
 .../expressions/RegexpExpressionsSuite.scala       |  2 +-
 .../nonansi/string-functions.sql.out               | 56 +++++++++++++++++++
 .../analyzer-results/string-functions.sql.out      | 56 +++++++++++++++++++
 .../sql-tests/inputs/string-functions.sql          |  8 +++
 .../results/nonansi/string-functions.sql.out       | 64 ++++++++++++++++++++++
 .../sql-tests/results/string-functions.sql.out     | 64 ++++++++++++++++++++++
 14 files changed, 344 insertions(+), 29 deletions(-)

diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
index e455e531de0d..2b9457c58560 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationAwareUTF8String.java
@@ -1529,9 +1529,10 @@ public class CollationAwareUTF8String {
   }
 
   public static UTF8String[] splitSQL(final UTF8String input, final UTF8String delim,
-      final int limit, final int collationId) {
+      final int limit, final int collationId, boolean legacySplitTruncate) {
     if (CollationFactory.fetchCollation(collationId).isUtf8BinaryType) {
-      return input.split(delim, limit);
+      return legacySplitTruncate ?
+        input.splitLegacyTruncate(delim, limit) : input.split(delim, limit);
     } else if (CollationFactory.fetchCollation(collationId).isUtf8LcaseType) {
       return lowercaseSplitSQL(input, delim, limit);
     } else {
diff --git a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
index 135250e482b1..f950fd864c57 100644
--- a/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
+++ b/common/unsafe/src/main/java/org/apache/spark/sql/catalyst/util/CollationSupport.java
@@ -706,8 +706,10 @@ public final class CollationSupport {
     public static UTF8String lowercaseRegex(final UTF8String regex) {
       return UTF8String.concat(lowercaseRegexPrefix, regex);
     }
-    public static UTF8String collationAwareRegex(final UTF8String regex, final int collationId) {
-      return supportsLowercaseRegex(collationId) ? lowercaseRegex(regex) : regex;
+    public static UTF8String collationAwareRegex(
+        final UTF8String regex, final int collationId, boolean notIgnoreEmpty) {
+      return supportsLowercaseRegex(collationId) && (notIgnoreEmpty || regex.numBytes() != 0)
+        ? lowercaseRegex(regex) : regex;
     }
 
   /**
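
Why `collationAwareRegex` needed a new parameter: `lowercaseRegex` works by concatenating a case-insensitivity prefix onto the pattern, and prefixing an empty regex would make it non-empty, presumably bypassing the per-character fast path that `UTF8String.split` uses for empty patterns (see the next hunk). A rough Scala rendering of the new guard, with simplified names (`prefix` stands in for `lowercaseRegexPrefix`):

```
// Hedged sketch of the guard in CollationSupport.collationAwareRegex: apply the
// lowercase prefix only when the collation calls for it AND either we are in
// legacy mode (notIgnoreEmpty) or the regex is non-empty.
def collationAwareRegex(
    regex: String, supportsLowercase: Boolean,
    notIgnoreEmpty: Boolean, prefix: String): String =
  if (supportsLowercase && (notIgnoreEmpty || regex.nonEmpty)) prefix + regex else regex
```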
diff --git a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
index caf8461b0b5d..2f03a7f69116 100644
--- a/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
+++ b/common/unsafe/src/main/java/org/apache/spark/unsafe/types/UTF8String.java
@@ -1483,6 +1483,25 @@ public final class UTF8String implements Comparable<UTF8String>, Externalizable,
   }
 
   public UTF8String[] split(UTF8String pattern, int limit) {
+    // For the empty `pattern` a `split` function ignores trailing empty strings unless original
+    // string is empty.
+    if (numBytes() != 0 && pattern.numBytes() == 0) {
+      int newLimit = limit > numChars() || limit <= 0 ? numChars() : limit;
+      byte[] input = getBytes();
+      int byteIndex = 0;
+      UTF8String[] result = new UTF8String[newLimit];
+      for (int charIndex = 0; charIndex < newLimit - 1; charIndex++) {
+        int currCharNumBytes = numBytesForFirstByte(input[byteIndex]);
+        result[charIndex] = UTF8String.fromBytes(input, byteIndex, currCharNumBytes);
+        byteIndex += currCharNumBytes;
+      }
+      result[newLimit - 1] = UTF8String.fromBytes(input, byteIndex, numBytes() - byteIndex);
+      return result;
+    }
+    return split(pattern.toString(), limit);
+  }
+
+  public UTF8String[] splitLegacyTruncate(UTF8String pattern, int limit) {
     // For the empty `pattern` a `split` function ignores trailing empty strings unless original
     // string is empty.
     if (numBytes() != 0 && pattern.numBytes() == 0) {
diff --git a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
index c4a66fdffdd4..19ec1461ec81 100644
--- a/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
+++ b/common/unsafe/src/test/java/org/apache/spark/unsafe/types/UTF8StringSuite.java
@@ -432,7 +432,7 @@ public class UTF8StringSuite {
       new UTF8String[]{fromString("a"), fromString("b")},
       fromString("ab").split(fromString(""), 100));
     assertArrayEquals(
-      new UTF8String[]{fromString("a")},
+      new UTF8String[]{fromString("ab")},
       fromString("ab").split(fromString(""), 1));
     assertArrayEquals(
       new UTF8String[]{fromString("")},
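
For reference, the new behavior is easy to check against `UTF8String` directly; a hedged Scala REPL sketch (assumes the patched `spark-unsafe` artifact on the classpath):

```
import org.apache.spark.unsafe.types.UTF8String

// Fixed behavior: the last entry keeps the remainder of the input.
UTF8String.fromString("1A2A3A4")
  .split(UTF8String.fromString(""), 3)
  .map(_.toString).toSeq              // Seq("1", "A", "2A3A4")

// The old truncating behavior survives under a new name.
UTF8String.fromString("1A2A3A4")
  .splitLegacyTruncate(UTF8String.fromString(""), 3)
  .map(_.toString).toSeq              // Seq("1", "A", "2")
```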
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
index fa6938ac4cf1..f4232cae1066 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/complexTypeCreator.scala
@@ -597,18 +597,21 @@ case class StringToMap(text: Expression, pairDelim: Expression, keyValueDelim: E
 
   private final lazy val collationId: Int = text.dataType.asInstanceOf[StringType].collationId
 
+  private lazy val legacySplitTruncate =
+    SQLConf.get.getConf(SQLConf.LEGACY_TRUNCATE_FOR_EMPTY_REGEX_SPLIT)
+
   override def nullSafeEval(
       inputString: Any,
       stringDelimiter: Any,
       keyValueDelimiter: Any): Any = {
     val keyValues = CollationAwareUTF8String.splitSQL(inputString.asInstanceOf[UTF8String],
-      stringDelimiter.asInstanceOf[UTF8String], -1, collationId)
+      stringDelimiter.asInstanceOf[UTF8String], -1, collationId, legacySplitTruncate)
     val keyValueDelimiterUTF8String = keyValueDelimiter.asInstanceOf[UTF8String]
 
     var i = 0
     while (i < keyValues.length) {
       val keyValueArray = CollationAwareUTF8String.splitSQL(
-        keyValues(i), keyValueDelimiterUTF8String, 2, collationId)
+        keyValues(i), keyValueDelimiterUTF8String, 2, collationId, legacySplitTruncate)
       val key = keyValueArray(0)
       val value = if (keyValueArray.length < 2) null else keyValueArray(1)
       mapBuilder.put(key, value)
@@ -623,9 +626,11 @@ case class StringToMap(text: Expression, pairDelim: Expression, keyValueDelim: E
 
     nullSafeCodeGen(ctx, ev, (text, pd, kvd) =>
       s"""
-         |UTF8String[] $keyValues = CollationAwareUTF8String.splitSQL($text, $pd, -1, $collationId);
+         |UTF8String[] $keyValues =
+         |  CollationAwareUTF8String.splitSQL($text, $pd, -1, $collationId, $legacySplitTruncate);
         |for(UTF8String kvEntry: $keyValues) {
-         |  UTF8String[] kv = CollationAwareUTF8String.splitSQL(kvEntry, $kvd, 2, $collationId);
+         |  UTF8String[] kv = CollationAwareUTF8String.splitSQL(
+         |    kvEntry, $kvd, 2, $collationId, $legacySplitTruncate);
          |  $builderTerm.put(kv[0], kv.length == 2 ? kv[1] : null);
          |}
          |${ev.value} = $builderTerm.build();
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
index c90a93a0b3a5..c6e5c480f3c2 100644
--- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
+++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/regexpExpressions.scala
@@ -37,8 +37,8 @@ import org.apache.spark.sql.catalyst.trees.BinaryLike
 import org.apache.spark.sql.catalyst.trees.TreePattern.{LIKE_FAMLIY, REGEXP_EXTRACT_FAMILY, REGEXP_REPLACE, TreePattern}
 import org.apache.spark.sql.catalyst.util.{CollationSupport, GenericArrayData, StringUtils}
 import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors}
-import org.apache.spark.sql.internal.types.{
-  StringTypeBinaryLcase, StringTypeWithCollation}
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.internal.types.{StringTypeBinaryLcase, StringTypeWithCollation}
 import org.apache.spark.sql.types._
 import org.apache.spark.unsafe.types.UTF8String
 
@@ -580,20 +580,33 @@ case class StringSplit(str: Expression, regex: Expression, limit: Expression)
 
   final lazy val collationId: Int = str.dataType.asInstanceOf[StringType].collationId
 
+  private lazy val legacySplitTruncate =
+    SQLConf.get.getConf(SQLConf.LEGACY_TRUNCATE_FOR_EMPTY_REGEX_SPLIT)
+
   def this(exp: Expression, regex: Expression) = this(exp, regex, Literal(-1))
 
   override def nullSafeEval(string: Any, regex: Any, limit: Any): Any = {
-    val pattern = CollationSupport.collationAwareRegex(regex.asInstanceOf[UTF8String], collationId)
-    val strings = string.asInstanceOf[UTF8String].split(pattern, limit.asInstanceOf[Int])
+    val pattern = CollationSupport.collationAwareRegex(
+      regex.asInstanceOf[UTF8String], collationId, legacySplitTruncate)
+    val strings = if (legacySplitTruncate) {
+      string.asInstanceOf[UTF8String].splitLegacyTruncate(pattern, limit.asInstanceOf[Int])
+    } else {
+      string.asInstanceOf[UTF8String].split(pattern, limit.asInstanceOf[Int])
+    }
     new GenericArrayData(strings.asInstanceOf[Array[Any]])
   }
 
   override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
     val arrayClass = classOf[GenericArrayData].getName
+    val pattern = ctx.freshName("pattern")
     nullSafeCodeGen(ctx, ev, (str, regex, limit) => {
       // Array in java is covariant, so we don't need to cast UTF8String[] to Object[].
-      s"""${ev.value} = new $arrayClass($str.split(
-        |CollationSupport.collationAwareRegex($regex, $collationId),$limit));""".stripMargin
+      s"""
+         |UTF8String $pattern =
+         |  CollationSupport.collationAwareRegex($regex, $collationId, $legacySplitTruncate);
+         |${ev.value} = new $arrayClass($legacySplitTruncate ?
+         |  $str.splitLegacyTruncate($pattern, $limit) : $str.split($pattern, $limit));
+         |""".stripMargin
     })
   }
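
The same flag drives both the interpreted path (`nullSafeEval`) and the generated code, so the two cannot disagree. A quick interpreted-path check (hedged: `StringSplit` is an internal Catalyst expression; this assumes `spark-catalyst` on the classpath and default conf values):

```
import org.apache.spark.sql.catalyst.expressions.{Literal, StringSplit}

// With spark.sql.legacy.truncateForEmptyRegexSplit at its default (false):
StringSplit(Literal("hello"), Literal(""), Literal(3)).eval()
// yields an array of ["h", "e", "llo"] rather than the old ["h", "e", "l"]
```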
- s"""${ev.value} = new $arrayClass($str.split( - |CollationSupport.collationAwareRegex($regex, $collationId),$limit));""".stripMargin + s""" + |UTF8String $pattern = + | CollationSupport.collationAwareRegex($regex, $collationId, $legacySplitTruncate); + |${ev.value} = new $arrayClass($legacySplitTruncate ? + | $str.splitLegacyTruncate($pattern, $limit) : $str.split($pattern, $limit)); + |""".stripMargin }) } diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala index ff919d2b278a..f4aea61f585f 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala @@ -6167,6 +6167,21 @@ object SQLConf { .booleanConf .createWithDefault(true) + val LEGACY_TRUNCATE_FOR_EMPTY_REGEX_SPLIT = + buildConf("spark.sql.legacy.truncateForEmptyRegexSplit") + .internal() + .doc("When set to true, splitting a string of length n using an empty regex with a " + + "positive limit discards the last n - limit characters." + + "For example: SELECT split('abcd', '', 2) returns ['a', 'b']." + + "When set to false, the last element of the resulting array contains all input beyond " + + "the last matched regex." + + "For example: SELECT split('abcd', '', 2) returns ['a', 'bcd']." + + "According to the description of the split function, this should be set to false by " + + "default. See SPARK-49968 for details.") + .version("4.1.0") + .booleanConf + .createWithDefault(false) + /** * Holds information about keys that have been deprecated. * diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationRegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationRegexpExpressionsSuite.scala index 2c1244eec365..3fc0d3b25a3d 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationRegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/CollationRegexpExpressionsSuite.scala @@ -57,25 +57,37 @@ class CollationRegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalH } test("StringSplit expression with collated strings") { - case class StringSplitTestCase[R](s: String, r: String, collation: String, expected: R) + case class StringSplitTestCase[R](s: String, r: String, collation: String, + expected: R, limit: Int) val testCases = Seq( - StringSplitTestCase("1A2B3C", "[ABC]", "UTF8_BINARY", Seq("1", "2", "3", "")), - StringSplitTestCase("1A2B3C", "[abc]", "UTF8_BINARY", Seq("1A2B3C")), - StringSplitTestCase("1A2B3C", "[ABC]", "UTF8_LCASE", Seq("1", "2", "3", "")), - StringSplitTestCase("1A2B3C", "[abc]", "UTF8_LCASE", Seq("1", "2", "3", "")), - StringSplitTestCase("1A2B3C", "[1-9]+", "UTF8_BINARY", Seq("", "A", "B", "C")), - StringSplitTestCase("", "", "UTF8_BINARY", Seq("")), - StringSplitTestCase("1A2B3C", "", "UTF8_BINARY", Seq("1", "A", "2", "B", "3", "C")), - StringSplitTestCase("", "[1-9]+", "UTF8_BINARY", Seq("")), - StringSplitTestCase(null, "[1-9]+", "UTF8_BINARY", null), - StringSplitTestCase("1A2B3C", null, "UTF8_BINARY", null), - StringSplitTestCase(null, null, "UTF8_BINARY", null) + StringSplitTestCase("1A2B3C", "[ABC]", "UTF8_BINARY", Seq("1", "2", "3", ""), -1), + StringSplitTestCase("1A2B3C", "[abc]", "UTF8_BINARY", Seq("1A2B3C"), -1), + StringSplitTestCase("1A2B3C", "[ABC]", "UTF8_LCASE", Seq("1", "2", "3", ""), -1), + 
StringSplitTestCase("1A2B3C", "[abc]", "UTF8_LCASE", Seq("1", "2", "3", ""), -1), + StringSplitTestCase("1A2B3C", "[1-9]+", "UTF8_BINARY", Seq("", "A", "B", "C"), -1), + StringSplitTestCase("", "", "UTF8_BINARY", Seq(""), -1), + StringSplitTestCase("1A2B3C", "", "UTF8_BINARY", Seq("1", "A", "2", "B", "3", "C"), -1), + StringSplitTestCase("1A2B3C", "", "UTF8_LCASE", Seq("1", "A", "2", "B", "3", "C"), -1), + StringSplitTestCase("1A2B3C", "", "UTF8_BINARY", Seq("1", "A", "2", "B", "3", "C"), 0), + StringSplitTestCase("1A2B3C", "", "UTF8_LCASE", Seq("1", "A", "2", "B", "3", "C"), 0), + StringSplitTestCase("1A2B3C", "", "UTF8_BINARY", Seq("1A2B3C"), 1), + StringSplitTestCase("1A2B3C", "", "UTF8_LCASE", Seq("1A2B3C"), 1), + StringSplitTestCase("1A2B3C", "", "UTF8_BINARY", Seq("1", "A", "2B3C"), 3), + StringSplitTestCase("1A2B3C", "", "UTF8_LCASE", Seq("1", "A", "2B3C"), 3), + StringSplitTestCase("1A2B3C", "", "UTF8_BINARY", Seq("1", "A", "2", "B", "3", "C"), 6), + StringSplitTestCase("1A2B3C", "", "UTF8_LCASE", Seq("1", "A", "2", "B", "3", "C"), 6), + StringSplitTestCase("1A2B3C", "", "UTF8_BINARY", Seq("1", "A", "2", "B", "3", "C"), 100), + StringSplitTestCase("1A2B3C", "", "UTF8_LCASE", Seq("1", "A", "2", "B", "3", "C"), 100), + StringSplitTestCase("", "[1-9]+", "UTF8_BINARY", Seq(""), -1), + StringSplitTestCase(null, "[1-9]+", "UTF8_BINARY", null, -1), + StringSplitTestCase("1A2B3C", null, "UTF8_BINARY", null, -1), + StringSplitTestCase(null, null, "UTF8_BINARY", null, -1) ) testCases.foreach(t => { // StringSplit checkEvaluation(StringSplit( Literal.create(t.s, StringType(CollationFactory.collationNameToId(t.collation))), - Literal.create(t.r, StringType), -1), t.expected) + Literal.create(t.r, StringType), t.limit), t.expected) }) } diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala index 12aeb7d6685b..e1654b9456f8 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/RegexpExpressionsSuite.scala @@ -564,7 +564,7 @@ class RegexpExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { checkEvaluation( StringSplit(Literal("hello"), Literal(""), 5), Seq("h", "e", "l", "l", "o"), row1) checkEvaluation( - StringSplit(Literal("hello"), Literal(""), 3), Seq("h", "e", "l"), row1) + StringSplit(Literal("hello"), Literal(""), 3), Seq("h", "e", "llo"), row1) checkEvaluation( StringSplit(Literal("hello"), Literal(""), 100), Seq("h", "e", "l", "l", "o"), row1) checkEvaluation( diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/nonansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/nonansi/string-functions.sql.out index ee4ad922fa8a..b352d0edf896 100644 --- a/sql/core/src/test/resources/sql-tests/analyzer-results/nonansi/string-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/analyzer-results/nonansi/string-functions.sql.out @@ -128,6 +128,41 @@ Project [split(hello, , -1) AS split(hello, , -1)#x] +- OneRowRelation +-- !query +SELECT split('hello', '', 0) +-- !query analysis +Project [split(hello, , 0) AS split(hello, , 0)#x] ++- OneRowRelation + + +-- !query +SELECT split('hello', '', 1) +-- !query analysis +Project [split(hello, , 1) AS split(hello, , 1)#x] ++- OneRowRelation + + +-- !query +SELECT split('hello', '', 3) +-- !query 
diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/nonansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/nonansi/string-functions.sql.out
index ee4ad922fa8a..b352d0edf896 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/nonansi/string-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/nonansi/string-functions.sql.out
@@ -128,6 +128,41 @@ Project [split(hello, , -1) AS split(hello, , -1)#x]
 +- OneRowRelation
 
 
+-- !query
+SELECT split('hello', '', 0)
+-- !query analysis
+Project [split(hello, , 0) AS split(hello, , 0)#x]
++- OneRowRelation
+
+
+-- !query
+SELECT split('hello', '', 1)
+-- !query analysis
+Project [split(hello, , 1) AS split(hello, , 1)#x]
++- OneRowRelation
+
+
+-- !query
+SELECT split('hello', '', 3)
+-- !query analysis
+Project [split(hello, , 3) AS split(hello, , 3)#x]
++- OneRowRelation
+
+
+-- !query
+SELECT split('hello', '', 5)
+-- !query analysis
+Project [split(hello, , 5) AS split(hello, , 5)#x]
++- OneRowRelation
+
+
+-- !query
+SELECT split('hello', '', 100)
+-- !query analysis
+Project [split(hello, , 100) AS split(hello, , 100)#x]
++- OneRowRelation
+
+
 -- !query
 SELECT split('', '')
 -- !query analysis
@@ -135,6 +170,27 @@ Project [split(, , -1) AS split(, , -1)#x]
 +- OneRowRelation
 
 
+-- !query
+SELECT split('', '', -1)
+-- !query analysis
+Project [split(, , -1) AS split(, , -1)#x]
++- OneRowRelation
+
+
+-- !query
+SELECT split('', '', 0)
+-- !query analysis
+Project [split(, , 0) AS split(, , 0)#x]
++- OneRowRelation
+
+
+-- !query
+SELECT split('', '', 1)
+-- !query analysis
+Project [split(, , 1) AS split(, , 1)#x]
++- OneRowRelation
+
+
 -- !query
 SELECT split('abc', null)
 -- !query analysis
diff --git a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
index ee4ad922fa8a..b352d0edf896 100644
--- a/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/analyzer-results/string-functions.sql.out
@@ -128,6 +128,41 @@ Project [split(hello, , -1) AS split(hello, , -1)#x]
 +- OneRowRelation
 
 
+-- !query
+SELECT split('hello', '', 0)
+-- !query analysis
+Project [split(hello, , 0) AS split(hello, , 0)#x]
++- OneRowRelation
+
+
+-- !query
+SELECT split('hello', '', 1)
+-- !query analysis
+Project [split(hello, , 1) AS split(hello, , 1)#x]
++- OneRowRelation
+
+
+-- !query
+SELECT split('hello', '', 3)
+-- !query analysis
+Project [split(hello, , 3) AS split(hello, , 3)#x]
++- OneRowRelation
+
+
+-- !query
+SELECT split('hello', '', 5)
+-- !query analysis
+Project [split(hello, , 5) AS split(hello, , 5)#x]
++- OneRowRelation
+
+
+-- !query
+SELECT split('hello', '', 100)
+-- !query analysis
+Project [split(hello, , 100) AS split(hello, , 100)#x]
++- OneRowRelation
+
+
 -- !query
 SELECT split('', '')
 -- !query analysis
@@ -135,6 +170,27 @@ Project [split(, , -1) AS split(, , -1)#x]
 +- OneRowRelation
 
 
+-- !query
+SELECT split('', '', -1)
+-- !query analysis
+Project [split(, , -1) AS split(, , -1)#x]
++- OneRowRelation
+
+
+-- !query
+SELECT split('', '', 0)
+-- !query analysis
+Project [split(, , 0) AS split(, , 0)#x]
++- OneRowRelation
+
+
+-- !query
+SELECT split('', '', 1)
+-- !query analysis
+Project [split(, , 1) AS split(, , 1)#x]
++- OneRowRelation
+
+
 -- !query
 SELECT split('abc', null)
 -- !query analysis
diff --git a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
index 878fa992f81b..7559c45ec103 100644
--- a/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
+++ b/sql/core/src/test/resources/sql-tests/inputs/string-functions.sql
@@ -27,7 +27,15 @@ select right("abcd", -2), right("abcd", 0), right("abcd", 'a');
 
 SELECT split('aa1cc2ee3', '[1-9]+');
 SELECT split('aa1cc2ee3', '[1-9]+', 2);
 SELECT split('hello', '');
+SELECT split('hello', '', 0);
+SELECT split('hello', '', 1);
+SELECT split('hello', '', 3);
+SELECT split('hello', '', 5);
+SELECT split('hello', '', 100);
 SELECT split('', '');
+SELECT split('', '', -1);
+SELECT split('', '', 0);
+SELECT split('', '', 1);
 SELECT split('abc', null);
 SELECT split(null, 'b');
diff --git a/sql/core/src/test/resources/sql-tests/results/nonansi/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/nonansi/string-functions.sql.out
index bb4d615deb29..2c968a7b2f77 100644
--- a/sql/core/src/test/resources/sql-tests/results/nonansi/string-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/nonansi/string-functions.sql.out
@@ -147,6 +147,46 @@ struct<split(hello, , -1):array<string>>
 ["h","e","l","l","o"]
 
 
+-- !query
+SELECT split('hello', '', 0)
+-- !query schema
+struct<split(hello, , 0):array<string>>
+-- !query output
+["h","e","l","l","o"]
+
+
+-- !query
+SELECT split('hello', '', 1)
+-- !query schema
+struct<split(hello, , 1):array<string>>
+-- !query output
+["hello"]
+
+
+-- !query
+SELECT split('hello', '', 3)
+-- !query schema
+struct<split(hello, , 3):array<string>>
+-- !query output
+["h","e","llo"]
+
+
+-- !query
+SELECT split('hello', '', 5)
+-- !query schema
+struct<split(hello, , 5):array<string>>
+-- !query output
+["h","e","l","l","o"]
+
+
+-- !query
+SELECT split('hello', '', 100)
+-- !query schema
+struct<split(hello, , 100):array<string>>
+-- !query output
+["h","e","l","l","o"]
+
+
 -- !query
 SELECT split('', '')
 -- !query schema
@@ -155,6 +195,30 @@ struct<split(, , -1):array<string>>
 [""]
 
 
+-- !query
+SELECT split('', '', -1)
+-- !query schema
+struct<split(, , -1):array<string>>
+-- !query output
+[""]
+
+
+-- !query
+SELECT split('', '', 0)
+-- !query schema
+struct<split(, , 0):array<string>>
+-- !query output
+[""]
+
+
+-- !query
+SELECT split('', '', 1)
+-- !query schema
+struct<split(, , 1):array<string>>
+-- !query output
+[""]
+
+
 -- !query
 SELECT split('abc', null)
 -- !query schema
diff --git a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
index 34a560d771e8..1d706cba88c9 100644
--- a/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
+++ b/sql/core/src/test/resources/sql-tests/results/string-functions.sql.out
@@ -181,6 +181,46 @@ struct<split(hello, , -1):array<string>>
 ["h","e","l","l","o"]
 
 
+-- !query
+SELECT split('hello', '', 0)
+-- !query schema
+struct<split(hello, , 0):array<string>>
+-- !query output
+["h","e","l","l","o"]
+
+
+-- !query
+SELECT split('hello', '', 1)
+-- !query schema
+struct<split(hello, , 1):array<string>>
+-- !query output
+["hello"]
+
+
+-- !query
+SELECT split('hello', '', 3)
+-- !query schema
+struct<split(hello, , 3):array<string>>
+-- !query output
+["h","e","llo"]
+
+
+-- !query
+SELECT split('hello', '', 5)
+-- !query schema
+struct<split(hello, , 5):array<string>>
+-- !query output
+["h","e","l","l","o"]
+
+
+-- !query
+SELECT split('hello', '', 100)
+-- !query schema
+struct<split(hello, , 100):array<string>>
+-- !query output
+["h","e","l","l","o"]
+
+
 -- !query
 SELECT split('', '')
 -- !query schema
@@ -189,6 +229,30 @@ struct<split(, , -1):array<string>>
 [""]
 
 
+-- !query
+SELECT split('', '', -1)
+-- !query schema
+struct<split(, , -1):array<string>>
+-- !query output
+[""]
+
+
+-- !query
+SELECT split('', '', 0)
+-- !query schema
+struct<split(, , 0):array<string>>
+-- !query output
+[""]
+
+
+-- !query
+SELECT split('', '', 1)
+-- !query schema
+struct<split(, , 1):array<string>>
+-- !query output
+[""]
+
+
 -- !query
 SELECT split('abc', null)
 -- !query schema

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@spark.apache.org
For additional commands, e-mail: commits-h...@spark.apache.org