This is an automated email from the ASF dual-hosted git repository.
jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 61009f66dfe Optimize splitPart scalar function to reduce allocation
(#17660)
61009f66dfe is described below
commit 61009f66dfeb52a69d5762273ea928b72cf834e2
Author: Sai Valla <[email protected]>
AuthorDate: Thu Mar 26 15:28:06 2026 -0700
Optimize splitPart scalar function to reduce allocation (#17660)
---
.../common/function/scalar/StringFunctions.java | 131 +++++++++++++++--
.../function/scalar/StringFunctionsTest.java | 112 +++++++++++++++
.../org/apache/pinot/perf/BenchmarkSplitPart.java | 160 +++++++++++++++++++++
3 files changed, 395 insertions(+), 8 deletions(-)
diff --git
a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
index 03d8d1aa2d0..3127ea31b39 100644
---
a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
+++
b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
@@ -18,6 +18,7 @@
*/
package org.apache.pinot.common.function.scalar;
+import com.google.common.annotations.VisibleForTesting;
import java.io.UnsupportedEncodingException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
@@ -578,21 +579,135 @@ public class StringFunctions {
/**
* TODO: Revisit if index should be one-based (both Presto and Postgres use
one-based index, which starts with 1)
- * @param input
- * @param delimiter
+ * @param input the input String to be split into parts.
+ * @param delimiter the specified delimiter to split the input string.
* @param index we allow negative value for index which indicates the index
from the end.
* @return splits string on specified delimiter and returns String at
specified index from the split.
*/
@ScalarFunction
public static String splitPart(String input, String delimiter, int index) {
- String[] splitString = StringUtils.splitByWholeSeparator(input, delimiter);
- if (index >= 0 && index < splitString.length) {
- return splitString[index];
- } else if (index < 0 && index >= -splitString.length) {
- return splitString[splitString.length + index];
- } else {
+ // compare with BenchmarkSplitPart (perf module) for future changes
+ int delimLen = delimiter.length();
+ if (delimLen == 0) {
+ // Empty delimiter splits on whitespace (preserving existing behavior
via Apache Commons)
+ return splitPartArrayBased(StringUtils.splitByWholeSeparator(input,
delimiter), index);
+ }
+
+ int len = input.length();
+ if (len == 0) {
return "null";
}
+
+ // skip leading delimiters
+ int start = 0;
+ while (start < len && input.startsWith(delimiter, start)) {
+ start += delimLen;
+ }
+ // Guard against Integer.MIN_VALUE since negating it overflows (remains
negative)
+ if (index == Integer.MIN_VALUE) {
+ return "null";
+ }
+ // optimization for negative index with single-char delimiter since common
case
+ // multi-char delimiter with negative index can be handled in future since
less common and more complex
+ if (index < 0 && delimLen == 1) {
+ return splitPartNegativeIdxSingleCharDelim(input, delimiter.charAt(0),
-index, len, start);
+ }
+
+ // convert negative index to positive by counting total fields
+ int adjustedIndex = index;
+ if (adjustedIndex < 0) {
+ int totalFields = 0;
+ int pos = start;
+ while (pos <= len) {
+ totalFields++;
+ int end = input.indexOf(delimiter, pos);
+ if (end == -1) {
+ break;
+ }
+ pos = end + delimLen;
+ while (pos < len && input.startsWith(delimiter, pos)) {
+ pos += delimLen;
+ }
+ }
+ adjustedIndex = totalFields + index;
+ if (adjustedIndex < 0) {
+ return "null";
+ }
+ }
+
+ for (int i = 0; i < adjustedIndex; i++) {
+ int end = input.indexOf(delimiter, start);
+ if (end == -1) {
+ return "null";
+ }
+ start = end + delimLen;
+ // skip consecutive delimiters
+ while (start < len && input.startsWith(delimiter, start)) {
+ start += delimLen;
+ }
+ }
+
+ int end = input.indexOf(delimiter, start);
+ return end == -1 ? input.substring(start) : input.substring(start, end);
+ }
+
+ private static String splitPartNegativeIdxSingleCharDelim(
+ String input, char delimiter, int index, int len, int start) {
+ // input is empty or contains only delimiters
+ if (start == len) {
+ return index == 1 ? "" : "null";
+ }
+
+ // scan backwards and handle trailing delimiters
+ int end = len;
+ while (end > start && input.charAt(end - 1) == delimiter) {
+ end--;
+ }
+
+ // handle trailing delimiters
+ int resultIdx = index;
+ if (end < len) {
+ if (index == 1) {
+ return "";
+ }
+ resultIdx--;
+ }
+
+ int curEnd = end;
+ for (int i = 1; i <= resultIdx; i++) {
+ // handle left out of bound index
+ if (curEnd <= start) {
+ return "null";
+ }
+
+ // scan left until no delimiter
+ int curStart = curEnd - 1;
+ while (curStart >= start && input.charAt(curStart) != delimiter) {
+ curStart--;
+ }
+
+ if (i == resultIdx) {
+ return input.substring(curStart + 1, curEnd);
+ }
+
+ // skip consecutive delimiters
+ curEnd = curStart;
+ while (curEnd > start && input.charAt(curEnd - 1) == delimiter) {
+ curEnd--;
+ }
+ }
+
+ return "null";
+ }
+
+ @VisibleForTesting
+ static String splitPartArrayBased(String[] parts, int index) {
+ if (index >= 0 && index < parts.length) {
+ return parts[index];
+ } else if (index < 0 && index >= -parts.length) {
+ return parts[parts.length + index];
+ }
+ return "null";
}
/**
diff --git
a/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java
b/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java
index 523c4971485..7950313eab8 100644
---
a/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java
+++
b/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java
@@ -18,6 +18,8 @@
*/
package org.apache.pinot.common.function.scalar;
+import java.util.Random;
+import org.apache.commons.lang3.StringUtils;
import org.testng.annotations.DataProvider;
import org.testng.annotations.Test;
@@ -42,6 +44,8 @@ public class StringFunctionsTest {
{"org.apache.pinot.common.function", ".", 3, 3, "common", "null"},
{"+++++", "+", 0, 100, "", ""},
{"+++++", "+", 1, 100, "null", "null"},
+ {"+++++org++apache++", "", 1, 100, "null", "null"},
+ {"+++++org++apache++", "", 0, 100, "+++++org++apache++",
"+++++org++apache++"},
// note that splitPart will split with limit first, then lookup by
index from START or END.
{"org.apache.pinot.common.function", ".", -1, 100, "function",
"function"},
{"org.apache.pinot.common.function", ".", -10, 100, "null", "null"},
@@ -64,6 +68,89 @@ public class StringFunctionsTest {
{"org.apache.pinot.common.function", ".", -6, 6, "null", "null"},
{"+++++", "+", -1, 100, "", ""},
{"+++++", "+", -2, 100, "null", "null"},
+
+ // Empty delimiter: index=-1 returns input, other negative indices
return "null"
+ {"hello", "", -1, 100, "hello", "hello"},
+ {"hello", "", -2, 100, "null", "null"},
+
+ // Single field with no delimiter present
+ {"abc", ".", 0, 100, "abc", "abc"},
+ {"abc", ".", 1, 100, "null", "null"},
+ {"abc", ".", -1, 100, "abc", "abc"},
+ {"abc", ".", -2, 100, "null", "null"},
+
+ // Input equals delimiter
+ {".", ".", 0, 100, "", ""},
+ {".", ".", 1, 100, "null", "null"},
+ {".", ".", -1, 100, "", ""},
+
+ // Trailing delimiters with content (single-char): exercises
splitPartNegativeIdxSingleCharDelim
+ // trailing-delimiter handling (resultIdx decrement, empty trailing
field)
+ {"org++apache++", "+", 0, 100, "org", "org"},
+ {"org++apache++", "+", 1, 100, "apache", "apache"},
+ {"org++apache++", "+", 2, 100, "", ""},
+ {"org++apache++", "+", 3, 100, "null", "null"},
+ {"org++apache++", "+", -1, 100, "", ""},
+ {"org++apache++", "+", -2, 100, "apache", "apache"},
+ {"org++apache++", "+", -3, 100, "org", "org"},
+ {"org++apache++", "+", -4, 100, "null", "null"},
+
+ // Leading AND trailing delimiters (single-char): exercises backward
scan with
+ // both leading delimiter skip and trailing delimiter adjustment
+ {"++org++apache++", "+", 0, 100, "org", "org"},
+ {"++org++apache++", "+", 1, 100, "apache", "apache"},
+ {"++org++apache++", "+", -1, 100, "", ""},
+ {"++org++apache++", "+", -2, 100, "apache", "apache"},
+ {"++org++apache++", "+", -3, 100, "org", "org"},
+ {"++org++apache++", "+", -4, 100, "null", "null"},
+
+ // Single field surrounded by delimiters
+ {"++abc++", "+", 0, 100, "abc", "abc"},
+ {"++abc++", "+", -1, 100, "", ""},
+ {"++abc++", "+", -2, 100, "abc", "abc"},
+ {"++abc++", "+", -3, 100, "null", "null"},
+
+ // Multi-char delimiter: exercises forward scan and multi-char
negative index
+ // path (totalFields counting + adjustedIndex conversion) which is
separate from
+ // the single-char optimized path
+ {"org::apache::pinot", "::", 0, 100, "org", "org"},
+ {"org::apache::pinot", "::", 1, 100, "apache", "apache"},
+ {"org::apache::pinot", "::", 2, 100, "pinot", "pinot"},
+ {"org::apache::pinot", "::", 3, 100, "null", "null"},
+ {"org::apache::pinot", "::", -1, 100, "pinot", "pinot"},
+ {"org::apache::pinot", "::", -2, 100, "apache", "apache"},
+ {"org::apache::pinot", "::", -3, 100, "org", "org"},
+ {"org::apache::pinot", "::", -4, 100, "null", "null"},
+
+ // Multi-char delimiter with consecutive delimiters: exercises the
consecutive
+ // delimiter skip in both leading-skip loop and the totalFields
counting loop
+ {"::::org::::apache", "::", 0, 100, "org", "org"},
+ {"::::org::::apache", "::", 1, 100, "apache", "apache"},
+ {"::::org::::apache", "::", 2, 100, "null", "null"},
+ {"::::org::::apache", "::", -1, 100, "apache", "apache"},
+ {"::::org::::apache", "::", -2, 100, "org", "org"},
+ {"::::org::::apache", "::", -3, 100, "null", "null"},
+
+ // Multi-char delimiter with leading AND trailing delimiters:
exercises the
+ // trailing empty field in the multi-char totalFields counting path
+ {"::org::apache::", "::", 0, 100, "org", "org"},
+ {"::org::apache::", "::", 1, 100, "apache", "apache"},
+ {"::org::apache::", "::", 2, 100, "", ""},
+ {"::org::apache::", "::", -1, 100, "", ""},
+ {"::org::apache::", "::", -2, 100, "apache", "apache"},
+ {"::org::apache::", "::", -3, 100, "org", "org"},
+ {"::org::apache::", "::", -4, 100, "null", "null"},
+
+ // Empty input with non-empty delimiter
+ {"", ".", 0, 100, "null", "null"},
+ {"", ".", -1, 100, "null", "null"},
+ {"", ".", -2, 100, "null", "null"},
+
+ // Empty input with multi-char delimiter and negative index
+ {"", "::", -1, 100, "null", "null"},
+
+ // Integer.MIN_VALUE: negating it overflows (remains negative), guard
must return "null"
+ {"org.apache.pinot", ".", Integer.MIN_VALUE, 100, "null", "null"},
};
}
@@ -204,6 +291,31 @@ public class StringFunctionsTest {
assertEquals(StringFunctions.splitPart(input, delimiter, limit, index),
expectedTokenWithLimitCounts);
}
+ @Test
+ public void testSplitPartRandomized() {
+ String chars = "abcdefg.,:;+-_/";
+ String[] delimiters = {".", ",", ":", "::", "++", "ab", "///"};
+ Random random = new Random();
+ int numIterations = 10_000;
+
+ for (int iter = 0; iter < numIterations; iter++) {
+ int len = random.nextInt(50);
+ StringBuilder sb = new StringBuilder(len);
+ for (int i = 0; i < len; i++) {
+ sb.append(chars.charAt(random.nextInt(chars.length())));
+ }
+ String input = sb.toString();
+ String delimiter = delimiters[random.nextInt(delimiters.length)];
+ int index = random.nextInt(21) - 10; // range [-10, 10]
+
+ String expected = StringFunctions.splitPartArrayBased(
+ StringUtils.splitByWholeSeparator(input, delimiter), index);
+ String actual = StringFunctions.splitPart(input, delimiter, index);
+ assertEquals(actual, expected,
+ String.format("Mismatch for input='%s', delimiter='%s', index=%d",
input, delimiter, index));
+ }
+ }
+
@Test(dataProvider = "prefixAndSuffixTestCases")
public void testPrefixAndSuffix(String input, int length, String[]
expectedPrefix, String[] expectedSuffix,
String[] expectedPrefixWithRegexChar, String[]
expectedSuffixWithRegexChar) {
diff --git
a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkSplitPart.java
b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkSplitPart.java
new file mode 100644
index 00000000000..a2dce8ca072
--- /dev/null
+++ b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkSplitPart.java
@@ -0,0 +1,160 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.perf;
+
+import java.util.concurrent.TimeUnit;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.pinot.common.function.scalar.StringFunctions;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.options.ChainedOptionsBuilder;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Fork(1)
+@Warmup(iterations = 5, time = 1)
+@Measurement(iterations = 10, time = 1)
+@State(Scope.Benchmark)
+public class BenchmarkSplitPart {
+
+ @Param({
+ "small_index", "large_index",
+ "negative_index", "large_negative_index",
+ "adjacent_delimiters", "many_fields",
+ "multi_char_delim", "large_multi_char_delim"
+ })
+ public String _scenario;
+
+ private String _input;
+ private String _delimiter;
+ private int _index;
+
+ @Setup
+ public void setUp() {
+ switch (_scenario) {
+ case "small_index":
+ // Extract 2nd element from 100-field string
+ _delimiter = ",";
+ _input = buildString(100, _delimiter);
+ _index = 1;
+ break;
+ case "large_index":
+ // Extract 790th element from 10000-field string
+ _delimiter = ",";
+ _input = buildString(10000, _delimiter);
+ _index = 790;
+ break;
+ case "negative_index":
+ // Extract last element from 100-field string
+ _delimiter = ",";
+ _input = buildString(100, _delimiter);
+ _index = -1;
+ break;
+ case "large_negative_index":
+ // Extract 7242nd-from-last element from 10000-field string
+ _delimiter = ",";
+ _input = buildString(10000, _delimiter);
+ _index = -7242;
+ break;
+ case "adjacent_delimiters":
+ // String with many consecutive delimiters
+ _input = "field0+++field1+++field2+++field3";
+ _delimiter = "+";
+ _index = 1;
+ break;
+ case "many_fields":
+ // Extract 5th element from 10000-field string
+ _delimiter = ",";
+ _input = buildString(10000, _delimiter);
+ _index = 4;
+ break;
+ case "multi_char_delim":
+ // Multi-character delimiter
+ _delimiter = repeatStr(":", 10);
+ _input = buildString(50, _delimiter);
+ _index = 10;
+ break;
+ case "large_multi_char_delim":
+ // Multi-character delimiter
+ _delimiter = repeatStr(":", 4242);
+ _input = buildString(10000, _delimiter);
+ _index = 10;
+ break;
+ default:
+ throw new IllegalArgumentException("Unknown scenario: " + _scenario);
+ }
+ }
+
+ private String repeatStr(String str, int length) {
+ return String.valueOf(str).repeat(Math.max(0, length));
+ }
+
+ private String buildString(int fields, String delimiter) {
+ StringBuilder sb = new StringBuilder();
+ for (int i = 0; i < fields; i++) {
+ if (i > 0) {
+ sb.append(delimiter);
+ }
+ sb.append("field").append(i);
+ }
+ return sb.toString();
+ }
+
+ public static void main(String[] args)
+ throws Exception {
+ ChainedOptionsBuilder opt = new
OptionsBuilder().include(BenchmarkSplitPart.class.getSimpleName());
+ new Runner(opt.build()).run();
+ }
+
+ @Benchmark
+ public String testSplitPartOld() {
+ return splitPartOld(_input, _delimiter, _index);
+ }
+
+ @Benchmark
+ public String testSplitPartNew() {
+ return StringFunctions.splitPart(_input, _delimiter, _index);
+ }
+
+ /**
+ * Original implementation that allocates full String array
+ */
+ public static String splitPartOld(String input, String delimiter, int index)
{
+ String[] splitString = StringUtils.splitByWholeSeparator(input, delimiter);
+ if (index >= 0 && index < splitString.length) {
+ return splitString[index];
+ } else if (index < 0 && index >= -splitString.length) {
+ return splitString[splitString.length + index];
+ } else {
+ return "null";
+ }
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]