(pinot) branch master updated: Optimize splitPart scalar function to reduce allocation (#17660)

jackie Thu, 26 Mar 2026 15:28:18 -0700

This is an automated email from the ASF dual-hosted git repository.

jackie pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git



The following commit(s) were added to refs/heads/master by this push:
     new 61009f66dfe Optimize splitPart scalar function to reduce allocation 
(#17660)
61009f66dfe is described below

commit 61009f66dfeb52a69d5762273ea928b72cf834e2
Author: Sai Valla <[email protected]>
AuthorDate: Thu Mar 26 15:28:06 2026 -0700

    Optimize splitPart scalar function to reduce allocation (#17660)
---
 .../common/function/scalar/StringFunctions.java    | 131 +++++++++++++++--
 .../function/scalar/StringFunctionsTest.java       | 112 +++++++++++++++
 .../org/apache/pinot/perf/BenchmarkSplitPart.java  | 160 +++++++++++++++++++++
 3 files changed, 395 insertions(+), 8 deletions(-)

diff --git 
a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
 
b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
index 03d8d1aa2d0..3127ea31b39 100644
--- 
a/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
+++ 
b/pinot-common/src/main/java/org/apache/pinot/common/function/scalar/StringFunctions.java
@@ -18,6 +18,7 @@
  */
 package org.apache.pinot.common.function.scalar;
 
+import com.google.common.annotations.VisibleForTesting;
 import java.io.UnsupportedEncodingException;
 import java.nio.ByteBuffer;
 import java.nio.charset.StandardCharsets;
@@ -578,21 +579,135 @@ public class StringFunctions {
 
   /**
    * TODO: Revisit if index should be one-based (both Presto and Postgres use 
one-based index, which starts with 1)
-   * @param input
-   * @param delimiter
+   * @param input the input String to be split into parts.
+   * @param delimiter the specified delimiter to split the input string.
    * @param index we allow negative value for index which indicates the index 
from the end.
    * @return splits string on specified delimiter and returns String at 
specified index from the split.
    */
   @ScalarFunction
   public static String splitPart(String input, String delimiter, int index) {
-    String[] splitString = StringUtils.splitByWholeSeparator(input, delimiter);
-    if (index >= 0 && index < splitString.length) {
-      return splitString[index];
-    } else if (index < 0 && index >= -splitString.length) {
-      return splitString[splitString.length + index];
-    } else {
+    // compare with BenchmarkSplitPart (perf module) for future changes
+    int delimLen = delimiter.length();
+    if (delimLen == 0) {
+      // Empty delimiter splits on whitespace (preserving existing behavior 
via Apache Commons)
+      return splitPartArrayBased(StringUtils.splitByWholeSeparator(input, 
delimiter), index);
+    }
+
+    int len = input.length();
+    if (len == 0) {
       return "null";
     }
+
+    // skip leading delimiters
+    int start = 0;
+    while (start < len && input.startsWith(delimiter, start)) {
+      start += delimLen;
+    }
+    // Guard against Integer.MIN_VALUE since negating it overflows (remains 
negative)
+    if (index == Integer.MIN_VALUE) {
+      return "null";
+    }
+    // optimization for negative index with single-char delimiter since common 
case
+    // multi-char delimiter with negative index can be handled in future since 
less common and more complex
+    if (index < 0 && delimLen == 1) {
+      return splitPartNegativeIdxSingleCharDelim(input, delimiter.charAt(0), 
-index, len, start);
+    }
+
+    // convert negative index to positive by counting total fields
+    int adjustedIndex = index;
+    if (adjustedIndex < 0) {
+      int totalFields = 0;
+      int pos = start;
+      while (pos <= len) {
+        totalFields++;
+        int end = input.indexOf(delimiter, pos);
+        if (end == -1) {
+          break;
+        }
+        pos = end + delimLen;
+        while (pos < len && input.startsWith(delimiter, pos)) {
+          pos += delimLen;
+        }
+      }
+      adjustedIndex = totalFields + index;
+      if (adjustedIndex < 0) {
+        return "null";
+      }
+    }
+
+    for (int i = 0; i < adjustedIndex; i++) {
+      int end = input.indexOf(delimiter, start);
+      if (end == -1) {
+        return "null";
+      }
+      start = end + delimLen;
+      // skip consecutive delimiters
+      while (start < len && input.startsWith(delimiter, start)) {
+        start += delimLen;
+      }
+    }
+
+    int end = input.indexOf(delimiter, start);
+    return end == -1 ? input.substring(start) : input.substring(start, end);
+  }
+
+  private static String splitPartNegativeIdxSingleCharDelim(
+      String input, char delimiter, int index, int len, int start) {
+    // input is empty or contains only delimiters
+    if (start == len) {
+      return index == 1 ? "" : "null";
+    }
+
+    // scan backwards and handle trailing delimiters
+    int end = len;
+    while (end > start && input.charAt(end - 1) == delimiter) {
+      end--;
+    }
+
+    // handle trailing delimiters
+    int resultIdx = index;
+    if (end < len) {
+      if (index == 1) {
+        return "";
+      }
+      resultIdx--;
+    }
+
+    int curEnd = end;
+    for (int i = 1; i <= resultIdx; i++) {
+      // handle left out of bound index
+      if (curEnd <= start) {
+        return "null";
+      }
+
+      // scan left until no delimiter
+      int curStart = curEnd - 1;
+      while (curStart >= start && input.charAt(curStart) != delimiter) {
+        curStart--;
+      }
+
+      if (i == resultIdx) {
+        return input.substring(curStart + 1, curEnd);
+      }
+
+      // skip consecutive delimiters
+      curEnd = curStart;
+      while (curEnd > start && input.charAt(curEnd - 1) == delimiter) {
+        curEnd--;
+      }
+    }
+
+    return "null";
+  }
+
+  @VisibleForTesting
+  static String splitPartArrayBased(String[] parts, int index) {
+    if (index >= 0 && index < parts.length) {
+      return parts[index];
+    } else if (index < 0 && index >= -parts.length) {
+      return parts[parts.length + index];
+    }
+    return "null";
   }
 
   /**
diff --git 
a/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java
 
b/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java
index 523c4971485..7950313eab8 100644
--- 
a/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java
+++ 
b/pinot-common/src/test/java/org/apache/pinot/common/function/scalar/StringFunctionsTest.java
@@ -18,6 +18,8 @@
  */
 package org.apache.pinot.common.function.scalar;
 
+import java.util.Random;
+import org.apache.commons.lang3.StringUtils;
 import org.testng.annotations.DataProvider;
 import org.testng.annotations.Test;
 
@@ -42,6 +44,8 @@ public class StringFunctionsTest {
         {"org.apache.pinot.common.function", ".", 3, 3, "common", "null"},
         {"+++++", "+", 0, 100, "", ""},
         {"+++++", "+", 1, 100, "null", "null"},
+        {"+++++org++apache++", "", 1, 100, "null", "null"},
+        {"+++++org++apache++", "", 0, 100, "+++++org++apache++", 
"+++++org++apache++"},
         // note that splitPart will split with limit first, then lookup by 
index from START or END.
         {"org.apache.pinot.common.function", ".", -1, 100, "function", 
"function"},
         {"org.apache.pinot.common.function", ".", -10, 100, "null", "null"},
@@ -64,6 +68,89 @@ public class StringFunctionsTest {
         {"org.apache.pinot.common.function", ".", -6, 6, "null", "null"},
         {"+++++", "+", -1, 100, "", ""},
         {"+++++", "+", -2, 100, "null", "null"},
+
+        // Empty delimiter: index=-1 returns input, other negative indices 
return "null"
+        {"hello", "", -1, 100, "hello", "hello"},
+        {"hello", "", -2, 100, "null", "null"},
+
+        // Single field with no delimiter present
+        {"abc", ".", 0, 100, "abc", "abc"},
+        {"abc", ".", 1, 100, "null", "null"},
+        {"abc", ".", -1, 100, "abc", "abc"},
+        {"abc", ".", -2, 100, "null", "null"},
+
+        // Input equals delimiter
+        {".", ".", 0, 100, "", ""},
+        {".", ".", 1, 100, "null", "null"},
+        {".", ".", -1, 100, "", ""},
+
+        // Trailing delimiters with content (single-char): exercises 
splitPartNegativeIdxSingleCharDelim
+        // trailing-delimiter handling (resultIdx decrement, empty trailing 
field)
+        {"org++apache++", "+", 0, 100, "org", "org"},
+        {"org++apache++", "+", 1, 100, "apache", "apache"},
+        {"org++apache++", "+", 2, 100, "", ""},
+        {"org++apache++", "+", 3, 100, "null", "null"},
+        {"org++apache++", "+", -1, 100, "", ""},
+        {"org++apache++", "+", -2, 100, "apache", "apache"},
+        {"org++apache++", "+", -3, 100, "org", "org"},
+        {"org++apache++", "+", -4, 100, "null", "null"},
+
+        // Leading AND trailing delimiters (single-char): exercises backward 
scan with
+        // both leading delimiter skip and trailing delimiter adjustment
+        {"++org++apache++", "+", 0, 100, "org", "org"},
+        {"++org++apache++", "+", 1, 100, "apache", "apache"},
+        {"++org++apache++", "+", -1, 100, "", ""},
+        {"++org++apache++", "+", -2, 100, "apache", "apache"},
+        {"++org++apache++", "+", -3, 100, "org", "org"},
+        {"++org++apache++", "+", -4, 100, "null", "null"},
+
+        // Single field surrounded by delimiters
+        {"++abc++", "+", 0, 100, "abc", "abc"},
+        {"++abc++", "+", -1, 100, "", ""},
+        {"++abc++", "+", -2, 100, "abc", "abc"},
+        {"++abc++", "+", -3, 100, "null", "null"},
+
+        // Multi-char delimiter: exercises forward scan and multi-char 
negative index
+        // path (totalFields counting + adjustedIndex conversion) which is 
separate from
+        // the single-char optimized path
+        {"org::apache::pinot", "::", 0, 100, "org", "org"},
+        {"org::apache::pinot", "::", 1, 100, "apache", "apache"},
+        {"org::apache::pinot", "::", 2, 100, "pinot", "pinot"},
+        {"org::apache::pinot", "::", 3, 100, "null", "null"},
+        {"org::apache::pinot", "::", -1, 100, "pinot", "pinot"},
+        {"org::apache::pinot", "::", -2, 100, "apache", "apache"},
+        {"org::apache::pinot", "::", -3, 100, "org", "org"},
+        {"org::apache::pinot", "::", -4, 100, "null", "null"},
+
+        // Multi-char delimiter with consecutive delimiters: exercises the 
consecutive
+        // delimiter skip in both leading-skip loop and the totalFields 
counting loop
+        {"::::org::::apache", "::", 0, 100, "org", "org"},
+        {"::::org::::apache", "::", 1, 100, "apache", "apache"},
+        {"::::org::::apache", "::", 2, 100, "null", "null"},
+        {"::::org::::apache", "::", -1, 100, "apache", "apache"},
+        {"::::org::::apache", "::", -2, 100, "org", "org"},
+        {"::::org::::apache", "::", -3, 100, "null", "null"},
+
+        // Multi-char delimiter with leading AND trailing delimiters: 
exercises the
+        // trailing empty field in the multi-char totalFields counting path
+        {"::org::apache::", "::", 0, 100, "org", "org"},
+        {"::org::apache::", "::", 1, 100, "apache", "apache"},
+        {"::org::apache::", "::", 2, 100, "", ""},
+        {"::org::apache::", "::", -1, 100, "", ""},
+        {"::org::apache::", "::", -2, 100, "apache", "apache"},
+        {"::org::apache::", "::", -3, 100, "org", "org"},
+        {"::org::apache::", "::", -4, 100, "null", "null"},
+
+        // Empty input with non-empty delimiter
+        {"", ".", 0, 100, "null", "null"},
+        {"", ".", -1, 100, "null", "null"},
+        {"", ".", -2, 100, "null", "null"},
+
+        // Empty input with multi-char delimiter and negative index
+        {"", "::", -1, 100, "null", "null"},
+
+        // Integer.MIN_VALUE: negating it overflows (remains negative), guard 
must return "null"
+        {"org.apache.pinot", ".", Integer.MIN_VALUE, 100, "null", "null"},
     };
   }
 
@@ -204,6 +291,31 @@ public class StringFunctionsTest {
     assertEquals(StringFunctions.splitPart(input, delimiter, limit, index), 
expectedTokenWithLimitCounts);
   }
 
+  @Test
+  public void testSplitPartRandomized() {
+    String chars = "abcdefg.,:;+-_/";
+    String[] delimiters = {".", ",", ":", "::", "++", "ab", "///"};
+    Random random = new Random();
+    int numIterations = 10_000;
+
+    for (int iter = 0; iter < numIterations; iter++) {
+      int len = random.nextInt(50);
+      StringBuilder sb = new StringBuilder(len);
+      for (int i = 0; i < len; i++) {
+        sb.append(chars.charAt(random.nextInt(chars.length())));
+      }
+      String input = sb.toString();
+      String delimiter = delimiters[random.nextInt(delimiters.length)];
+      int index = random.nextInt(21) - 10; // range [-10, 10]
+
+      String expected = StringFunctions.splitPartArrayBased(
+          StringUtils.splitByWholeSeparator(input, delimiter), index);
+      String actual = StringFunctions.splitPart(input, delimiter, index);
+      assertEquals(actual, expected,
+          String.format("Mismatch for input='%s', delimiter='%s', index=%d", 
input, delimiter, index));
+    }
+  }
+
   @Test(dataProvider = "prefixAndSuffixTestCases")
   public void testPrefixAndSuffix(String input, int length, String[] 
expectedPrefix, String[] expectedSuffix,
       String[] expectedPrefixWithRegexChar, String[] 
expectedSuffixWithRegexChar) {
diff --git 
a/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkSplitPart.java 
b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkSplitPart.java
new file mode 100644
index 00000000000..a2dce8ca072
--- /dev/null
+++ b/pinot-perf/src/main/java/org/apache/pinot/perf/BenchmarkSplitPart.java
@@ -0,0 +1,160 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.pinot.perf;
+
+import java.util.concurrent.TimeUnit;
+import org.apache.commons.lang3.StringUtils;
+import org.apache.pinot.common.function.scalar.StringFunctions;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.options.ChainedOptionsBuilder;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+
+
+@BenchmarkMode(Mode.AverageTime)
+@OutputTimeUnit(TimeUnit.NANOSECONDS)
+@Fork(1)
+@Warmup(iterations = 5, time = 1)
+@Measurement(iterations = 10, time = 1)
+@State(Scope.Benchmark)
+public class BenchmarkSplitPart {
+
+  @Param({
+      "small_index", "large_index",
+      "negative_index", "large_negative_index",
+      "adjacent_delimiters", "many_fields",
+      "multi_char_delim", "large_multi_char_delim"
+  })
+  public String _scenario;
+
+  private String _input;
+  private String _delimiter;
+  private int _index;
+
+  @Setup
+  public void setUp() {
+    switch (_scenario) {
+      case "small_index":
+        // Extract 2nd element from 100-field string
+        _delimiter = ",";
+        _input = buildString(100, _delimiter);
+        _index = 1;
+        break;
+      case "large_index":
+        // Extract 790th element from 10000-field string
+        _delimiter = ",";
+        _input = buildString(10000, _delimiter);
+        _index = 790;
+        break;
+      case "negative_index":
+        // Extract last element from 100-field string
+        _delimiter = ",";
+        _input = buildString(100, _delimiter);
+        _index = -1;
+        break;
+      case "large_negative_index":
+        // Extract 7242nd-from-last element from 10000-field string
+        _delimiter = ",";
+        _input = buildString(10000, _delimiter);
+        _index = -7242;
+        break;
+      case "adjacent_delimiters":
+        // String with many consecutive delimiters
+        _input = "field0+++field1+++field2+++field3";
+        _delimiter = "+";
+        _index = 1;
+        break;
+      case "many_fields":
+        // Extract 5th element from 10000-field string
+        _delimiter = ",";
+        _input = buildString(10000, _delimiter);
+        _index = 4;
+        break;
+      case "multi_char_delim":
+        // Multi-character delimiter
+        _delimiter = repeatStr(":", 10);
+        _input = buildString(50, _delimiter);
+        _index = 10;
+        break;
+      case "large_multi_char_delim":
+        // Multi-character delimiter
+        _delimiter = repeatStr(":", 4242);
+        _input = buildString(10000, _delimiter);
+        _index = 10;
+        break;
+      default:
+        throw new IllegalArgumentException("Unknown scenario: " + _scenario);
+    }
+  }
+
+  private String repeatStr(String str, int length) {
+    return String.valueOf(str).repeat(Math.max(0, length));
+  }
+
+  private String buildString(int fields, String delimiter) {
+    StringBuilder sb = new StringBuilder();
+    for (int i = 0; i < fields; i++) {
+      if (i > 0) {
+        sb.append(delimiter);
+      }
+      sb.append("field").append(i);
+    }
+    return sb.toString();
+  }
+
+  public static void main(String[] args)
+      throws Exception {
+    ChainedOptionsBuilder opt = new 
OptionsBuilder().include(BenchmarkSplitPart.class.getSimpleName());
+    new Runner(opt.build()).run();
+  }
+
+  @Benchmark
+  public String testSplitPartOld() {
+    return splitPartOld(_input, _delimiter, _index);
+  }
+
+  @Benchmark
+  public String testSplitPartNew() {
+    return StringFunctions.splitPart(_input, _delimiter, _index);
+  }
+
+  /**
+   * Original implementation that allocates full String array
+   */
+  public static String splitPartOld(String input, String delimiter, int index) 
{
+    String[] splitString = StringUtils.splitByWholeSeparator(input, delimiter);
+    if (index >= 0 && index < splitString.length) {
+      return splitString[index];
+    } else if (index < 0 && index >= -splitString.length) {
+      return splitString[splitString.length + index];
+    } else {
+      return "null";
+    }
+  }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

(pinot) branch master updated: Optimize splitPart scalar function to reduce allocation (#17660)

Reply via email to