[L] Change in asterixdb[phoenix]: [NO ISSUE][HYR][RT] Improve UTF8 string matching

AsterixDB Code Review Thu, 09 Oct 2025 16:08:42 -0700

>From Michael Blow <[email protected]>:

Michael Blow has uploaded this change for review. ( 
https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/20477?usp=email )



Change subject: [NO ISSUE][HYR][RT] Improve UTF8 string matching
......................................................................

[NO ISSUE][HYR][RT] Improve UTF8 string matching

Improve string matching for longer strings by utilizing
KMP for longer strings

Ext-ref: MB-68178
Change-Id: I839a274ed9dd93ce7bd780a2946621f0631339a1
---
M hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/pom.xml
M 
hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
A 
hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableBenchmark.java
3 files changed, 387 insertions(+), 1 deletion(-)



  git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb 
refs/changes/77/20477/1

diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/pom.xml 
b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/pom.xml
index 65a0793..8d242ba 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/pom.xml
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/pom.xml
@@ -66,5 +66,11 @@
       <groupId>it.unimi.dsi</groupId>
       <artifactId>fastutil</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.openjdk.jmh</groupId>
+      <artifactId>jmh-core</artifactId>
+      <version>1.37</version>
+      <scope>test</scope>
+    </dependency>
   </dependencies>
 </project>
diff --git 
a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
 
b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
index 3744350..419a753 100644
--- 
a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
+++ 
b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
@@ -48,6 +48,8 @@

     public static final UTF8StringPointableFactory FACTORY = new 
UTF8StringPointableFactory();
     public static final ITypeTraits TYPE_TRAITS = VarLengthTypeTrait.INSTANCE;
+    // use KMP only if pattern has at least this many bytes
+    private static final int KMP_THRESHOLD_BYTES = 12;
     // These values are cached to speed up the length data access.
     // Since we are using the variable-length encoding, we can save the 
repeated decoding efforts.
     // WARNING: must call the resetConstants() method after each reset().
@@ -272,9 +274,152 @@
         return findInByteOrCodePoint(src, pattern, ignoreCase, startMatch, 
false);
     }

-    // If resultInByte is true, then return the position in bytes, otherwise 
return the position in code points
+    // Public entry if you want to expose it; otherwise call internally
+    static int findKMP(UTF8StringPointable src, UTF8StringPointable pattern, 
boolean ignoreCase, int startMatch,
+            boolean resultInByte) throws HyracksDataException {
+        if (startMatch < 0) {
+            startMatch = 0;
+        }
+        final int pUtfLen = pattern.getUTF8Length();
+        if (pUtfLen == 0) {
+            return 0;
+        }
+        final int sUtfLen = src.getUTF8Length();
+        if (startMatch > sUtfLen - pUtfLen) {
+            return -1;
+        }
+
+        final int sMeta = src.getMetaDataLength();
+        final int pMeta = pattern.getMetaDataLength();
+
+        // 1. Extract pattern code points
+        IntArrayBuilder pCps = new IntArrayBuilder();
+        int pByte = 0;
+        while (pByte < pUtfLen) {
+            int cp = pattern.codePointAt(pMeta + pByte);
+            if (ignoreCase && cp <= 0xFFFF) {
+                cp = Character.toLowerCase((char) cp); // matches existing 
semantics
+            }
+            pCps.add(cp);
+            pByte += pattern.codePointSize(pMeta + pByte);
+        }
+        int m = pCps.size();
+        if (m == 0) {
+            return 0;
+        }
+
+        // 2. Build LPS array
+        int[] lps = new int[m];
+        for (int len = 0, i = 1; i < m;) {
+            if (pCps.get(i) == pCps.get(len)) {
+                lps[i++] = ++len;
+            } else if (len != 0) {
+                len = lps[len - 1];
+            } else {
+                lps[i++] = 0;
+            }
+        }
+
+        // 3. Scan source
+        int iByte = startMatch;
+        int codePointIndex = 0; // code point index from startMatch (only 
needed if resultInByte == false)
+        int globalCPIndex = 0; // absolute code point index (to compute result)
+        // Pre-skip to startMatch in code points if needed for code point 
result
+        if (!resultInByte && startMatch != 0) {
+            int tmpByte = 0;
+            while (tmpByte < startMatch) {
+                tmpByte += src.codePointSize(sMeta + tmpByte);
+                globalCPIndex++;
+            }
+        }
+
+        int j = 0;
+        int matchStartByte = -1;
+        int matchStartCodePoint = -1;
+
+        while (iByte < sUtfLen) {
+            int cp = src.codePointAt(sMeta + iByte);
+            int lowered = cp;
+            if (ignoreCase && cp <= 0xFFFF) {
+                lowered = Character.toLowerCase((char) cp);
+            }
+            // KMP core
+            while (j > 0 && lowered != pCps.get(j)) {
+                j = lps[j - 1];
+                if (j == 0) {
+                    matchStartByte = -1;
+                    matchStartCodePoint = -1;
+                }
+            }
+            if (lowered == pCps.get(j)) {
+                if (j == 0) {
+                    matchStartByte = iByte;
+                    matchStartCodePoint = globalCPIndex;
+                }
+                j++;
+                if (j == m) {
+                    return resultInByte ? matchStartByte : matchStartCodePoint;
+                }
+            } else {
+                // mismatch at j == 0 -> nothing recorded
+            }
+
+            int consumed = src.codePointSize(sMeta + iByte);
+            iByte += consumed;
+            globalCPIndex++;
+            if (!resultInByte) {
+                codePointIndex++;
+            }
+
+            // Early stop: remaining bytes < pattern bytes
+            if (sUtfLen - iByte < pUtfLen) {
+                break;
+            }
+        }
+        return -1;
+    }
+
+    // Lightweight dynamic int array (avoid extra dependencies)
+    private static final class IntArrayBuilder {
+        private int[] a = new int[16];
+        private int size = 0;
+
+        void add(int v) {
+            if (size == a.length) {
+                int[] n = new int[a.length << 1];
+                System.arraycopy(a, 0, n, 0, size);
+                a = n;
+            }
+            a[size++] = v;
+        }
+
+        int get(int i) {
+            return a[i];
+        }
+
+        int size() {
+            return size;
+        }
+    }
+
     private static int findInByteOrCodePoint(UTF8StringPointable src, 
UTF8StringPointable pattern, boolean ignoreCase,
             int startMatch, boolean resultInByte) throws HyracksDataException {
+        int pUtfLen = pattern.getUTF8Length();
+        if (pUtfLen == 0) {
+            return 0;
+        }
+
+        // Simple byte-based threshold for performance
+        if (pUtfLen >= KMP_THRESHOLD_BYTES) {
+            return findKMP(src, pattern, ignoreCase, startMatch, resultInByte);
+        } else {
+            return findLegacy(src, pattern, ignoreCase, startMatch, 
resultInByte);
+        }
+    }
+
+    // Extract original body into legacyFind to keep code compact
+    static int findLegacy(UTF8StringPointable src, UTF8StringPointable 
pattern, boolean ignoreCase, int startMatch,
+            boolean resultInByte) throws HyracksDataException {
         int startMatchPos = startMatch;
         final int srcUtfLen = src.getUTF8Length();
         final int pttnUtfLen = pattern.getUTF8Length();
diff --git 
a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableBenchmark.java
 
b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableBenchmark.java
new file mode 100644
index 0000000..a95cb2c
--- /dev/null
+++ 
b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableBenchmark.java
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hyracks.data.std.primitive;
+
+import java.util.concurrent.TimeUnit;
+
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.options.Options;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.SECONDS)
+@Warmup(iterations = 3, time = 500, timeUnit = TimeUnit.MILLISECONDS)
+@Measurement(iterations = 5, time = 800, timeUnit = TimeUnit.MILLISECONDS)
+@Fork(1)
+@State(Scope.Thread)
+public class UTF8StringPointableBenchmark {
+    @Param({ "ASCII", "CJK", "MIXED" })
+    public String corpusType;
+
+    @Param({ "SHORT", "THRESHOLD", "MEDIUM", "LARGE" })
+    public String patternSize;
+
+    @Param({ "BEGIN", "MIDDLE", "END", "ABSENT" })
+    public String position;
+
+    @Param({ "false" })
+    public boolean ignoreCase;
+
+    private UTF8StringPointable src;
+    private UTF8StringPointable pattern;
+    private int expectedLegacy;
+    private boolean expectFound;
+
+    @Setup
+    public void setup() throws HyracksDataException {
+        String corpus = buildCorpus(corpusType, 8192);
+        String pat = buildPattern(patternSize, corpusType);
+
+        int insertPos = computeInsertOffset(corpus.length(), pat.length(), 
position);
+        if ("ABSENT".equals(position)) {
+            // Ensure absence
+            while (corpus.contains(pat)) {
+                pat = pat + "_";
+            }
+            expectFound = false;
+        } else {
+            expectFound = true;
+            corpus = corpus.substring(0, insertPos) + pat + 
corpus.substring(insertPos);
+        }
+
+        src = UTF8StringPointable.generateUTF8Pointable(corpus);
+        pattern = UTF8StringPointable.generateUTF8Pointable(pat);
+
+        if (expectFound) {
+            expectedLegacy = UTF8StringPointable.findLegacy(src, pattern, 
ignoreCase, 0, true);
+            if (expectedLegacy < 0) {
+                throw new IllegalStateException("Legacy failed to find 
expected pattern");
+            }
+        } else {
+            expectedLegacy = -1;
+        }
+    }
+
+    @Benchmark
+    public void legacyFind(Blackhole bh) throws HyracksDataException {
+        int r = UTF8StringPointable.findLegacy(src, pattern, ignoreCase, 0, 
true);
+        if (expectFound && r != expectedLegacy) {
+            throw new AssertionError("legacy mismatch");
+        }
+        bh.consume(r);
+    }
+
+    @Benchmark
+    public void kmpFind(Blackhole bh) throws HyracksDataException {
+        int r = UTF8StringPointable.findKMP(src, pattern, ignoreCase, 0, true);
+        if (expectFound && r != expectedLegacy) {
+            throw new AssertionError("kmp mismatch");
+        }
+        bh.consume(r);
+    }
+    /*
+    @Benchmark
+    public void heuristicFind(Blackhole bh) throws HyracksDataException {
+        int r = UTF8StringPointable.find(src, pattern, ignoreCase);
+        if (expectFound && r != expectedLegacy) {
+            throw new AssertionError("heuristic mismatch");
+        }
+        bh.consume(r);
+    }
+    */
+    // Helpers
+
+    private static String buildCorpus(String type, int targetLen) {
+        String ascii = 
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
+        String cjk = "漢字仮名交じり文日本語文章例外参照追加測試數據";
+        String mixed = "abc漢字DEFéñΩЖ中🙂";
+
+        String alphabet;
+        switch (type) {
+            case "ASCII":
+                alphabet = ascii;
+                break;
+            case "CJK":
+                alphabet = cjk;
+                break;
+            default:
+                alphabet = mixed;
+        }
+        StringBuilder sb = new StringBuilder(targetLen + 32);
+        int i = 0;
+        while (sb.length() < targetLen) {
+            sb.append(alphabet.charAt(i % alphabet.length()));
+            i++;
+        }
+        return sb.toString();
+    }
+
+    private static String buildPattern(String sizeKind, String corpusType) {
+        int targetBytes;
+        switch (sizeKind) {
+            case "SHORT":
+                targetBytes = 6;
+                break;
+            case "THRESHOLD":
+                targetBytes = 12;
+                break;
+            case "MEDIUM":
+                targetBytes = 48;
+                break;
+            default:
+                targetBytes = 192; // LARGE
+        }
+
+        String base;
+        if ("CJK".equals(corpusType)) {
+            base = "漢字測試資料例外";
+        } else if ("ASCII".equals(corpusType)) {
+            base = "PatternValueForBenchMarking";
+        } else {
+            base = "Mix漢AΩZéß";
+        }
+
+        StringBuilder sb = new StringBuilder();
+        int charIdx = 0; // Index into the base string as char units
+
+        while (true) {
+            // Safely get the next code point, wrapping around the base string
+            if (charIdx >= base.length()) {
+                charIdx = 0;
+            }
+
+            int cp = base.codePointAt(charIdx);
+
+            // Calculate UTF-8 byte length for this code point
+            int cpByteLen;
+            if (cp <= 0x7F) {
+                cpByteLen = 1;
+            } else if (cp <= 0x7FF) {
+                cpByteLen = 2;
+            } else if (cp <= 0xFFFF) {
+                cpByteLen = 3;
+            } else {
+                cpByteLen = 4;
+            }
+
+            // Check if adding this code point would exceed target bytes
+            String current = sb.toString();
+            int currentBytes = 
current.getBytes(java.nio.charset.StandardCharsets.UTF_8).length;
+            if (currentBytes + cpByteLen > targetBytes) {
+                break;
+            }
+
+            sb.appendCodePoint(cp);
+
+            // Advance by the number of char units this code point occupies
+            charIdx += Character.charCount(cp);
+        }
+
+        // Ensure we have at least one character
+        if (sb.isEmpty()) {
+            sb.append('a');
+        }
+
+        return sb.toString();
+    }
+
+    private static int computeInsertOffset(int corpusChars, int patChars, 
String pos) {
+        switch (pos) {
+            case "BEGIN":
+                return 0;
+            case "MIDDLE":
+                return Math.max(0, (corpusChars - patChars) / 2);
+            case "END":
+                return Math.max(0, corpusChars - patChars);
+            default:
+                return -1; // ABSENT
+        }
+    }
+
+    // Main for ad-hoc execution
+    public static void main(String[] args) throws Exception {
+        Options opt = new 
OptionsBuilder().include(UTF8StringPointableBenchmark.class.getSimpleName()).forks(1).build();
+        new Runner(opt).run();
+    }
+}

--
To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/20477?usp=email
To unsubscribe, or for help writing mail filters, visit 
https://asterix-gerrit.ics.uci.edu/settings?usp=email

Gerrit-MessageType: newchange
Gerrit-Project: asterixdb
Gerrit-Branch: phoenix
Gerrit-Change-Id: I839a274ed9dd93ce7bd780a2946621f0631339a1
Gerrit-Change-Number: 20477
Gerrit-PatchSet: 1
Gerrit-Owner: Michael Blow <[email protected]>

[L] Change in asterixdb[phoenix]: [NO ISSUE][HYR][RT] Improve UTF8 string matching

Reply via email to