>From Michael Blow <[email protected]>:
Michael Blow has uploaded this change for review. (
https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/20477?usp=email )
Change subject: [NO ISSUE][HYR][RT] Improve UTF8 string matching
......................................................................
[NO ISSUE][HYR][RT] Improve UTF8 string matching
Improve string matching for longer strings by utilizing
KMP for longer strings
Ext-ref: MB-68178
Change-Id: I839a274ed9dd93ce7bd780a2946621f0631339a1
---
M hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/pom.xml
M
hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
A
hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableBenchmark.java
3 files changed, 387 insertions(+), 1 deletion(-)
git pull ssh://asterix-gerrit.ics.uci.edu:29418/asterixdb
refs/changes/77/20477/1
diff --git a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/pom.xml
b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/pom.xml
index 65a0793..8d242ba 100644
--- a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/pom.xml
+++ b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/pom.xml
@@ -66,5 +66,11 @@
<groupId>it.unimi.dsi</groupId>
<artifactId>fastutil</artifactId>
</dependency>
+ <dependency>
+ <groupId>org.openjdk.jmh</groupId>
+ <artifactId>jmh-core</artifactId>
+ <version>1.37</version>
+ <scope>test</scope>
+ </dependency>
</dependencies>
</project>
diff --git
a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
index 3744350..419a753 100644
---
a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
+++
b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/main/java/org/apache/hyracks/data/std/primitive/UTF8StringPointable.java
@@ -48,6 +48,8 @@
public static final UTF8StringPointableFactory FACTORY = new
UTF8StringPointableFactory();
public static final ITypeTraits TYPE_TRAITS = VarLengthTypeTrait.INSTANCE;
+ // use KMP only if pattern has at least this many bytes
+ private static final int KMP_THRESHOLD_BYTES = 12;
// These values are cached to speed up the length data access.
// Since we are using the variable-length encoding, we can save the
repeated decoding efforts.
// WARNING: must call the resetConstants() method after each reset().
@@ -272,9 +274,152 @@
return findInByteOrCodePoint(src, pattern, ignoreCase, startMatch,
false);
}
- // If resultInByte is true, then return the position in bytes, otherwise
return the position in code points
+ // Public entry if you want to expose it; otherwise call internally
+ static int findKMP(UTF8StringPointable src, UTF8StringPointable pattern,
boolean ignoreCase, int startMatch,
+ boolean resultInByte) throws HyracksDataException {
+ if (startMatch < 0) {
+ startMatch = 0;
+ }
+ final int pUtfLen = pattern.getUTF8Length();
+ if (pUtfLen == 0) {
+ return 0;
+ }
+ final int sUtfLen = src.getUTF8Length();
+ if (startMatch > sUtfLen - pUtfLen) {
+ return -1;
+ }
+
+ final int sMeta = src.getMetaDataLength();
+ final int pMeta = pattern.getMetaDataLength();
+
+ // 1. Extract pattern code points
+ IntArrayBuilder pCps = new IntArrayBuilder();
+ int pByte = 0;
+ while (pByte < pUtfLen) {
+ int cp = pattern.codePointAt(pMeta + pByte);
+ if (ignoreCase && cp <= 0xFFFF) {
+ cp = Character.toLowerCase((char) cp); // matches existing
semantics
+ }
+ pCps.add(cp);
+ pByte += pattern.codePointSize(pMeta + pByte);
+ }
+ int m = pCps.size();
+ if (m == 0) {
+ return 0;
+ }
+
+ // 2. Build LPS array
+ int[] lps = new int[m];
+ for (int len = 0, i = 1; i < m;) {
+ if (pCps.get(i) == pCps.get(len)) {
+ lps[i++] = ++len;
+ } else if (len != 0) {
+ len = lps[len - 1];
+ } else {
+ lps[i++] = 0;
+ }
+ }
+
+ // 3. Scan source
+ int iByte = startMatch;
+ int codePointIndex = 0; // code point index from startMatch (only
needed if resultInByte == false)
+ int globalCPIndex = 0; // absolute code point index (to compute result)
+ // Pre-skip to startMatch in code points if needed for code point
result
+ if (!resultInByte && startMatch != 0) {
+ int tmpByte = 0;
+ while (tmpByte < startMatch) {
+ tmpByte += src.codePointSize(sMeta + tmpByte);
+ globalCPIndex++;
+ }
+ }
+
+ int j = 0;
+ int matchStartByte = -1;
+ int matchStartCodePoint = -1;
+
+ while (iByte < sUtfLen) {
+ int cp = src.codePointAt(sMeta + iByte);
+ int lowered = cp;
+ if (ignoreCase && cp <= 0xFFFF) {
+ lowered = Character.toLowerCase((char) cp);
+ }
+ // KMP core
+ while (j > 0 && lowered != pCps.get(j)) {
+ j = lps[j - 1];
+ if (j == 0) {
+ matchStartByte = -1;
+ matchStartCodePoint = -1;
+ }
+ }
+ if (lowered == pCps.get(j)) {
+ if (j == 0) {
+ matchStartByte = iByte;
+ matchStartCodePoint = globalCPIndex;
+ }
+ j++;
+ if (j == m) {
+ return resultInByte ? matchStartByte : matchStartCodePoint;
+ }
+ } else {
+ // mismatch at j == 0 -> nothing recorded
+ }
+
+ int consumed = src.codePointSize(sMeta + iByte);
+ iByte += consumed;
+ globalCPIndex++;
+ if (!resultInByte) {
+ codePointIndex++;
+ }
+
+ // Early stop: remaining bytes < pattern bytes
+ if (sUtfLen - iByte < pUtfLen) {
+ break;
+ }
+ }
+ return -1;
+ }
+
+ // Lightweight dynamic int array (avoid extra dependencies)
+ private static final class IntArrayBuilder {
+ private int[] a = new int[16];
+ private int size = 0;
+
+ void add(int v) {
+ if (size == a.length) {
+ int[] n = new int[a.length << 1];
+ System.arraycopy(a, 0, n, 0, size);
+ a = n;
+ }
+ a[size++] = v;
+ }
+
+ int get(int i) {
+ return a[i];
+ }
+
+ int size() {
+ return size;
+ }
+ }
+
private static int findInByteOrCodePoint(UTF8StringPointable src,
UTF8StringPointable pattern, boolean ignoreCase,
int startMatch, boolean resultInByte) throws HyracksDataException {
+ int pUtfLen = pattern.getUTF8Length();
+ if (pUtfLen == 0) {
+ return 0;
+ }
+
+ // Simple byte-based threshold for performance
+ if (pUtfLen >= KMP_THRESHOLD_BYTES) {
+ return findKMP(src, pattern, ignoreCase, startMatch, resultInByte);
+ } else {
+ return findLegacy(src, pattern, ignoreCase, startMatch,
resultInByte);
+ }
+ }
+
+ // Extract original body into legacyFind to keep code compact
+ static int findLegacy(UTF8StringPointable src, UTF8StringPointable
pattern, boolean ignoreCase, int startMatch,
+ boolean resultInByte) throws HyracksDataException {
int startMatchPos = startMatch;
final int srcUtfLen = src.getUTF8Length();
final int pttnUtfLen = pattern.getUTF8Length();
diff --git
a/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableBenchmark.java
b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableBenchmark.java
new file mode 100644
index 0000000..a95cb2c
--- /dev/null
+++
b/hyracks-fullstack/hyracks/hyracks-data/hyracks-data-std/src/test/java/org/apache/hyracks/data/std/primitive/UTF8StringPointableBenchmark.java
@@ -0,0 +1,235 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.hyracks.data.std.primitive;
+
+import java.util.concurrent.TimeUnit;
+
+import org.apache.hyracks.api.exceptions.HyracksDataException;
+import org.openjdk.jmh.annotations.Benchmark;
+import org.openjdk.jmh.annotations.BenchmarkMode;
+import org.openjdk.jmh.annotations.Fork;
+import org.openjdk.jmh.annotations.Measurement;
+import org.openjdk.jmh.annotations.Mode;
+import org.openjdk.jmh.annotations.OutputTimeUnit;
+import org.openjdk.jmh.annotations.Param;
+import org.openjdk.jmh.annotations.Scope;
+import org.openjdk.jmh.annotations.Setup;
+import org.openjdk.jmh.annotations.State;
+import org.openjdk.jmh.annotations.Warmup;
+import org.openjdk.jmh.infra.Blackhole;
+import org.openjdk.jmh.runner.Runner;
+import org.openjdk.jmh.runner.options.Options;
+import org.openjdk.jmh.runner.options.OptionsBuilder;
+
+@BenchmarkMode(Mode.Throughput)
+@OutputTimeUnit(TimeUnit.SECONDS)
+@Warmup(iterations = 3, time = 500, timeUnit = TimeUnit.MILLISECONDS)
+@Measurement(iterations = 5, time = 800, timeUnit = TimeUnit.MILLISECONDS)
+@Fork(1)
+@State(Scope.Thread)
+public class UTF8StringPointableBenchmark {
+ @Param({ "ASCII", "CJK", "MIXED" })
+ public String corpusType;
+
+ @Param({ "SHORT", "THRESHOLD", "MEDIUM", "LARGE" })
+ public String patternSize;
+
+ @Param({ "BEGIN", "MIDDLE", "END", "ABSENT" })
+ public String position;
+
+ @Param({ "false" })
+ public boolean ignoreCase;
+
+ private UTF8StringPointable src;
+ private UTF8StringPointable pattern;
+ private int expectedLegacy;
+ private boolean expectFound;
+
+ @Setup
+ public void setup() throws HyracksDataException {
+ String corpus = buildCorpus(corpusType, 8192);
+ String pat = buildPattern(patternSize, corpusType);
+
+ int insertPos = computeInsertOffset(corpus.length(), pat.length(),
position);
+ if ("ABSENT".equals(position)) {
+ // Ensure absence
+ while (corpus.contains(pat)) {
+ pat = pat + "_";
+ }
+ expectFound = false;
+ } else {
+ expectFound = true;
+ corpus = corpus.substring(0, insertPos) + pat +
corpus.substring(insertPos);
+ }
+
+ src = UTF8StringPointable.generateUTF8Pointable(corpus);
+ pattern = UTF8StringPointable.generateUTF8Pointable(pat);
+
+ if (expectFound) {
+ expectedLegacy = UTF8StringPointable.findLegacy(src, pattern,
ignoreCase, 0, true);
+ if (expectedLegacy < 0) {
+ throw new IllegalStateException("Legacy failed to find
expected pattern");
+ }
+ } else {
+ expectedLegacy = -1;
+ }
+ }
+
+ @Benchmark
+ public void legacyFind(Blackhole bh) throws HyracksDataException {
+ int r = UTF8StringPointable.findLegacy(src, pattern, ignoreCase, 0,
true);
+ if (expectFound && r != expectedLegacy) {
+ throw new AssertionError("legacy mismatch");
+ }
+ bh.consume(r);
+ }
+
+ @Benchmark
+ public void kmpFind(Blackhole bh) throws HyracksDataException {
+ int r = UTF8StringPointable.findKMP(src, pattern, ignoreCase, 0, true);
+ if (expectFound && r != expectedLegacy) {
+ throw new AssertionError("kmp mismatch");
+ }
+ bh.consume(r);
+ }
+ /*
+ @Benchmark
+ public void heuristicFind(Blackhole bh) throws HyracksDataException {
+ int r = UTF8StringPointable.find(src, pattern, ignoreCase);
+ if (expectFound && r != expectedLegacy) {
+ throw new AssertionError("heuristic mismatch");
+ }
+ bh.consume(r);
+ }
+ */
+ // Helpers
+
+ private static String buildCorpus(String type, int targetLen) {
+ String ascii =
"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789 ";
+ String cjk = "漢字仮名交じり文日本語文章例外参照追加測試數據";
+ String mixed = "abc漢字DEFéñΩЖ中🙂";
+
+ String alphabet;
+ switch (type) {
+ case "ASCII":
+ alphabet = ascii;
+ break;
+ case "CJK":
+ alphabet = cjk;
+ break;
+ default:
+ alphabet = mixed;
+ }
+ StringBuilder sb = new StringBuilder(targetLen + 32);
+ int i = 0;
+ while (sb.length() < targetLen) {
+ sb.append(alphabet.charAt(i % alphabet.length()));
+ i++;
+ }
+ return sb.toString();
+ }
+
+ private static String buildPattern(String sizeKind, String corpusType) {
+ int targetBytes;
+ switch (sizeKind) {
+ case "SHORT":
+ targetBytes = 6;
+ break;
+ case "THRESHOLD":
+ targetBytes = 12;
+ break;
+ case "MEDIUM":
+ targetBytes = 48;
+ break;
+ default:
+ targetBytes = 192; // LARGE
+ }
+
+ String base;
+ if ("CJK".equals(corpusType)) {
+ base = "漢字測試資料例外";
+ } else if ("ASCII".equals(corpusType)) {
+ base = "PatternValueForBenchMarking";
+ } else {
+ base = "Mix漢AΩZéß";
+ }
+
+ StringBuilder sb = new StringBuilder();
+ int charIdx = 0; // Index into the base string as char units
+
+ while (true) {
+ // Safely get the next code point, wrapping around the base string
+ if (charIdx >= base.length()) {
+ charIdx = 0;
+ }
+
+ int cp = base.codePointAt(charIdx);
+
+ // Calculate UTF-8 byte length for this code point
+ int cpByteLen;
+ if (cp <= 0x7F) {
+ cpByteLen = 1;
+ } else if (cp <= 0x7FF) {
+ cpByteLen = 2;
+ } else if (cp <= 0xFFFF) {
+ cpByteLen = 3;
+ } else {
+ cpByteLen = 4;
+ }
+
+ // Check if adding this code point would exceed target bytes
+ String current = sb.toString();
+ int currentBytes =
current.getBytes(java.nio.charset.StandardCharsets.UTF_8).length;
+ if (currentBytes + cpByteLen > targetBytes) {
+ break;
+ }
+
+ sb.appendCodePoint(cp);
+
+ // Advance by the number of char units this code point occupies
+ charIdx += Character.charCount(cp);
+ }
+
+ // Ensure we have at least one character
+ if (sb.isEmpty()) {
+ sb.append('a');
+ }
+
+ return sb.toString();
+ }
+
+ private static int computeInsertOffset(int corpusChars, int patChars,
String pos) {
+ switch (pos) {
+ case "BEGIN":
+ return 0;
+ case "MIDDLE":
+ return Math.max(0, (corpusChars - patChars) / 2);
+ case "END":
+ return Math.max(0, corpusChars - patChars);
+ default:
+ return -1; // ABSENT
+ }
+ }
+
+ // Main for ad-hoc execution
+ public static void main(String[] args) throws Exception {
+ Options opt = new
OptionsBuilder().include(UTF8StringPointableBenchmark.class.getSimpleName()).forks(1).build();
+ new Runner(opt).run();
+ }
+}
--
To view, visit https://asterix-gerrit.ics.uci.edu/c/asterixdb/+/20477?usp=email
To unsubscribe, or for help writing mail filters, visit
https://asterix-gerrit.ics.uci.edu/settings?usp=email
Gerrit-MessageType: newchange
Gerrit-Project: asterixdb
Gerrit-Branch: phoenix
Gerrit-Change-Id: I839a274ed9dd93ce7bd780a2946621f0631339a1
Gerrit-Change-Number: 20477
Gerrit-PatchSet: 1
Gerrit-Owner: Michael Blow <[email protected]>