zclllyybb commented on code in PR #59016:
URL: https://github.com/apache/doris/pull/59016#discussion_r2616353581
##########
be/src/vec/functions/function_string.h:
##########
@@ -5261,6 +5261,96 @@ class FunctionCrc32Internal : public IFunction {
}
};
+class NameLevenshtein {
+public:
+ static constexpr auto name = "levenshtein";
+};
+
+struct LevenshteinImpl {
+ static constexpr auto name = "levenshtein";
+
+ // 必需的类型定义
Review Comment:
Don't use Chinese in comments.
##########
regression-test/data/query_p0/sql_functions/string_functions/test_string_function_levenshtein.out:
##########
Review Comment:
This doesn't match your test case.
##########
be/src/vec/functions/function_string.h:
##########
@@ -5261,6 +5261,96 @@ class FunctionCrc32Internal : public IFunction {
}
};
+class NameLevenshtein {
+public:
+ static constexpr auto name = "levenshtein";
+};
+
+struct LevenshteinImpl {
+ static constexpr auto name = "levenshtein";
+
+ // 必需的类型定义
+ using ResultDataType = DataTypeInt32;
+ using ResultPaddedPODArray = ColumnInt32::Container;
+
+ static DataTypePtr get_return_type_impl(const DataTypes& arguments) {
+ return std::make_shared<ResultDataType>();
+ }
+
+ // Helper: Get UTF-8 byte length
+ static size_t get_utf8_byte_length(unsigned char byte) {
+ if (byte < 0x80) return 1;
+ if ((byte & 0xE0) == 0xC0) return 2;
+ if ((byte & 0xF0) == 0xE0) return 3;
+ if ((byte & 0xF8) == 0xF0) return 4;
+ return 1;
+ }
+
+ // Helper: Convert UTF-8 byte stream to Unicode code points
+ static void to_utf32(const char* data, size_t size, std::vector<int32_t>&
out) {
+ out.clear();
+ size_t i = 0;
+ while (i < size) {
+ size_t char_len = get_utf8_byte_length(static_cast<unsigned
char>(data[i]));
+ if (i + char_len > size) char_len = 1;
+
+ int32_t code_point = 0;
+ for (size_t j = 0; j < char_len; ++j) {
+ code_point = (code_point << 8) | static_cast<unsigned
char>(data[i + j]);
+ }
+ out.push_back(code_point);
+ i += char_len;
+ }
+ }
+
+ // 核心执行函数: 修正参数类型为 std::string_view
+ static void execute(std::string_view l, std::string_view r, int32_t& res) {
+ // Use thread_local for memory reuse
+ static thread_local std::vector<int32_t> a_code_points;
+ static thread_local std::vector<int32_t> b_code_points;
+
+ // ADMIN REQUIREMENT: Renaming or adding clear comments for these two
DP arrays
+ static thread_local std::vector<int32_t> prev_row; // Previous row
distances (i-1)
+ static thread_local std::vector<int32_t> curr_row; // Current row
distances (i)
+
+ // 注意:std::string_view 使用 .data() 和 .size() 方法
+ to_utf32(l.data(), l.size(), a_code_points);
+ to_utf32(r.data(), r.size(), b_code_points);
+
+ const auto& source = a_code_points;
+ const auto& target = b_code_points;
+
+ size_t n = source.size();
+ size_t m = target.size();
+
+ if (n == 0) {
Review Comment:
add `unlikely` here
##########
be/src/vec/functions/function_string.h:
##########
@@ -5261,6 +5261,96 @@ class FunctionCrc32Internal : public IFunction {
}
};
+class NameLevenshtein {
+public:
+ static constexpr auto name = "levenshtein";
+};
+
+struct LevenshteinImpl {
+ static constexpr auto name = "levenshtein";
+
+ // 必需的类型定义
+ using ResultDataType = DataTypeInt32;
+ using ResultPaddedPODArray = ColumnInt32::Container;
+
+ static DataTypePtr get_return_type_impl(const DataTypes& arguments) {
+ return std::make_shared<ResultDataType>();
+ }
+
+ // Helper: Get UTF-8 byte length
+ static size_t get_utf8_byte_length(unsigned char byte) {
+ if (byte < 0x80) return 1;
+ if ((byte & 0xE0) == 0xC0) return 2;
+ if ((byte & 0xF0) == 0xE0) return 3;
+ if ((byte & 0xF8) == 0xF0) return 4;
+ return 1;
+ }
+
+ // Helper: Convert UTF-8 byte stream to Unicode code points
+ static void to_utf32(const char* data, size_t size, std::vector<int32_t>&
out) {
+ out.clear();
+ size_t i = 0;
+ while (i < size) {
+ size_t char_len = get_utf8_byte_length(static_cast<unsigned
char>(data[i]));
+ if (i + char_len > size) char_len = 1;
+
+ int32_t code_point = 0;
+ for (size_t j = 0; j < char_len; ++j) {
+ code_point = (code_point << 8) | static_cast<unsigned
char>(data[i + j]);
+ }
+ out.push_back(code_point);
+ i += char_len;
+ }
+ }
+
+ // 核心执行函数: 修正参数类型为 std::string_view
+ static void execute(std::string_view l, std::string_view r, int32_t& res) {
+ // Use thread_local for memory reuse
+ static thread_local std::vector<int32_t> a_code_points;
+ static thread_local std::vector<int32_t> b_code_points;
+
+ // ADMIN REQUIREMENT: Renaming or adding clear comments for these two
DP arrays
+ static thread_local std::vector<int32_t> prev_row; // Previous row
distances (i-1)
Review Comment:
Why make it `static thread_local`? It's wrong.
##########
be/src/vec/functions/function_string.h:
##########
@@ -5261,6 +5261,96 @@ class FunctionCrc32Internal : public IFunction {
}
};
+class NameLevenshtein {
+public:
+ static constexpr auto name = "levenshtein";
+};
+
+struct LevenshteinImpl {
+ static constexpr auto name = "levenshtein";
+
+ // 必需的类型定义
+ using ResultDataType = DataTypeInt32;
+ using ResultPaddedPODArray = ColumnInt32::Container;
+
+ static DataTypePtr get_return_type_impl(const DataTypes& arguments) {
+ return std::make_shared<ResultDataType>();
+ }
+
+ // Helper: Get UTF-8 byte length
+ static size_t get_utf8_byte_length(unsigned char byte) {
+ if (byte < 0x80) return 1;
+ if ((byte & 0xE0) == 0xC0) return 2;
+ if ((byte & 0xF0) == 0xE0) return 3;
+ if ((byte & 0xF8) == 0xF0) return 4;
+ return 1;
+ }
+
+ // Helper: Convert UTF-8 byte stream to Unicode code points
+ static void to_utf32(const char* data, size_t size, std::vector<int32_t>&
out) {
+ out.clear();
+ size_t i = 0;
+ while (i < size) {
+ size_t char_len = get_utf8_byte_length(static_cast<unsigned
char>(data[i]));
+ if (i + char_len > size) char_len = 1;
+
+ int32_t code_point = 0;
+ for (size_t j = 0; j < char_len; ++j) {
+ code_point = (code_point << 8) | static_cast<unsigned
char>(data[i + j]);
+ }
+ out.push_back(code_point);
+ i += char_len;
+ }
+ }
+
+ // 核心执行函数: 修正参数类型为 std::string_view
+ static void execute(std::string_view l, std::string_view r, int32_t& res) {
+ // Use thread_local for memory reuse
+ static thread_local std::vector<int32_t> a_code_points;
+ static thread_local std::vector<int32_t> b_code_points;
+
+ // ADMIN REQUIREMENT: Renaming or adding clear comments for these two
DP arrays
+ static thread_local std::vector<int32_t> prev_row; // Previous row
distances (i-1)
+ static thread_local std::vector<int32_t> curr_row; // Current row
distances (i)
+
+ // 注意:std::string_view 使用 .data() 和 .size() 方法
+ to_utf32(l.data(), l.size(), a_code_points);
Review Comment:
See how other functions deal with UTF-8, like `SubstringUtil`.
##########
regression-test/suites/query_p0/sql_functions/string_functions/test_string_function_levenshtein.groovy:
##########
@@ -0,0 +1,68 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_string_function_levenshtein") {
+ // 1. Constant Value Tests (Sanity Check)
+ qt_select_const "SELECT levenshtein('kitten', 'sitting')"
+ qt_select_const "SELECT levenshtein('hello', 'hello')"
+ qt_select_const "SELECT levenshtein('abc', '')"
+ qt_select_const "SELECT levenshtein('', 'def')"
+ // UTF-8 fix check: '中国' (2 chars) vs '中' (1 char) -> Distance 1
+ qt_select_const "SELECT levenshtein('中国', '中')"
+ // UTF-8 fix check: '测试' (2 chars) vs '测验' (2 chars) -> Distance 1
+ qt_select_const "SELECT levenshtein('测试', '测验')"
+ qt_select_const "SELECT levenshtein(NULL, 'abc')"
+
+ // 2. Prepare Table Data for Column Tests
+ def tableName = "test_levenshtein_tbl"
+ sql "DROP TABLE IF EXISTS ${tableName}"
+ sql """
+ CREATE TABLE ${tableName} (
+ `id` int,
+ `s1` string,
+ `s2` string
+ ) DISTRIBUTED BY HASH(id) BUCKETS 1
+ PROPERTIES (
+ "replication_num" = "1"
+ )
+ """
+
+ // Insert data covering boundary and UTF-8 cases
+ sql """
+ INSERT INTO ${tableName} VALUES
+ (1, 'kitten', 'sitting'),
+ (2, 'rosettacode', 'raisethysword'),
+ (3, 'abc', 'abc'),
+ (4, '', 'abc'),
+ (5, 'abc', ''),
+ (6, '中国', '中'),
+ (7, '测试', '测验'),
+ (8, NULL, 'abc'),
+ (9, 'abc', NULL)
+ """
+
+ // 3. Column vs Column Test
+ qt_select_col_col "SELECT id, levenshtein(s1, s2) FROM ${tableName} ORDER
BY id"
Review Comment:
?
##########
be/src/vec/functions/function_string.h:
##########
@@ -5261,6 +5261,96 @@ class FunctionCrc32Internal : public IFunction {
}
};
+class NameLevenshtein {
+public:
+ static constexpr auto name = "levenshtein";
+};
+
+struct LevenshteinImpl {
+ static constexpr auto name = "levenshtein";
+
+ // 必需的类型定义
+ using ResultDataType = DataTypeInt32;
+ using ResultPaddedPODArray = ColumnInt32::Container;
+
+ static DataTypePtr get_return_type_impl(const DataTypes& arguments) {
+ return std::make_shared<ResultDataType>();
+ }
+
+ // Helper: Get UTF-8 byte length
+ static size_t get_utf8_byte_length(unsigned char byte) {
+ if (byte < 0x80) return 1;
+ if ((byte & 0xE0) == 0xC0) return 2;
+ if ((byte & 0xF0) == 0xE0) return 3;
+ if ((byte & 0xF8) == 0xF0) return 4;
+ return 1;
+ }
+
+ // Helper: Convert UTF-8 byte stream to Unicode code points
+ static void to_utf32(const char* data, size_t size, std::vector<int32_t>&
out) {
+ out.clear();
+ size_t i = 0;
+ while (i < size) {
+ size_t char_len = get_utf8_byte_length(static_cast<unsigned
char>(data[i]));
+ if (i + char_len > size) char_len = 1;
+
+ int32_t code_point = 0;
+ for (size_t j = 0; j < char_len; ++j) {
+ code_point = (code_point << 8) | static_cast<unsigned
char>(data[i + j]);
+ }
+ out.push_back(code_point);
+ i += char_len;
+ }
+ }
+
+ // 核心执行函数: 修正参数类型为 std::string_view
+ static void execute(std::string_view l, std::string_view r, int32_t& res) {
+ // Use thread_local for memory reuse
+ static thread_local std::vector<int32_t> a_code_points;
+ static thread_local std::vector<int32_t> b_code_points;
+
+ // ADMIN REQUIREMENT: Renaming or adding clear comments for these two
DP arrays
Review Comment:
?
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]