github-actions[bot] commented on code in PR #60799:
URL: https://github.com/apache/doris/pull/60799#discussion_r2878179104
##########
be/src/vec/functions/function_string.cpp:
##########
@@ -290,6 +290,261 @@ struct FindInSetOp {
}
};
+struct NameLevenshtein {
+ static constexpr auto name = "levenshtein";
+};
+
+struct LevenshteinOp {
+ using ResultDataType = DataTypeInt32;
+ using ResultPaddedPODArray = PaddedPODArray<Int32>;
+
+ static void execute(const std::string_view& s, const std::string_view& t,
int32_t& res) {
+ const size_t m = s.size();
+ const size_t n = t.size();
+ if (m == 0) {
+ res = static_cast<int32_t>(n);
+ return;
+ }
+ if (n == 0) {
+ res = static_cast<int32_t>(m);
+ return;
+ }
+ if (s == t) {
+ res = 0;
+ return;
+ }
+ // Guard against excessive O(m*n) computation on very large inputs.
+ // Limit each string to 65535 bytes (max VARCHAR length).
+ constexpr size_t MAX_INPUT_LEN = 65535;
+ if (m > MAX_INPUT_LEN || n > MAX_INPUT_LEN) {
+ throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
+ "Input string too long for levenshtein, max
{} bytes",
+ MAX_INPUT_LEN);
+ }
+ constexpr size_t STACK_MAX = 512;
+ int32_t prev_stk[STACK_MAX + 1], curr_stk[STACK_MAX + 1];
+ std::vector<int32_t> prev_heap, curr_heap;
+ int32_t* prev_row;
+ int32_t* curr_row;
+ if (n <= STACK_MAX) {
+ prev_row = prev_stk;
+ curr_row = curr_stk;
+ } else {
+ prev_heap.resize(n + 1);
+ curr_heap.resize(n + 1);
+ prev_row = prev_heap.data();
+ curr_row = curr_heap.data();
+ }
+ for (size_t j = 0; j <= n; ++j) prev_row[j] = static_cast<int32_t>(j);
+ for (size_t i = 1; i <= m; ++i) {
+ curr_row[0] = static_cast<int32_t>(i);
+ for (size_t j = 1; j <= n; ++j) {
+ if (s[i - 1] == t[j - 1]) {
+ curr_row[j] = prev_row[j - 1];
+ } else {
+ curr_row[j] = 1 + std::min({prev_row[j - 1], prev_row[j],
curr_row[j - 1]});
+ }
+ }
+ std::swap(prev_row, curr_row);
+ }
+ res = prev_row[n];
+ }
+};
+
+struct NameDamerauLevenshtein {
+ static constexpr auto name = "damerau_levenshtein";
+};
+
+struct DamerauLevenshteinOp {
+ using ResultDataType = DataTypeInt32;
+ using ResultPaddedPODArray = PaddedPODArray<Int32>;
+
+ static void execute(const std::string_view& s, const std::string_view& t,
int32_t& res) {
+ const size_t m = s.size(), n = t.size();
+ if (m == 0) {
+ res = static_cast<int32_t>(n);
+ return;
+ }
+ if (n == 0) {
+ res = static_cast<int32_t>(m);
+ return;
+ }
+ // Guard against OOM: the DP matrix allocates
(m+2)*(n+2)*sizeof(int32_t) bytes.
+ // At 10000 chars each that is ~400 MB; cap well below VARCHAR max
(65535).
+ constexpr size_t MAX_INPUT_LEN = 10000;
+ if (m > MAX_INPUT_LEN || n > MAX_INPUT_LEN) {
+ throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
+ "Input string too long for
damerau_levenshtein, max {} bytes",
+ MAX_INPUT_LEN);
+ }
+ const size_t stride = n + 2;
+ const int32_t max_dist = static_cast<int32_t>(m + n);
+ std::vector<int32_t> d((m + 2) * stride, 0);
+ d[0] = max_dist;
+ for (size_t i = 0; i <= m; ++i) {
+ d[(i + 1) * stride + 0] = max_dist;
+ d[(i + 1) * stride + 1] = static_cast<int32_t>(i);
+ }
+ for (size_t j = 0; j <= n; ++j) {
+ d[0 * stride + (j + 1)] = max_dist;
+ d[1 * stride + (j + 1)] = static_cast<int32_t>(j);
+ }
+ int32_t da[256] = {};
+ for (size_t i = 1; i <= m; ++i) {
+ int32_t db = 0;
+ for (size_t j = 1; j <= n; ++j) {
+ const int32_t k = da[(uint8_t)t[j - 1]];
+ const int32_t l = db;
+ int32_t cost;
+ if (s[i - 1] == t[j - 1]) {
+ cost = 0;
+ db = static_cast<int32_t>(j);
+ } else {
+ cost = 1;
+ }
+ const int32_t trans = d[static_cast<size_t>(k) * stride +
static_cast<size_t>(l)] +
+ (static_cast<int32_t>(i) - k - 1) + 1 +
+ (static_cast<int32_t>(j) - l - 1);
+ d[(i + 1) * stride + (j + 1)] =
+ std::min({d[i * stride + j] + cost, d[(i + 1) * stride
+ j] + 1,
+ d[i * stride + (j + 1)] + 1, trans});
+ }
+ da[(uint8_t)s[i - 1]] = static_cast<int32_t>(i);
+ }
+ res = d[(m + 1) * stride + (n + 1)];
+ }
+};
+
+struct NameJaroWinkler {
+ static constexpr auto name = "jaro_winkler";
+};
+
+struct JaroWinklerOp {
+ using ResultDataType = DataTypeFloat64;
+ using ResultPaddedPODArray = PaddedPODArray<Float64>;
+
+ static void execute(const std::string_view& s, const std::string_view& t,
double& res) {
+ if (s == t) {
+ res = 1.0;
+ return;
+ }
+ const size_t m = s.size(), n = t.size();
+ if (m == 0 || n == 0) {
+ res = 0.0;
+ return;
+ }
+ const size_t match_dist = std::max(m, n) / 2 - (std::max(m, n) >= 2 ?
1 : 0);
Review Comment:
**[HIGH] Missing input length guard.** `JaroWinklerOp` has O(m * match_dist)
= O(m * n/2) time complexity in its matching loop (lines 453-462 in the
source), but unlike `LevenshteinOp` (capped at 65535 bytes) and
`DamerauLevenshteinOp` (capped at 10000 bytes), there is no length guard here.
Since the function accepts `STRING` type (up to ~2GB), a malicious or
accidental query like `SELECT jaro_winkler(repeat('x', 100000), repeat('y',
100000))` would perform ~5 billion iterations per row. Additionally, the heap
allocation at the else-branch (`s_heap.assign(m, 0)`) could allocate gigabytes
of memory without MemTracker awareness.
Suggested fix — add a guard after computing `m` and `n`, consistent with the
other functions:
```cpp
constexpr size_t MAX_INPUT_LEN = 65535;
if (m > MAX_INPUT_LEN || n > MAX_INPUT_LEN) {
throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
"Input string too long for jaro_winkler, max {}
bytes",
MAX_INPUT_LEN);
}
```
##########
be/src/vec/functions/function_string.cpp:
##########
@@ -290,6 +290,261 @@ struct FindInSetOp {
}
};
+struct NameLevenshtein {
+ static constexpr auto name = "levenshtein";
+};
+
+struct LevenshteinOp {
+ using ResultDataType = DataTypeInt32;
+ using ResultPaddedPODArray = PaddedPODArray<Int32>;
+
+ static void execute(const std::string_view& s, const std::string_view& t,
int32_t& res) {
+ const size_t m = s.size();
+ const size_t n = t.size();
+ if (m == 0) {
+ res = static_cast<int32_t>(n);
+ return;
+ }
+ if (n == 0) {
+ res = static_cast<int32_t>(m);
+ return;
+ }
+ if (s == t) {
+ res = 0;
+ return;
+ }
+ // Guard against excessive O(m*n) computation on very large inputs.
+ // Limit each string to 65535 bytes (max VARCHAR length).
+ constexpr size_t MAX_INPUT_LEN = 65535;
+ if (m > MAX_INPUT_LEN || n > MAX_INPUT_LEN) {
+ throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
+ "Input string too long for levenshtein, max
{} bytes",
+ MAX_INPUT_LEN);
+ }
+ constexpr size_t STACK_MAX = 512;
+ int32_t prev_stk[STACK_MAX + 1], curr_stk[STACK_MAX + 1];
+ std::vector<int32_t> prev_heap, curr_heap;
+ int32_t* prev_row;
+ int32_t* curr_row;
+ if (n <= STACK_MAX) {
+ prev_row = prev_stk;
+ curr_row = curr_stk;
+ } else {
+ prev_heap.resize(n + 1);
+ curr_heap.resize(n + 1);
+ prev_row = prev_heap.data();
+ curr_row = curr_heap.data();
+ }
+ for (size_t j = 0; j <= n; ++j) prev_row[j] = static_cast<int32_t>(j);
+ for (size_t i = 1; i <= m; ++i) {
+ curr_row[0] = static_cast<int32_t>(i);
+ for (size_t j = 1; j <= n; ++j) {
+ if (s[i - 1] == t[j - 1]) {
+ curr_row[j] = prev_row[j - 1];
+ } else {
+ curr_row[j] = 1 + std::min({prev_row[j - 1], prev_row[j],
curr_row[j - 1]});
+ }
+ }
+ std::swap(prev_row, curr_row);
+ }
+ res = prev_row[n];
+ }
+};
+
+struct NameDamerauLevenshtein {
+ static constexpr auto name = "damerau_levenshtein";
+};
+
+struct DamerauLevenshteinOp {
+ using ResultDataType = DataTypeInt32;
+ using ResultPaddedPODArray = PaddedPODArray<Int32>;
+
+ static void execute(const std::string_view& s, const std::string_view& t,
int32_t& res) {
+ const size_t m = s.size(), n = t.size();
+ if (m == 0) {
+ res = static_cast<int32_t>(n);
+ return;
+ }
+ if (n == 0) {
+ res = static_cast<int32_t>(m);
+ return;
+ }
+ // Guard against OOM: the DP matrix allocates
(m+2)*(n+2)*sizeof(int32_t) bytes.
+ // At 10000 chars each that is ~400 MB; cap well below VARCHAR max
(65535).
+ constexpr size_t MAX_INPUT_LEN = 10000;
+ if (m > MAX_INPUT_LEN || n > MAX_INPUT_LEN) {
+ throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
+ "Input string too long for
damerau_levenshtein, max {} bytes",
+ MAX_INPUT_LEN);
+ }
+ const size_t stride = n + 2;
+ const int32_t max_dist = static_cast<int32_t>(m + n);
+ std::vector<int32_t> d((m + 2) * stride, 0);
+ d[0] = max_dist;
+ for (size_t i = 0; i <= m; ++i) {
+ d[(i + 1) * stride + 0] = max_dist;
+ d[(i + 1) * stride + 1] = static_cast<int32_t>(i);
+ }
+ for (size_t j = 0; j <= n; ++j) {
+ d[0 * stride + (j + 1)] = max_dist;
+ d[1 * stride + (j + 1)] = static_cast<int32_t>(j);
+ }
+ int32_t da[256] = {};
+ for (size_t i = 1; i <= m; ++i) {
+ int32_t db = 0;
+ for (size_t j = 1; j <= n; ++j) {
+ const int32_t k = da[(uint8_t)t[j - 1]];
+ const int32_t l = db;
+ int32_t cost;
+ if (s[i - 1] == t[j - 1]) {
+ cost = 0;
+ db = static_cast<int32_t>(j);
+ } else {
+ cost = 1;
+ }
+ const int32_t trans = d[static_cast<size_t>(k) * stride +
static_cast<size_t>(l)] +
+ (static_cast<int32_t>(i) - k - 1) + 1 +
+ (static_cast<int32_t>(j) - l - 1);
+ d[(i + 1) * stride + (j + 1)] =
+ std::min({d[i * stride + j] + cost, d[(i + 1) * stride
+ j] + 1,
+ d[i * stride + (j + 1)] + 1, trans});
+ }
+ da[(uint8_t)s[i - 1]] = static_cast<int32_t>(i);
+ }
+ res = d[(m + 1) * stride + (n + 1)];
+ }
+};
+
+struct NameJaroWinkler {
+ static constexpr auto name = "jaro_winkler";
+};
+
+struct JaroWinklerOp {
+ using ResultDataType = DataTypeFloat64;
+ using ResultPaddedPODArray = PaddedPODArray<Float64>;
+
+ static void execute(const std::string_view& s, const std::string_view& t,
double& res) {
+ if (s == t) {
+ res = 1.0;
+ return;
+ }
+ const size_t m = s.size(), n = t.size();
+ if (m == 0 || n == 0) {
+ res = 0.0;
+ return;
+ }
+ const size_t match_dist = std::max(m, n) / 2 - (std::max(m, n) >= 2 ?
1 : 0);
+ constexpr size_t STACK_MAX = 512;
+ uint8_t s_stk[STACK_MAX] = {};
+ uint8_t t_stk[STACK_MAX] = {};
+ std::vector<uint8_t> s_heap, t_heap;
+ uint8_t* sm;
+ uint8_t* tm;
+ if (m <= STACK_MAX && n <= STACK_MAX) {
+ sm = s_stk;
+ tm = t_stk;
+ } else {
+ s_heap.assign(m, 0);
+ t_heap.assign(n, 0);
+ sm = s_heap.data();
+ tm = t_heap.data();
+ }
+ double matches = 0;
+ for (size_t i = 0; i < m; ++i) {
+ const size_t start = (i >= match_dist) ? i - match_dist : 0;
+ const size_t end = std::min(i + match_dist + 1, n);
+ for (size_t j = start; j < end; ++j) {
+ if (!tm[j] && s[i] == t[j]) {
+ sm[i] = tm[j] = 1;
+ ++matches;
+ break;
+ }
+ }
+ }
+ if (matches == 0.0) {
+ res = 0.0;
+ return;
+ }
+ double transpositions = 0;
+ size_t k = 0;
+ for (size_t i = 0; i < m; ++i) {
+ if (sm[i]) {
+ while (!tm[k]) ++k;
+ if (s[i] != t[k]) ++transpositions;
+ ++k;
+ }
+ }
+ const double jaro = (matches / static_cast<double>(m) + matches /
static_cast<double>(n) +
+ (matches - transpositions / 2.0) / matches) /
+ 3.0;
+ size_t prefix = 0;
+ const size_t max_prefix = std::min(static_cast<size_t>(4), std::min(m,
n));
+ while (prefix < max_prefix && s[prefix] == t[prefix]) ++prefix;
+ res = jaro + static_cast<double>(prefix) * 0.1 * (1.0 - jaro);
+ }
+};
+
+struct NameJaccardSimilarity {
+ static constexpr auto name = "jaccard_similarity";
+};
+
+struct JaccardSimilarityOp {
+ using ResultDataType = DataTypeFloat64;
+ using ResultPaddedPODArray = PaddedPODArray<Float64>;
+
+ static void execute(const std::string_view& s, const std::string_view& t,
double& res) {
+ if (s == t) {
+ res = 1.0;
+ return;
+ }
+ if (s.size() < 2 || t.size() < 2) {
+ res = 0.0;
+ return;
+ }
+ const size_t m = s.size(), n = t.size();
Review Comment:
**[MEDIUM] Missing input length guard.** `JaccardSimilarityOp` has no input
length validation. While its core algorithm is O(m log m + n log n) (less
critical than Jaro-Winkler's quadratic), the heap path at line 539
(`std::vector<uint16_t>(m - 1)`) allocates `2 * (m-1)` bytes untracked by
MemTracker. For a STRING-type input near the ~2GB limit, this could attempt a
~4GB heap allocation.
For consistency with the other functions and to prevent OOM, consider adding
a length guard:
```cpp
constexpr size_t MAX_INPUT_LEN = 65535;
if (m > MAX_INPUT_LEN || n > MAX_INPUT_LEN) {
throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
"Input string too long for jaccard_similarity,
max {} bytes",
MAX_INPUT_LEN);
}
```
--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
To unsubscribe, e-mail: [email protected]
For queries about this service, please contact Infrastructure at:
[email protected]
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]