github-actions[bot] commented on code in PR #60799:
URL: https://github.com/apache/doris/pull/60799#discussion_r2878179104


##########
be/src/vec/functions/function_string.cpp:
##########
@@ -290,6 +290,261 @@ struct FindInSetOp {
     }
 };
 
+struct NameLevenshtein {
+    static constexpr auto name = "levenshtein";
+};
+
+struct LevenshteinOp {
+    using ResultDataType = DataTypeInt32;
+    using ResultPaddedPODArray = PaddedPODArray<Int32>;
+
+    static void execute(const std::string_view& s, const std::string_view& t, 
int32_t& res) {
+        const size_t m = s.size();
+        const size_t n = t.size();
+        if (m == 0) {
+            res = static_cast<int32_t>(n);
+            return;
+        }
+        if (n == 0) {
+            res = static_cast<int32_t>(m);
+            return;
+        }
+        if (s == t) {
+            res = 0;
+            return;
+        }
+        // Guard against excessive O(m*n) computation on very large inputs.
+        // Limit each string to 65535 bytes (max VARCHAR length).
+        constexpr size_t MAX_INPUT_LEN = 65535;
+        if (m > MAX_INPUT_LEN || n > MAX_INPUT_LEN) {
+            throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
+                                   "Input string too long for levenshtein, max 
{} bytes",
+                                   MAX_INPUT_LEN);
+        }
+        constexpr size_t STACK_MAX = 512;
+        int32_t prev_stk[STACK_MAX + 1], curr_stk[STACK_MAX + 1];
+        std::vector<int32_t> prev_heap, curr_heap;
+        int32_t* prev_row;
+        int32_t* curr_row;
+        if (n <= STACK_MAX) {
+            prev_row = prev_stk;
+            curr_row = curr_stk;
+        } else {
+            prev_heap.resize(n + 1);
+            curr_heap.resize(n + 1);
+            prev_row = prev_heap.data();
+            curr_row = curr_heap.data();
+        }
+        for (size_t j = 0; j <= n; ++j) prev_row[j] = static_cast<int32_t>(j);
+        for (size_t i = 1; i <= m; ++i) {
+            curr_row[0] = static_cast<int32_t>(i);
+            for (size_t j = 1; j <= n; ++j) {
+                if (s[i - 1] == t[j - 1]) {
+                    curr_row[j] = prev_row[j - 1];
+                } else {
+                    curr_row[j] = 1 + std::min({prev_row[j - 1], prev_row[j], 
curr_row[j - 1]});
+                }
+            }
+            std::swap(prev_row, curr_row);
+        }
+        res = prev_row[n];
+    }
+};
+
+struct NameDamerauLevenshtein {
+    static constexpr auto name = "damerau_levenshtein";
+};
+
+struct DamerauLevenshteinOp {
+    using ResultDataType = DataTypeInt32;
+    using ResultPaddedPODArray = PaddedPODArray<Int32>;
+
+    static void execute(const std::string_view& s, const std::string_view& t, 
int32_t& res) {
+        const size_t m = s.size(), n = t.size();
+        if (m == 0) {
+            res = static_cast<int32_t>(n);
+            return;
+        }
+        if (n == 0) {
+            res = static_cast<int32_t>(m);
+            return;
+        }
+        // Guard against OOM: the DP matrix allocates 
(m+2)*(n+2)*sizeof(int32_t) bytes.
+        // At 10000 chars each that is ~400 MB; cap well below VARCHAR max 
(65535).
+        constexpr size_t MAX_INPUT_LEN = 10000;
+        if (m > MAX_INPUT_LEN || n > MAX_INPUT_LEN) {
+            throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
+                                   "Input string too long for 
damerau_levenshtein, max {} bytes",
+                                   MAX_INPUT_LEN);
+        }
+        const size_t stride = n + 2;
+        const int32_t max_dist = static_cast<int32_t>(m + n);
+        std::vector<int32_t> d((m + 2) * stride, 0);
+        d[0] = max_dist;
+        for (size_t i = 0; i <= m; ++i) {
+            d[(i + 1) * stride + 0] = max_dist;
+            d[(i + 1) * stride + 1] = static_cast<int32_t>(i);
+        }
+        for (size_t j = 0; j <= n; ++j) {
+            d[0 * stride + (j + 1)] = max_dist;
+            d[1 * stride + (j + 1)] = static_cast<int32_t>(j);
+        }
+        int32_t da[256] = {};
+        for (size_t i = 1; i <= m; ++i) {
+            int32_t db = 0;
+            for (size_t j = 1; j <= n; ++j) {
+                const int32_t k = da[(uint8_t)t[j - 1]];
+                const int32_t l = db;
+                int32_t cost;
+                if (s[i - 1] == t[j - 1]) {
+                    cost = 0;
+                    db = static_cast<int32_t>(j);
+                } else {
+                    cost = 1;
+                }
+                const int32_t trans = d[static_cast<size_t>(k) * stride + 
static_cast<size_t>(l)] +
+                                      (static_cast<int32_t>(i) - k - 1) + 1 +
+                                      (static_cast<int32_t>(j) - l - 1);
+                d[(i + 1) * stride + (j + 1)] =
+                        std::min({d[i * stride + j] + cost, d[(i + 1) * stride 
+ j] + 1,
+                                  d[i * stride + (j + 1)] + 1, trans});
+            }
+            da[(uint8_t)s[i - 1]] = static_cast<int32_t>(i);
+        }
+        res = d[(m + 1) * stride + (n + 1)];
+    }
+};
+
+struct NameJaroWinkler {
+    static constexpr auto name = "jaro_winkler";
+};
+
+struct JaroWinklerOp {
+    using ResultDataType = DataTypeFloat64;
+    using ResultPaddedPODArray = PaddedPODArray<Float64>;
+
+    static void execute(const std::string_view& s, const std::string_view& t, 
double& res) {
+        if (s == t) {
+            res = 1.0;
+            return;
+        }
+        const size_t m = s.size(), n = t.size();
+        if (m == 0 || n == 0) {
+            res = 0.0;
+            return;
+        }
+        const size_t match_dist = std::max(m, n) / 2 - (std::max(m, n) >= 2 ? 
1 : 0);

Review Comment:
   **[HIGH] Missing input length guard.** `JaroWinklerOp` has O(m * match_dist) 
= O(m * n/2) time complexity in its matching loop (lines 453-462 in the 
source), but unlike `LevenshteinOp` (capped at 65535 bytes) and 
`DamerauLevenshteinOp` (capped at 10000 bytes), there is no length guard here.
   
   Since the function accepts `STRING` type (up to ~2GB), a malicious or 
accidental query like `SELECT jaro_winkler(repeat('x', 100000), repeat('y', 
100000))` would perform ~5 billion iterations per row. Additionally, the heap 
allocation at the else-branch (`s_heap.assign(m, 0)`) could allocate gigabytes 
of memory without MemTracker awareness.
   
   Suggested fix — add a guard after computing `m` and `n`, consistent with the 
other functions:
   ```cpp
   constexpr size_t MAX_INPUT_LEN = 65535;
   if (m > MAX_INPUT_LEN || n > MAX_INPUT_LEN) {
       throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
                              "Input string too long for jaro_winkler, max {} 
bytes",
                              MAX_INPUT_LEN);
   }
   ```



##########
be/src/vec/functions/function_string.cpp:
##########
@@ -290,6 +290,261 @@ struct FindInSetOp {
     }
 };
 
+struct NameLevenshtein {
+    static constexpr auto name = "levenshtein";
+};
+
+struct LevenshteinOp {
+    using ResultDataType = DataTypeInt32;
+    using ResultPaddedPODArray = PaddedPODArray<Int32>;
+
+    static void execute(const std::string_view& s, const std::string_view& t, 
int32_t& res) {
+        const size_t m = s.size();
+        const size_t n = t.size();
+        if (m == 0) {
+            res = static_cast<int32_t>(n);
+            return;
+        }
+        if (n == 0) {
+            res = static_cast<int32_t>(m);
+            return;
+        }
+        if (s == t) {
+            res = 0;
+            return;
+        }
+        // Guard against excessive O(m*n) computation on very large inputs.
+        // Limit each string to 65535 bytes (max VARCHAR length).
+        constexpr size_t MAX_INPUT_LEN = 65535;
+        if (m > MAX_INPUT_LEN || n > MAX_INPUT_LEN) {
+            throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
+                                   "Input string too long for levenshtein, max 
{} bytes",
+                                   MAX_INPUT_LEN);
+        }
+        constexpr size_t STACK_MAX = 512;
+        int32_t prev_stk[STACK_MAX + 1], curr_stk[STACK_MAX + 1];
+        std::vector<int32_t> prev_heap, curr_heap;
+        int32_t* prev_row;
+        int32_t* curr_row;
+        if (n <= STACK_MAX) {
+            prev_row = prev_stk;
+            curr_row = curr_stk;
+        } else {
+            prev_heap.resize(n + 1);
+            curr_heap.resize(n + 1);
+            prev_row = prev_heap.data();
+            curr_row = curr_heap.data();
+        }
+        for (size_t j = 0; j <= n; ++j) prev_row[j] = static_cast<int32_t>(j);
+        for (size_t i = 1; i <= m; ++i) {
+            curr_row[0] = static_cast<int32_t>(i);
+            for (size_t j = 1; j <= n; ++j) {
+                if (s[i - 1] == t[j - 1]) {
+                    curr_row[j] = prev_row[j - 1];
+                } else {
+                    curr_row[j] = 1 + std::min({prev_row[j - 1], prev_row[j], 
curr_row[j - 1]});
+                }
+            }
+            std::swap(prev_row, curr_row);
+        }
+        res = prev_row[n];
+    }
+};
+
+struct NameDamerauLevenshtein {
+    static constexpr auto name = "damerau_levenshtein";
+};
+
+struct DamerauLevenshteinOp {
+    using ResultDataType = DataTypeInt32;
+    using ResultPaddedPODArray = PaddedPODArray<Int32>;
+
+    static void execute(const std::string_view& s, const std::string_view& t, 
int32_t& res) {
+        const size_t m = s.size(), n = t.size();
+        if (m == 0) {
+            res = static_cast<int32_t>(n);
+            return;
+        }
+        if (n == 0) {
+            res = static_cast<int32_t>(m);
+            return;
+        }
+        // Guard against OOM: the DP matrix allocates 
(m+2)*(n+2)*sizeof(int32_t) bytes.
+        // At 10000 chars each that is ~400 MB; cap well below VARCHAR max 
(65535).
+        constexpr size_t MAX_INPUT_LEN = 10000;
+        if (m > MAX_INPUT_LEN || n > MAX_INPUT_LEN) {
+            throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
+                                   "Input string too long for 
damerau_levenshtein, max {} bytes",
+                                   MAX_INPUT_LEN);
+        }
+        const size_t stride = n + 2;
+        const int32_t max_dist = static_cast<int32_t>(m + n);
+        std::vector<int32_t> d((m + 2) * stride, 0);
+        d[0] = max_dist;
+        for (size_t i = 0; i <= m; ++i) {
+            d[(i + 1) * stride + 0] = max_dist;
+            d[(i + 1) * stride + 1] = static_cast<int32_t>(i);
+        }
+        for (size_t j = 0; j <= n; ++j) {
+            d[0 * stride + (j + 1)] = max_dist;
+            d[1 * stride + (j + 1)] = static_cast<int32_t>(j);
+        }
+        int32_t da[256] = {};
+        for (size_t i = 1; i <= m; ++i) {
+            int32_t db = 0;
+            for (size_t j = 1; j <= n; ++j) {
+                const int32_t k = da[(uint8_t)t[j - 1]];
+                const int32_t l = db;
+                int32_t cost;
+                if (s[i - 1] == t[j - 1]) {
+                    cost = 0;
+                    db = static_cast<int32_t>(j);
+                } else {
+                    cost = 1;
+                }
+                const int32_t trans = d[static_cast<size_t>(k) * stride + 
static_cast<size_t>(l)] +
+                                      (static_cast<int32_t>(i) - k - 1) + 1 +
+                                      (static_cast<int32_t>(j) - l - 1);
+                d[(i + 1) * stride + (j + 1)] =
+                        std::min({d[i * stride + j] + cost, d[(i + 1) * stride 
+ j] + 1,
+                                  d[i * stride + (j + 1)] + 1, trans});
+            }
+            da[(uint8_t)s[i - 1]] = static_cast<int32_t>(i);
+        }
+        res = d[(m + 1) * stride + (n + 1)];
+    }
+};
+
+struct NameJaroWinkler {
+    static constexpr auto name = "jaro_winkler";
+};
+
+struct JaroWinklerOp {
+    using ResultDataType = DataTypeFloat64;
+    using ResultPaddedPODArray = PaddedPODArray<Float64>;
+
+    static void execute(const std::string_view& s, const std::string_view& t, 
double& res) {
+        if (s == t) {
+            res = 1.0;
+            return;
+        }
+        const size_t m = s.size(), n = t.size();
+        if (m == 0 || n == 0) {
+            res = 0.0;
+            return;
+        }
+        const size_t match_dist = std::max(m, n) / 2 - (std::max(m, n) >= 2 ? 
1 : 0);
+        constexpr size_t STACK_MAX = 512;
+        uint8_t s_stk[STACK_MAX] = {};
+        uint8_t t_stk[STACK_MAX] = {};
+        std::vector<uint8_t> s_heap, t_heap;
+        uint8_t* sm;
+        uint8_t* tm;
+        if (m <= STACK_MAX && n <= STACK_MAX) {
+            sm = s_stk;
+            tm = t_stk;
+        } else {
+            s_heap.assign(m, 0);
+            t_heap.assign(n, 0);
+            sm = s_heap.data();
+            tm = t_heap.data();
+        }
+        double matches = 0;
+        for (size_t i = 0; i < m; ++i) {
+            const size_t start = (i >= match_dist) ? i - match_dist : 0;
+            const size_t end = std::min(i + match_dist + 1, n);
+            for (size_t j = start; j < end; ++j) {
+                if (!tm[j] && s[i] == t[j]) {
+                    sm[i] = tm[j] = 1;
+                    ++matches;
+                    break;
+                }
+            }
+        }
+        if (matches == 0.0) {
+            res = 0.0;
+            return;
+        }
+        double transpositions = 0;
+        size_t k = 0;
+        for (size_t i = 0; i < m; ++i) {
+            if (sm[i]) {
+                while (!tm[k]) ++k;
+                if (s[i] != t[k]) ++transpositions;
+                ++k;
+            }
+        }
+        const double jaro = (matches / static_cast<double>(m) + matches / 
static_cast<double>(n) +
+                             (matches - transpositions / 2.0) / matches) /
+                            3.0;
+        size_t prefix = 0;
+        const size_t max_prefix = std::min(static_cast<size_t>(4), std::min(m, 
n));
+        while (prefix < max_prefix && s[prefix] == t[prefix]) ++prefix;
+        res = jaro + static_cast<double>(prefix) * 0.1 * (1.0 - jaro);
+    }
+};
+
+struct NameJaccardSimilarity {
+    static constexpr auto name = "jaccard_similarity";
+};
+
+struct JaccardSimilarityOp {
+    using ResultDataType = DataTypeFloat64;
+    using ResultPaddedPODArray = PaddedPODArray<Float64>;
+
+    static void execute(const std::string_view& s, const std::string_view& t, 
double& res) {
+        if (s == t) {
+            res = 1.0;
+            return;
+        }
+        if (s.size() < 2 || t.size() < 2) {
+            res = 0.0;
+            return;
+        }
+        const size_t m = s.size(), n = t.size();

Review Comment:
   **[MEDIUM] Missing input length guard.** `JaccardSimilarityOp` has no input 
length validation. While its core algorithm is O(m log m + n log n) (less 
critical than Jaro-Winkler's quadratic), the heap path at line 539 
(`std::vector<uint16_t>(m - 1)`) allocates `2 * (m-1)` bytes untracked by 
MemTracker. For a STRING-type input near the ~2GB limit, this could attempt a 
~4GB heap allocation.
   
   For consistency with the other functions and to prevent OOM, consider adding 
a length guard:
   ```cpp
   constexpr size_t MAX_INPUT_LEN = 65535;
   if (m > MAX_INPUT_LEN || n > MAX_INPUT_LEN) {
       throw doris::Exception(ErrorCode::INVALID_ARGUMENT,
                              "Input string too long for jaccard_similarity, 
max {} bytes",
                              MAX_INPUT_LEN);
   }
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to