This is an automated email from the ASF dual-hosted git repository.

zhangstar333 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new e9500b3e9b2 [opt](fucntion) improve json_extract function handle const 
column (#36927)
e9500b3e9b2 is described below

commit e9500b3e9b2be42434bb0fb970e4bcb89774bd60
Author: zhangstar333 <87313068+zhangstar...@users.noreply.github.com>
AuthorDate: Wed Jul 3 20:19:16 2024 +0800

    [opt](fucntion) improve json_extract function handle const column (#36927)
    
    ## Proposed changes
    VARCHAR json_extract(VARCHAR json_str, VARCHAR path[, VARCHAR path] ...)
    ```
      for (int i = 0; i < arguments.size(); i++) {
           column_ptrs.push_back(
                   
block.get_by_position(arguments[i]).column->convert_to_full_column_if_const());
           data_columns.push_back(assert_cast<const 
ColumnString*>(column_ptrs.back().get()));
       }
    
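    ```
    
    Previously, const columns were not handled: because the input arguments
    are variadic, every argument was converted to a full column, as in the
    snippet above. The most common usage looks like
    json_extract(column, '$.fparam.nested_2'), so the two-argument case is now
    handled specially: when the path is a const column it is parsed only once,
    and the JSON document object is reused across rows.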
    
    ```
    mysql [test]>select count(json_extract(a, '$.fparam.nested_2')) from 
json_table_2;
    +---------------------------------------------+
    | count(json_extract(a, '$.fparam.nested_2')) |
    +---------------------------------------------+
    |                                    10000001 |
    +---------------------------------------------+
    1 row in set (1.06 sec)
    
    mysql [test]>select count(json_extract(a, '$.fparam.nested_2')) from 
json_table_2;
    +---------------------------------------------+
    | count(json_extract(a, '$.fparam.nested_2')) |
    +---------------------------------------------+
    |                                    10000001 |
    +---------------------------------------------+
    1 row in set (1.02 sec)
    
    mysql [test]>
    mysql [test]>
    mysql [test]>select count(json_extract(a, '$.fparam.nested_2')) from 
json_table_2;
    +---------------------------------------------+
    | count(json_extract(a, '$.fparam.nested_2')) |
    +---------------------------------------------+
    |                                    10000001 |
    +---------------------------------------------+
    1 row in set (44.22 sec)
    
    mysql [test]>select count(json_extract(a, '$.fparam.nested_2')) from 
json_table_2;
    +---------------------------------------------+
    | count(json_extract(a, '$.fparam.nested_2')) |
    +---------------------------------------------+
    |                                    10000001 |
    +---------------------------------------------+
    1 row in set (42.80 sec)
    
    ```
    
    <!--Describe your changes.-->
---
 be/src/vec/functions/function_json.cpp | 125 ++++++++++++++++++++++++++-------
 1 file changed, 98 insertions(+), 27 deletions(-)

diff --git a/be/src/vec/functions/function_json.cpp 
b/be/src/vec/functions/function_json.cpp
index e7c2fc1781d..2faeb24d514 100644
--- a/be/src/vec/functions/function_json.cpp
+++ b/be/src/vec/functions/function_json.cpp
@@ -826,44 +826,61 @@ struct FunctionJsonExtractImpl {
     static constexpr auto name = "json_extract";
 
     static rapidjson::Value parse_json(const ColumnString* json_col, const 
ColumnString* path_col,
-                                       rapidjson::Document::AllocatorType& 
allocator,
-                                       const int row) {
+                                       rapidjson::Document::AllocatorType& 
allocator, const int row,
+                                       const int col, std::vector<bool>& 
column_is_consts) {
         rapidjson::Value value;
         rapidjson::Document document;
 
-        const auto obj = json_col->get_data_at(row);
+        const auto obj = json_col->get_data_at(index_check_const(row, 
column_is_consts[0]));
         std::string_view json_string(obj.data, obj.size);
-        const auto path = path_col->get_data_at(row);
+        const auto path = path_col->get_data_at(index_check_const(row, 
column_is_consts[col]));
         std::string_view path_string(path.data, path.size);
-
-        auto root = get_json_object<JSON_FUN_STRING>(json_string, path_string, 
&document);
+        auto* root = get_json_object<JSON_FUN_STRING>(json_string, 
path_string, &document);
         if (root != nullptr) {
             value.CopyFrom(*root, allocator);
         }
         return value;
     }
 
+    static rapidjson::Value* get_document(const ColumnString* path_col,
+                                          rapidjson::Document* document,
+                                          std::vector<JsonPath>& parsed_paths, 
const int row,
+                                          bool is_const_column) {
+        const auto path = path_col->get_data_at(index_check_const(row, 
is_const_column));
+        std::string_view path_string(path.data, path.size);
+        //Cannot use '\' as the last character, return NULL
+        if (path_string.back() == '\\') {
+            document->SetNull();
+            return nullptr;
+        }
+
+#ifdef USE_LIBCPP
+        std::string s(path_string);
+        auto tok = get_json_token(s);
+#else
+        auto tok = get_json_token(path_string);
+#endif
+        // TODO: here maybe could use std::vector<std::string_view> or 
std::span
+        std::vector<std::string> paths(tok.begin(), tok.end());
+        get_parsed_paths(paths, &parsed_paths);
+        if (parsed_paths.empty()) {
+            return nullptr;
+        }
+        if (!(parsed_paths)[0].is_valid) {
+            return nullptr;
+        }
+        return document;
+    }
+
     static void execute(const std::vector<const ColumnString*>& data_columns,
-                        ColumnString& result_column, NullMap& null_map, size_t 
input_rows_count) {
+                        ColumnString& result_column, NullMap& null_map, size_t 
input_rows_count,
+                        std::vector<bool>& column_is_consts) {
         rapidjson::Document document;
         rapidjson::Document::AllocatorType& allocator = 
document.GetAllocator();
         rapidjson::StringBuffer buf;
         rapidjson::Writer<rapidjson::StringBuffer> writer(buf);
-
-        const auto json_col = data_columns[0];
-        for (size_t row = 0; row < input_rows_count; row++) {
-            rapidjson::Value value;
-            if (data_columns.size() == 2) {
-                value = parse_json(json_col, data_columns[1], allocator, row);
-            } else {
-                value.SetArray();
-                value.Reserve(data_columns.size() - 1, allocator);
-                for (size_t col = 1; col < data_columns.size(); ++col) {
-                    value.PushBack(parse_json(json_col, data_columns[col], 
allocator, row),
-                                   allocator);
-                }
-            }
-
+        const auto* json_col = data_columns[0];
+        auto insert_result_lambda = [&](rapidjson::Value& value, int row) {
             if (value.IsNull()) {
                 null_map[row] = 1;
                 result_column.insert_default();
@@ -874,6 +891,57 @@ struct FunctionJsonExtractImpl {
                 value.Accept(writer);
                 result_column.insert_data(buf.GetString(), buf.GetSize());
             }
+        };
+        if (data_columns.size() == 2) {
+            rapidjson::Value value;
+            if (column_is_consts[1]) {
+                std::vector<JsonPath> parsed_paths;
+                auto* root = get_document(data_columns[1], &document, 
parsed_paths, 0,
+                                          column_is_consts[1]);
+                for (size_t row = 0; row < input_rows_count; row++) {
+                    if (root != nullptr) {
+                        const auto& obj = json_col->get_data_at(row);
+                        std::string_view json_string(obj.data, obj.size);
+                        if (UNLIKELY((parsed_paths).size() == 1)) {
+                            document.SetString(json_string.data(), 
json_string.size(), allocator);
+                        }
+                        document.Parse(json_string.data(), json_string.size());
+                        if (UNLIKELY(document.HasParseError())) {
+                            null_map[row] = 1;
+                            result_column.insert_default();
+                            continue;
+                        }
+                        auto* root_val = match_value(parsed_paths, &document, 
allocator);
+                        if (root_val != nullptr) {
+                            value.CopyFrom(*root_val, allocator);
+                        } else {
+                            rapidjson::Value tmp;
+                            value.Swap(tmp);
+                        }
+                    }
+                    insert_result_lambda(value, row);
+                }
+            } else {
+                for (size_t row = 0; row < input_rows_count; row++) {
+                    value = parse_json(json_col, data_columns[1], allocator, 
row, 1,
+                                       column_is_consts);
+                    insert_result_lambda(value, row);
+                }
+            }
+
+        } else {
+            rapidjson::Value value;
+            value.SetArray();
+            value.Reserve(data_columns.size() - 1, allocator);
+            for (size_t row = 0; row < input_rows_count; row++) {
+                value.Clear();
+                for (size_t col = 1; col < data_columns.size(); ++col) {
+                    value.PushBack(parse_json(json_col, data_columns[col], 
allocator, row, col,
+                                              column_is_consts),
+                                   allocator);
+                }
+                insert_result_lambda(value, row);
+            }
         }
     }
 };
@@ -929,15 +997,18 @@ public:
                         size_t result, size_t input_rows_count) const override 
{
         auto result_column = ColumnString::create();
         auto null_map = ColumnUInt8::create(input_rows_count, 0);
-        std::vector<ColumnPtr> column_ptrs; // prevent converted column 
destruct
         std::vector<const ColumnString*> data_columns;
+        std::vector<bool> column_is_consts;
         for (int i = 0; i < arguments.size(); i++) {
-            column_ptrs.push_back(
-                    
block.get_by_position(arguments[i]).column->convert_to_full_column_if_const());
-            data_columns.push_back(assert_cast<const 
ColumnString*>(column_ptrs.back().get()));
+            ColumnPtr arg_col;
+            bool arg_const;
+            std::tie(arg_col, arg_const) =
+                    
unpack_if_const(block.get_by_position(arguments[i]).column);
+            column_is_consts.push_back(arg_const);
+            data_columns.push_back(assert_cast<const 
ColumnString*>(arg_col.get()));
         }
         Impl::execute(data_columns, 
*assert_cast<ColumnString*>(result_column.get()),
-                      null_map->get_data(), input_rows_count);
+                      null_map->get_data(), input_rows_count, 
column_is_consts);
         block.replace_by_position(
                 result, ColumnNullable::create(std::move(result_column), 
std::move(null_map)));
         return Status::OK();


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to