Yukang-Lian commented on code in PR #13741:
URL: https://github.com/apache/doris/pull/13741#discussion_r1006821953


##########
be/src/vec/functions/function_string.h:
##########
@@ -1159,6 +1166,129 @@ class FunctionSplitPart : public IFunction {
     }
 };
 
+
+class FunctionSplitByChar : public IFunction {
+
+private:
+    void getOffsetsAndLen(const std::string& s, const std::string& c, 
std::vector<int>& v_offset, std::vector<int>& v_charlen) {
+        /**
+         * 
+         * s : string need to be split
+         * c : delimiter_string
+         * v_offset  : each word splited offset in string
+         * v_charlen : each word length in string
+        */
+        char delimiter_char = c[0];
+        int32_t pos = 0;
+           int32_t pos_start = 0;
+           int32_t pos_end = 0;

Review Comment:
   Please reformat these two lines so that their indentation matches the surrounding declarations.



##########
be/src/vec/functions/function_string.h:
##########
@@ -1159,6 +1166,129 @@ class FunctionSplitPart : public IFunction {
     }
 };
 
+
+class FunctionSplitByChar : public IFunction {
+
+private:
+    void getOffsetsAndLen(const std::string& s, const std::string& c, 
std::vector<int>& v_offset, std::vector<int>& v_charlen) {
+        /**
+         * 
+         * s : string need to be split
+         * c : delimiter_string
+         * v_offset  : each word splited offset in string
+         * v_charlen : each word length in string
+        */
+        char delimiter_char = c[0];
+        int32_t pos = 0;
+           int32_t pos_start = 0;
+           int32_t pos_end = 0;
+        int32_t len = s.size();
+        bool flag = true;
+
+           while (flag) {
+                   while (pos < len && s[pos] == delimiter_char) {
+                           pos++;
+                if (pos >= len - 1) {
+                    flag = false;
+                }
+            }
+
+            if (!flag || pos >= len) {
+                break;
+            }
+            pos_start = pos;
+            v_offset.emplace_back(pos_start);
+            while (pos < len && s[pos] != delimiter_char) {
+                pos++;
+            }
+            pos_end = pos;
+            v_charlen.emplace_back(pos_end - pos_start);
+        }
+    }
+public:
+    static constexpr auto name = "split_by_char";
+    static FunctionPtr create() { return 
std::make_shared<FunctionSplitByChar>(); }
+    String get_name() const override { return name; }
+    size_t get_number_of_arguments() const override { return 2; }
+
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const 
override {
+        return 
std::make_shared<DataTypeArray>(make_nullable(std::make_shared<DataTypeString>()));
+    }
+
+    bool use_default_implementation_for_nulls() const override { return false; 
}
+    bool use_default_implementation_for_constants() const override { return 
true; }
+
+    Status execute_impl(FunctionContext* context, Block& block, const 
ColumnNumbers& arguments,
+                        size_t result, size_t input_rows_count) override {
+        DCHECK_EQ(arguments.size(), 2);
+
+        auto null_map = ColumnUInt8::create(input_rows_count, 0);
+        //auto const_null_map = ColumnUInt8::create(input_rows_count, 0);

Review Comment:
   Please remove the redundant commented-out code.



##########
be/src/vec/functions/function_string.h:
##########
@@ -1159,6 +1166,129 @@ class FunctionSplitPart : public IFunction {
     }
 };
 
+
+class FunctionSplitByChar : public IFunction {
+
+private:
+    void getOffsetsAndLen(const std::string& s, const std::string& c, 
std::vector<int>& v_offset, std::vector<int>& v_charlen) {
+        /**
+         * 
+         * s : string need to be split
+         * c : delimiter_string
+         * v_offset  : each word splited offset in string
+         * v_charlen : each word length in string
+        */
+        char delimiter_char = c[0];
+        int32_t pos = 0;
+           int32_t pos_start = 0;
+           int32_t pos_end = 0;
+        int32_t len = s.size();
+        bool flag = true;
+
+           while (flag) {
+                   while (pos < len && s[pos] == delimiter_char) {
+                           pos++;
+                if (pos >= len - 1) {
+                    flag = false;
+                }
+            }
+
+            if (!flag || pos >= len) {
+                break;
+            }
+            pos_start = pos;
+            v_offset.emplace_back(pos_start);
+            while (pos < len && s[pos] != delimiter_char) {
+                pos++;
+            }
+            pos_end = pos;
+            v_charlen.emplace_back(pos_end - pos_start);
+        }
+    }
+public:
+    static constexpr auto name = "split_by_char";
+    static FunctionPtr create() { return 
std::make_shared<FunctionSplitByChar>(); }
+    String get_name() const override { return name; }
+    size_t get_number_of_arguments() const override { return 2; }
+
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const 
override {
+        return 
std::make_shared<DataTypeArray>(make_nullable(std::make_shared<DataTypeString>()));
+    }
+
+    bool use_default_implementation_for_nulls() const override { return false; 
}
+    bool use_default_implementation_for_constants() const override { return 
true; }
+
+    Status execute_impl(FunctionContext* context, Block& block, const 
ColumnNumbers& arguments,
+                        size_t result, size_t input_rows_count) override {
+        DCHECK_EQ(arguments.size(), 2);
+
+        auto null_map = ColumnUInt8::create(input_rows_count, 0);
+        //auto const_null_map = ColumnUInt8::create(input_rows_count, 0);
+        auto col_res = ColumnArray::create(ColumnString::create());
+
+        auto& res_data = typeid_cast<ColumnString &>(col_res->get_data());
+        auto& res_offsets = col_res->get_offsets();
+
+        auto& res_data_chars = res_data.get_chars();
+        auto& res_data_offsets = res_data.get_offsets();
+
+        //auto& null_map_data = null_map->get_data();
+

Review Comment:
   Please remove this redundant commented-out code, too.



##########
be/src/vec/functions/function_string.h:
##########
@@ -1159,6 +1166,129 @@ class FunctionSplitPart : public IFunction {
     }
 };
 
+
+class FunctionSplitByChar : public IFunction {
+
+private:
+    void getOffsetsAndLen(const std::string& s, const std::string& c, 
std::vector<int>& v_offset, std::vector<int>& v_charlen) {
+        /**
+         * 
+         * s : string need to be split
+         * c : delimiter_string
+         * v_offset  : each word splited offset in string
+         * v_charlen : each word length in string
+        */
+        char delimiter_char = c[0];
+        int32_t pos = 0;
+           int32_t pos_start = 0;
+           int32_t pos_end = 0;
+        int32_t len = s.size();
+        bool flag = true;
+
+           while (flag) {
+                   while (pos < len && s[pos] == delimiter_char) {
+                           pos++;
+                if (pos >= len - 1) {
+                    flag = false;
+                }
+            }
+
+            if (!flag || pos >= len) {
+                break;
+            }
+            pos_start = pos;
+            v_offset.emplace_back(pos_start);
+            while (pos < len && s[pos] != delimiter_char) {
+                pos++;
+            }
+            pos_end = pos;
+            v_charlen.emplace_back(pos_end - pos_start);
+        }
+    }
+public:
+    static constexpr auto name = "split_by_char";
+    static FunctionPtr create() { return 
std::make_shared<FunctionSplitByChar>(); }
+    String get_name() const override { return name; }
+    size_t get_number_of_arguments() const override { return 2; }
+
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const 
override {
+        return 
std::make_shared<DataTypeArray>(make_nullable(std::make_shared<DataTypeString>()));
+    }
+
+    bool use_default_implementation_for_nulls() const override { return false; 
}
+    bool use_default_implementation_for_constants() const override { return 
true; }
+
+    Status execute_impl(FunctionContext* context, Block& block, const 
ColumnNumbers& arguments,
+                        size_t result, size_t input_rows_count) override {
+        DCHECK_EQ(arguments.size(), 2);
+
+        auto null_map = ColumnUInt8::create(input_rows_count, 0);
+        //auto const_null_map = ColumnUInt8::create(input_rows_count, 0);
+        auto col_res = ColumnArray::create(ColumnString::create());
+
+        auto& res_data = typeid_cast<ColumnString &>(col_res->get_data());
+        auto& res_offsets = col_res->get_offsets();
+
+        auto& res_data_chars = res_data.get_chars();
+        auto& res_data_offsets = res_data.get_offsets();
+
+        //auto& null_map_data = null_map->get_data();
+
+        res_data_offsets.resize(input_rows_count);
+
+        /**
+         * 获得 argument参数(列数据),并存入argument_columns数组中,[0]为str,[1]为delimiter
+        */
+        size_t argument_size = arguments.size();
+        ColumnPtr argument_columns[argument_size];
+        for (size_t i = 0; i < argument_size; ++i) {
+            argument_columns[i] = 
block.get_by_position(arguments[i]).column->convert_to_full_column_if_const();
+            if (auto* nullable = check_and_get_column<const 
ColumnNullable>(*argument_columns[i])) {
+                // Danger: Here must dispose the null map data first! Because
+                // argument_columns[i]=nullable->get_nested_column_ptr(); will 
release the mem
+                // of column nullable mem of null map
+                VectorizedUtils::update_null_map(null_map->get_data(), 
nullable->get_null_map_data());
+                argument_columns[i] = nullable->get_nested_column_ptr();
+            }
+        }
+        auto str_col = assert_cast<const 
ColumnString*>(argument_columns[0].get());
+        auto delimiter_col = assert_cast<const 
ColumnString*>(argument_columns[1].get());
+        
+        /**
+         * 取出列元素中的每一行(delimiter,str),并且进行相关的操作
+        */
+        for (size_t i = 0; i < input_rows_count; ++i) {    
+            auto delimiter = delimiter_col->get_data_at(i);
+            auto delimiter_str = delimiter_col->get_data_at(i).to_string();
+            auto str = str_col->get_data_at(i);
+            auto str_str = str_col->get_data_at(i).to_string();
+            if (delimiter.size == 0) {
+                res_data_offsets[i] = res_data_chars.size();
+            } else if (delimiter.size == 1) {
+                std::vector<int> v_offset;
+                std::vector<int> v_charlen;
+                getOffsetsAndLen(str_col->get_data_at(i).to_string(), 
delimiter_str, v_offset, v_charlen);
+                for (size_t i = 0; i < v_offset.size(); i++) {
+                    StringOP::push_value_string1(
+                            std::string_view {
+                                    reinterpret_cast<const char*>(str.data + 
v_offset[i] + 1),
+                                    (size_t)v_charlen[i] - 1},
+                            i, res_data_chars, res_data_offsets);
+                    //res_data_offsets.emplace_back(v_charlen[i]);
+                }
+                res_offsets.emplace_back(v_offset.size()); 
+
+            }
+             
+        }
+        //block.replace_by_position(result, std::move(col_res));

Review Comment:
   Please remove this redundant commented-out code, too.



##########
be/src/vec/functions/function_string.h:
##########
@@ -1159,6 +1166,129 @@ class FunctionSplitPart : public IFunction {
     }
 };
 
+
+class FunctionSplitByChar : public IFunction {
+
+private:
+    void getOffsetsAndLen(const std::string& s, const std::string& c, 
std::vector<int>& v_offset, std::vector<int>& v_charlen) {
+        /**
+         * 
+         * s : string need to be split
+         * c : delimiter_string
+         * v_offset  : each word splited offset in string
+         * v_charlen : each word length in string
+        */
+        char delimiter_char = c[0];
+        int32_t pos = 0;
+           int32_t pos_start = 0;
+           int32_t pos_end = 0;
+        int32_t len = s.size();
+        bool flag = true;
+
+           while (flag) {
+                   while (pos < len && s[pos] == delimiter_char) {
+                           pos++;
+                if (pos >= len - 1) {
+                    flag = false;
+                }
+            }
+
+            if (!flag || pos >= len) {
+                break;
+            }
+            pos_start = pos;
+            v_offset.emplace_back(pos_start);
+            while (pos < len && s[pos] != delimiter_char) {
+                pos++;
+            }
+            pos_end = pos;
+            v_charlen.emplace_back(pos_end - pos_start);
+        }
+    }
+public:
+    static constexpr auto name = "split_by_char";
+    static FunctionPtr create() { return 
std::make_shared<FunctionSplitByChar>(); }
+    String get_name() const override { return name; }
+    size_t get_number_of_arguments() const override { return 2; }
+
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const 
override {
+        return 
std::make_shared<DataTypeArray>(make_nullable(std::make_shared<DataTypeString>()));
+    }
+
+    bool use_default_implementation_for_nulls() const override { return false; 
}
+    bool use_default_implementation_for_constants() const override { return 
true; }
+
+    Status execute_impl(FunctionContext* context, Block& block, const 
ColumnNumbers& arguments,
+                        size_t result, size_t input_rows_count) override {
+        DCHECK_EQ(arguments.size(), 2);
+
+        auto null_map = ColumnUInt8::create(input_rows_count, 0);
+        //auto const_null_map = ColumnUInt8::create(input_rows_count, 0);
+        auto col_res = ColumnArray::create(ColumnString::create());
+
+        auto& res_data = typeid_cast<ColumnString &>(col_res->get_data());
+        auto& res_offsets = col_res->get_offsets();
+
+        auto& res_data_chars = res_data.get_chars();
+        auto& res_data_offsets = res_data.get_offsets();
+
+        //auto& null_map_data = null_map->get_data();
+
+        res_data_offsets.resize(input_rows_count);
+
+        /**
+         * 获得 argument参数(列数据),并存入argument_columns数组中,[0]为str,[1]为delimiter
+        */
+        size_t argument_size = arguments.size();
+        ColumnPtr argument_columns[argument_size];
+        for (size_t i = 0; i < argument_size; ++i) {
+            argument_columns[i] = 
block.get_by_position(arguments[i]).column->convert_to_full_column_if_const();
+            if (auto* nullable = check_and_get_column<const 
ColumnNullable>(*argument_columns[i])) {
+                // Danger: Here must dispose the null map data first! Because
+                // argument_columns[i]=nullable->get_nested_column_ptr(); will 
release the mem
+                // of column nullable mem of null map
+                VectorizedUtils::update_null_map(null_map->get_data(), 
nullable->get_null_map_data());
+                argument_columns[i] = nullable->get_nested_column_ptr();
+            }
+        }
+        auto str_col = assert_cast<const 
ColumnString*>(argument_columns[0].get());
+        auto delimiter_col = assert_cast<const 
ColumnString*>(argument_columns[1].get());
+        
+        /**
+         * 取出列元素中的每一行(delimiter,str),并且进行相关的操作
+        */
+        for (size_t i = 0; i < input_rows_count; ++i) {    
+            auto delimiter = delimiter_col->get_data_at(i);
+            auto delimiter_str = delimiter_col->get_data_at(i).to_string();
+            auto str = str_col->get_data_at(i);
+            auto str_str = str_col->get_data_at(i).to_string();
+            if (delimiter.size == 0) {
+                res_data_offsets[i] = res_data_chars.size();
+            } else if (delimiter.size == 1) {
+                std::vector<int> v_offset;
+                std::vector<int> v_charlen;
+                getOffsetsAndLen(str_col->get_data_at(i).to_string(), 
delimiter_str, v_offset, v_charlen);
+                for (size_t i = 0; i < v_offset.size(); i++) {
+                    StringOP::push_value_string1(
+                            std::string_view {
+                                    reinterpret_cast<const char*>(str.data + 
v_offset[i] + 1),
+                                    (size_t)v_charlen[i] - 1},
+                            i, res_data_chars, res_data_offsets);
+                    //res_data_offsets.emplace_back(v_charlen[i]);

Review Comment:
   Please remove this redundant commented-out code, too.



##########
be/src/vec/functions/function_string.h:
##########
@@ -1159,6 +1166,129 @@ class FunctionSplitPart : public IFunction {
     }
 };
 
+
+class FunctionSplitByChar : public IFunction {
+
+private:
+    void getOffsetsAndLen(const std::string& s, const std::string& c, 
std::vector<int>& v_offset, std::vector<int>& v_charlen) {
+        /**
+         * 
+         * s : string need to be split
+         * c : delimiter_string
+         * v_offset  : each word splited offset in string
+         * v_charlen : each word length in string
+        */
+        char delimiter_char = c[0];
+        int32_t pos = 0;
+           int32_t pos_start = 0;
+           int32_t pos_end = 0;
+        int32_t len = s.size();
+        bool flag = true;
+
+           while (flag) {
+                   while (pos < len && s[pos] == delimiter_char) {
+                           pos++;
+                if (pos >= len - 1) {
+                    flag = false;
+                }
+            }
+
+            if (!flag || pos >= len) {
+                break;
+            }
+            pos_start = pos;
+            v_offset.emplace_back(pos_start);
+            while (pos < len && s[pos] != delimiter_char) {
+                pos++;
+            }
+            pos_end = pos;
+            v_charlen.emplace_back(pos_end - pos_start);
+        }
+    }
+public:
+    static constexpr auto name = "split_by_char";
+    static FunctionPtr create() { return 
std::make_shared<FunctionSplitByChar>(); }
+    String get_name() const override { return name; }
+    size_t get_number_of_arguments() const override { return 2; }
+
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const 
override {
+        return 
std::make_shared<DataTypeArray>(make_nullable(std::make_shared<DataTypeString>()));
+    }
+
+    bool use_default_implementation_for_nulls() const override { return false; 
}
+    bool use_default_implementation_for_constants() const override { return 
true; }
+
+    Status execute_impl(FunctionContext* context, Block& block, const 
ColumnNumbers& arguments,
+                        size_t result, size_t input_rows_count) override {
+        DCHECK_EQ(arguments.size(), 2);
+
+        auto null_map = ColumnUInt8::create(input_rows_count, 0);
+        //auto const_null_map = ColumnUInt8::create(input_rows_count, 0);
+        auto col_res = ColumnArray::create(ColumnString::create());
+
+        auto& res_data = typeid_cast<ColumnString &>(col_res->get_data());
+        auto& res_offsets = col_res->get_offsets();
+
+        auto& res_data_chars = res_data.get_chars();
+        auto& res_data_offsets = res_data.get_offsets();
+
+        //auto& null_map_data = null_map->get_data();
+
+        res_data_offsets.resize(input_rows_count);
+
+        /**
+         * 获得 argument参数(列数据),并存入argument_columns数组中,[0]为str,[1]为delimiter
+        */

Review Comment:
   Please do not use Chinese in code comments.



##########
be/src/vec/functions/function_string.h:
##########
@@ -1159,6 +1166,129 @@ class FunctionSplitPart : public IFunction {
     }
 };
 
+
+class FunctionSplitByChar : public IFunction {
+
+private:
+    void getOffsetsAndLen(const std::string& s, const std::string& c, 
std::vector<int>& v_offset, std::vector<int>& v_charlen) {
+        /**
+         * 
+         * s : string need to be split
+         * c : delimiter_string
+         * v_offset  : each word splited offset in string
+         * v_charlen : each word length in string
+        */
+        char delimiter_char = c[0];
+        int32_t pos = 0;
+           int32_t pos_start = 0;
+           int32_t pos_end = 0;
+        int32_t len = s.size();
+        bool flag = true;
+
+           while (flag) {
+                   while (pos < len && s[pos] == delimiter_char) {
+                           pos++;
+                if (pos >= len - 1) {
+                    flag = false;
+                }
+            }
+
+            if (!flag || pos >= len) {
+                break;
+            }
+            pos_start = pos;
+            v_offset.emplace_back(pos_start);
+            while (pos < len && s[pos] != delimiter_char) {
+                pos++;
+            }
+            pos_end = pos;
+            v_charlen.emplace_back(pos_end - pos_start);
+        }
+    }
+public:
+    static constexpr auto name = "split_by_char";
+    static FunctionPtr create() { return 
std::make_shared<FunctionSplitByChar>(); }
+    String get_name() const override { return name; }
+    size_t get_number_of_arguments() const override { return 2; }
+
+    DataTypePtr get_return_type_impl(const DataTypes& arguments) const 
override {
+        return 
std::make_shared<DataTypeArray>(make_nullable(std::make_shared<DataTypeString>()));
+    }
+
+    bool use_default_implementation_for_nulls() const override { return false; 
}
+    bool use_default_implementation_for_constants() const override { return 
true; }
+
+    Status execute_impl(FunctionContext* context, Block& block, const 
ColumnNumbers& arguments,
+                        size_t result, size_t input_rows_count) override {
+        DCHECK_EQ(arguments.size(), 2);
+
+        auto null_map = ColumnUInt8::create(input_rows_count, 0);
+        //auto const_null_map = ColumnUInt8::create(input_rows_count, 0);
+        auto col_res = ColumnArray::create(ColumnString::create());
+
+        auto& res_data = typeid_cast<ColumnString &>(col_res->get_data());
+        auto& res_offsets = col_res->get_offsets();
+
+        auto& res_data_chars = res_data.get_chars();
+        auto& res_data_offsets = res_data.get_offsets();
+
+        //auto& null_map_data = null_map->get_data();
+
+        res_data_offsets.resize(input_rows_count);
+
+        /**
+         * 获得 argument参数(列数据),并存入argument_columns数组中,[0]为str,[1]为delimiter
+        */
+        size_t argument_size = arguments.size();
+        ColumnPtr argument_columns[argument_size];
+        for (size_t i = 0; i < argument_size; ++i) {
+            argument_columns[i] = 
block.get_by_position(arguments[i]).column->convert_to_full_column_if_const();
+            if (auto* nullable = check_and_get_column<const 
ColumnNullable>(*argument_columns[i])) {
+                // Danger: Here must dispose the null map data first! Because
+                // argument_columns[i]=nullable->get_nested_column_ptr(); will 
release the mem
+                // of column nullable mem of null map
+                VectorizedUtils::update_null_map(null_map->get_data(), 
nullable->get_null_map_data());
+                argument_columns[i] = nullable->get_nested_column_ptr();
+            }
+        }
+        auto str_col = assert_cast<const 
ColumnString*>(argument_columns[0].get());
+        auto delimiter_col = assert_cast<const 
ColumnString*>(argument_columns[1].get());
+        
+        /**
+         * 取出列元素中的每一行(delimiter,str),并且进行相关的操作
+        */

Review Comment:
   Please do not use Chinese here, too.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to