This is an automated email from the ASF dual-hosted git repository. xuyang pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 56b2fc43d4 [enhancement](array-type) shrink column suffix zero for type ARRAY<CHAR> (#12443) 56b2fc43d4 is described below commit 56b2fc43d4b5a91b693a9025e3dfc7a553e41781 Author: camby <104178...@qq.com> AuthorDate: Tue Sep 13 23:24:48 2022 +0800 [enhancement](array-type) shrink column suffix zero for type ARRAY<CHAR> (#12443) At the compute level, CHAR columns have their suffix zeros shrunk. To keep the behavior consistent with CHAR, we also shrink ARRAY<CHAR> and ARRAY<ARRAY<CHAR>> columns. Co-authored-by: cambyzju <zhuxiaol...@baidu.com> --- be/src/olap/rowset/segment_v2/segment_iterator.cpp | 13 ++++++++++--- be/src/olap/rowset/segment_v2/segment_iterator.h | 2 +- be/src/vec/columns/column.h | 8 ++++++++ be/src/vec/columns/column_array.cpp | 4 ++++ be/src/vec/columns/column_array.h | 3 +++ be/src/vec/columns/column_nullable.cpp | 5 +++++ be/src/vec/columns/column_nullable.h | 3 +++ be/src/vec/columns/column_string.cpp | 12 ++++++++++++ be/src/vec/columns/column_string.h | 12 ++---------- be/src/vec/core/block.cpp | 19 +++---------------- be/src/vec/core/block.h | 1 + .../data/load/insert/test_array_insert.out | Bin 1266 -> 1114 bytes .../data/load/insert/test_array_string_insert.out | Bin 397 -> 373 bytes .../array_functions/test_array_functions.out | 18 ++++++++++++++++++ .../array_functions/test_array_functions.groovy | 21 ++++++++++++--------- 15 files changed, 82 insertions(+), 39 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index a92cfa8fab..376545b067 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -872,9 +872,16 @@ void SegmentIterator::_vec_init_char_column_id() { auto cid = _schema.column_id(i); auto column_desc = _schema.column(cid); - if (column_desc->type() == OLAP_FIELD_TYPE_CHAR) { - _char_type_idx.emplace_back(i); - } + do { + if 
(column_desc->type() == OLAP_FIELD_TYPE_CHAR) { + _char_type_idx.emplace_back(i); + break; + } else if (column_desc->type() != OLAP_FIELD_TYPE_ARRAY) { + break; + } + // for Array<Char> or Array<Array<Char>> + column_desc = column_desc->get_sub_field(0); + } while (column_desc != nullptr); } } diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h index e57c43b597..2454167a33 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.h +++ b/be/src/olap/rowset/segment_v2/segment_iterator.h @@ -205,7 +205,7 @@ private: io::FileReaderSPtr _file_reader; - // char_type columns cid + // char_type or array<char> type columns cid std::vector<size_t> _char_type_idx; // number of rows read in the current batch diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h index 6d085fa7c7..3e4bc62cc0 100644 --- a/be/src/vec/columns/column.h +++ b/be/src/vec/columns/column.h @@ -124,6 +124,12 @@ public: return nullptr; } + // shrink the end zeros for CHAR type or ARRAY<CHAR> type + virtual MutablePtr get_shinked_column() { + LOG(FATAL) << "Cannot clone_resized() column " << get_name(); + return nullptr; + } + /// Returns number of values in column. virtual size_t size() const = 0; @@ -545,6 +551,8 @@ public: virtual bool is_column_dictionary() const { return false; } + virtual bool is_column_array() const { return false; } + /// If the only value column can contain is NULL. 
/// Does not imply type of object, because it can be ColumnNullable(ColumnNothing) or ColumnConst(ColumnNullable(ColumnNothing)) virtual bool only_null() const { return false; } diff --git a/be/src/vec/columns/column_array.cpp b/be/src/vec/columns/column_array.cpp index a589482c22..473cde0982 100644 --- a/be/src/vec/columns/column_array.cpp +++ b/be/src/vec/columns/column_array.cpp @@ -77,6 +77,10 @@ ColumnArray::ColumnArray(MutableColumnPtr&& nested_column) : data(std::move(nest offsets = ColumnOffsets::create(); } +MutableColumnPtr ColumnArray::get_shinked_column() { + return ColumnArray::create(data->get_shinked_column(), offsets->assume_mutable()); +} + std::string ColumnArray::get_name() const { return "Array(" + get_data().get_name() + ")"; } diff --git a/be/src/vec/columns/column_array.h b/be/src/vec/columns/column_array.h index d3bcb924f5..2f8df6d83d 100644 --- a/be/src/vec/columns/column_array.h +++ b/be/src/vec/columns/column_array.h @@ -78,11 +78,14 @@ public: return Base::create(std::forward<Args>(args)...); } + MutableColumnPtr get_shinked_column() override; + /** On the index i there is an offset to the beginning of the i + 1 -th element. 
*/ using ColumnOffsets = ColumnVector<Offset64>; std::string get_name() const override; const char* get_family_name() const override { return "Array"; } + bool is_column_array() const override { return true; } bool can_be_inside_nullable() const override { return true; } TypeIndex get_data_type() const { return TypeIndex::Array; } MutableColumnPtr clone_resized(size_t size) const override; diff --git a/be/src/vec/columns/column_nullable.cpp b/be/src/vec/columns/column_nullable.cpp index 29b1887421..b6bf95f449 100644 --- a/be/src/vec/columns/column_nullable.cpp +++ b/be/src/vec/columns/column_nullable.cpp @@ -45,6 +45,11 @@ ColumnNullable::ColumnNullable(MutableColumnPtr&& nested_column_, MutableColumnP } } +MutableColumnPtr ColumnNullable::get_shinked_column() { + return ColumnNullable::create(get_nested_column_ptr()->get_shinked_column(), + get_null_map_column_ptr()); +} + void ColumnNullable::update_hash_with_value(size_t n, SipHash& hash) const { if (is_null_at(n)) hash.update(0); diff --git a/be/src/vec/columns/column_nullable.h b/be/src/vec/columns/column_nullable.h index 523ff337ae..cb399ab761 100644 --- a/be/src/vec/columns/column_nullable.h +++ b/be/src/vec/columns/column_nullable.h @@ -67,6 +67,8 @@ public: return Base::create(std::forward<Args>(args)...); } + MutableColumnPtr get_shinked_column() override; + const char* get_family_name() const override { return "Nullable"; } std::string get_name() const override { return "Nullable(" + nested_column->get_name() + ")"; } MutableColumnPtr clone_resized(size_t size) const override; @@ -199,6 +201,7 @@ public: bool is_bitmap() const override { return get_nested_column().is_bitmap(); } bool is_column_decimal() const override { return get_nested_column().is_column_decimal(); } bool is_column_string() const override { return get_nested_column().is_column_string(); } + bool is_column_array() const override { return get_nested_column().is_column_array(); } bool is_fixed_and_contiguous() const override { return 
false; } bool values_have_fixed_size() const override { return nested_column->values_have_fixed_size(); } size_t size_of_value_if_fixed() const override { diff --git a/be/src/vec/columns/column_string.cpp b/be/src/vec/columns/column_string.cpp index 20f0d3c534..ccc1074e29 100644 --- a/be/src/vec/columns/column_string.cpp +++ b/be/src/vec/columns/column_string.cpp @@ -64,6 +64,18 @@ MutableColumnPtr ColumnString::clone_resized(size_t to_size) const { return res; } +MutableColumnPtr ColumnString::get_shinked_column() { + auto shrinked_column = ColumnString::create(); + shrinked_column->get_offsets().reserve(offsets.size()); + shrinked_column->get_chars().reserve(chars.size()); + for (int i = 0; i < size(); i++) { + StringRef str = get_data_at(i); + reinterpret_cast<ColumnString*>(shrinked_column.get()) + ->insert_data(str.data, strnlen(str.data, str.size)); + } + return shrinked_column; +} + void ColumnString::insert_range_from(const IColumn& src, size_t start, size_t length) { if (length == 0) return; diff --git a/be/src/vec/columns/column_string.h b/be/src/vec/columns/column_string.h index ee8e6cfdd3..2d972ac980 100644 --- a/be/src/vec/columns/column_string.h +++ b/be/src/vec/columns/column_string.h @@ -84,6 +84,8 @@ public: MutableColumnPtr clone_resized(size_t to_size) const override; + MutableColumnPtr get_shinked_column() override; + Field operator[](size_t n) const override { assert(n < size()); return Field(&chars[offset_at(n)], size_at(n) - 1); @@ -374,16 +376,6 @@ public: chars.emplace_back(0); } - - MutableColumnPtr get_shinked_column() const { - auto shrinked_column = ColumnString::create(); - for (int i = 0; i < size(); i++) { - StringRef str = get_data_at(i); - reinterpret_cast<ColumnString*>(shrinked_column.get()) - ->insert_data(str.data, strnlen(str.data, str.size)); - } - return shrinked_column; - } }; } // namespace doris::vectorized diff --git a/be/src/vec/core/block.cpp b/be/src/vec/core/block.cpp index cefa2b0b39..af90143955 100644 --- 
a/be/src/vec/core/block.cpp +++ b/be/src/vec/core/block.cpp @@ -1063,25 +1063,12 @@ std::unique_ptr<Block> Block::create_same_struct_block(size_t size) const { void Block::shrink_char_type_column_suffix_zero(const std::vector<size_t>& char_type_idx) { for (auto idx : char_type_idx) { if (idx < data.size()) { - if (this->get_by_position(idx).column->is_nullable()) { - this->get_by_position(idx).column = ColumnNullable::create( - reinterpret_cast<const ColumnString*>( - reinterpret_cast<const ColumnNullable*>( - this->get_by_position(idx).column.get()) - ->get_nested_column_ptr() - .get()) - ->get_shinked_column(), - reinterpret_cast<const ColumnNullable*>( - this->get_by_position(idx).column.get()) - ->get_null_map_column_ptr()); - } else { - this->get_by_position(idx).column = reinterpret_cast<const ColumnString*>( - this->get_by_position(idx).column.get()) - ->get_shinked_column(); - } + auto& col_and_name = this->get_by_position(idx); + col_and_name.column = col_and_name.column->assume_mutable()->get_shinked_column(); } } } + size_t MutableBlock::allocated_bytes() const { size_t res = 0; for (const auto& col : _columns) { diff --git a/be/src/vec/core/block.h b/be/src/vec/core/block.h index aa603a1800..75813df153 100644 --- a/be/src/vec/core/block.h +++ b/be/src/vec/core/block.h @@ -351,6 +351,7 @@ public: doris::Tuple* deep_copy_tuple(const TupleDescriptor&, MemPool*, int, int, bool padding_char = false); + // for String type or Array<String> type void shrink_char_type_column_suffix_zero(const std::vector<size_t>& char_type_idx); int64_t get_decompress_time() const { return _decompress_time_ns; } diff --git a/regression-test/data/load/insert/test_array_insert.out b/regression-test/data/load/insert/test_array_insert.out index 18f66e59fe..2bb7e044a1 100644 Binary files a/regression-test/data/load/insert/test_array_insert.out and b/regression-test/data/load/insert/test_array_insert.out differ diff --git a/regression-test/data/load/insert/test_array_string_insert.out 
b/regression-test/data/load/insert/test_array_string_insert.out index ff69c931ae..7a31b4210e 100644 Binary files a/regression-test/data/load/insert/test_array_string_insert.out and b/regression-test/data/load/insert/test_array_string_insert.out differ diff --git a/regression-test/data/query_p0/sql_functions/array_functions/test_array_functions.out b/regression-test/data/query_p0/sql_functions/array_functions/test_array_functions.out index 518865780d..2a2f2c933b 100644 --- a/regression-test/data/query_p0/sql_functions/array_functions/test_array_functions.out +++ b/regression-test/data/query_p0/sql_functions/array_functions/test_array_functions.out @@ -116,3 +116,21 @@ 6 1_2_3_4_5_4_3_2_1 a-b-c-d-c-b-a 7 8_9_null_10_null f-null-g-null-h +-- !select -- +1 true +2 false +3 false +4 \N +5 \N +6 \N +7 \N + +-- !select -- +1 false +2 false +3 false +4 \N +5 \N +6 \N +7 \N + diff --git a/regression-test/suites/query_p0/sql_functions/array_functions/test_array_functions.groovy b/regression-test/suites/query_p0/sql_functions/array_functions/test_array_functions.groovy index c374a43e51..a4f6c60dd4 100644 --- a/regression-test/suites/query_p0/sql_functions/array_functions/test_array_functions.groovy +++ b/regression-test/suites/query_p0/sql_functions/array_functions/test_array_functions.groovy @@ -23,12 +23,13 @@ suite("test_array_functions") { sql """ set enable_vectorized_engine = true """ sql """DROP TABLE IF EXISTS ${tableName}""" - sql """ + sql """ CREATE TABLE IF NOT EXISTS ${tableName} ( `k1` int(11) NULL COMMENT "", `k2` ARRAY<int(11)> NOT NULL COMMENT "", `k3` ARRAY<VARCHAR(20)> NULL COMMENT "", - `k4` ARRAY<int(11)> NULL COMMENT "" + `k4` ARRAY<int(11)> NULL COMMENT "", + `k5` ARRAY<CHAR(5)> NULL COMMENT "" ) ENGINE=OLAP DUPLICATE KEY(`k1`) DISTRIBUTED BY HASH(`k1`) BUCKETS 1 @@ -37,13 +38,13 @@ suite("test_array_functions") { "storage_format" = "V2" ) """ - sql """ INSERT INTO ${tableName} VALUES(1, [1, 2, 3], ["a", "b", ""], [1, 2]) """ - sql """ INSERT INTO 
${tableName} VALUES(2, [4], NULL, [5]) """ - sql """ INSERT INTO ${tableName} VALUES(3, [], [], NULL) """ - sql """ INSERT INTO ${tableName} VALUES(4, [1, 2, 3, 4, 5, 4, 3, 2, 1], [], []) """ - sql """ INSERT INTO ${tableName} VALUES(5, [], ["a", "b", "c", "d", "c", "b", "a"], NULL) """ - sql """ INSERT INTO ${tableName} VALUES(6, [1, 2, 3, 4, 5, 4, 3, 2, 1], ["a", "b", "c", "d", "c", "b", "a"], NULL) """ - sql """ INSERT INTO ${tableName} VALUES(7, [8, 9, NULL, 10, NULL], ["f", NULL, "g", NULL, "h"], NULL) """ + sql """ INSERT INTO ${tableName} VALUES(1,[1,2,3],["a","b",""],[1,2],["hi"]) """ + sql """ INSERT INTO ${tableName} VALUES(2,[4],NULL,[5],["hi2"]) """ + sql """ INSERT INTO ${tableName} VALUES(3,[],[],NULL,["hi3"]) """ + sql """ INSERT INTO ${tableName} VALUES(4,[1,2,3,4,5,4,3,2,1],[],[],NULL) """ + sql """ INSERT INTO ${tableName} VALUES(5,[],["a","b","c","d","c","b","a"],NULL,NULL) """ + sql """ INSERT INTO ${tableName} VALUES(6,[1,2,3,4,5,4,3,2,1],["a","b","c","d","c","b","a"],NULL,NULL) """ + sql """ INSERT INTO ${tableName} VALUES(7,[8,9,NULL,10,NULL],["f",NULL,"g",NULL,"h"],NULL,NULL) """ qt_select "SELECT k1, size(k2), size(k3) FROM ${tableName} ORDER BY k1" qt_select "SELECT k1, cardinality(k2), cardinality(k3) FROM ${tableName} ORDER BY k1" @@ -58,4 +59,6 @@ suite("test_array_functions") { qt_select "SELECT k1, array_slice(k2, 1, 2) FROM ${tableName} ORDER BY k1" qt_select "SELECT k1, reverse(k2), reverse(k3), reverse(k4) FROM ${tableName} ORDER BY k1" qt_select "SELECT k1, array_join(k2, '_', 'null'), array_join(k3, '-', 'null') FROM ${tableName} ORDER BY k1" + qt_select "SELECT k1, array_contains(k5, 'hi') FROM ${tableName} ORDER BY k1" + qt_select "SELECT k1, array_contains(k5, 'hi222') FROM ${tableName} ORDER BY k1" } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org