This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch branch-1.2-unstable in repository https://gitbox.apache.org/repos/asf/doris.git
commit f4b4e603490aaeb28635e532a0de51e03965ade3 Author: Pxl <pxl...@qq.com> AuthorDate: Tue Nov 8 10:59:35 2022 +0800 [Enhancement](Dictionary-codec) update dict once on same segment (#13936) update dict once on same segment --- be/src/olap/in_list_predicate.h | 17 ++++++++++++++--- be/src/olap/rowset/beta_rowset.cpp | 9 ++++++--- be/src/olap/rowset/segment_v2/segment.cpp | 11 +++++++---- be/src/olap/rowset/segment_v2/segment.h | 7 +++++-- be/src/olap/rowset/segment_v2/segment_iterator.cpp | 2 ++ be/src/vec/columns/column.h | 6 ++++++ be/src/vec/columns/column_dictionary.h | 9 +++++++++ be/test/io/cache/remote_file_cache_test.cpp | 2 +- be/test/olap/rowset/segment_v2/segment_test.cpp | 4 ++-- be/test/tools/benchmark_tool.cpp | 2 +- 10 files changed, 53 insertions(+), 16 deletions(-) diff --git a/be/src/olap/in_list_predicate.h b/be/src/olap/in_list_predicate.h index 32b0dc2fb0..b85deeb39c 100644 --- a/be/src/olap/in_list_predicate.h +++ b/be/src/olap/in_list_predicate.h @@ -20,11 +20,13 @@ #include <parallel_hashmap/phmap.h> #include <stdint.h> +#include <cstdint> #include <roaring/roaring.hh> #include <type_traits> #include "decimal12.h" #include "olap/column_predicate.h" +#include "olap/olap_common.h" #include "olap/rowset/segment_v2/bloom_filter.h" #include "olap/wrapper_field.h" #include "runtime/define_primitive_type.h" @@ -400,7 +402,11 @@ private: auto* nested_col_ptr = vectorized::check_and_get_column< vectorized::ColumnDictionary<vectorized::Int32>>(column); auto& data_array = nested_col_ptr->get_data(); - nested_col_ptr->find_codes(_values, _value_in_dict_flags); + auto& _value_in_dict_flags = + _segment_id_to_value_in_dict_flags[column->get_rowset_segment_id()]; + if (_value_in_dict_flags.empty()) { + nested_col_ptr->find_codes(_values, _value_in_dict_flags); + } for (uint16_t i = 0; i < size; i++) { uint16_t idx = sel[i]; @@ -469,7 +475,11 @@ private: auto* nested_col_ptr = vectorized::check_and_get_column< vectorized::ColumnDictionary<vectorized::Int32>>(column); auto& data_array = nested_col_ptr->get_data(); - nested_col_ptr->find_codes(_values, _value_in_dict_flags); + auto& _value_in_dict_flags = + _segment_id_to_value_in_dict_flags[column->get_rowset_segment_id()]; + if (_value_in_dict_flags.empty()) { + nested_col_ptr->find_codes(_values, _value_in_dict_flags); + } for (uint16_t i = 0; i < size; i++) { if (is_and ^ flags[i]) { @@ -543,7 +553,8 @@ private: } phmap::flat_hash_set<T> _values; - mutable std::vector<vectorized::UInt8> _value_in_dict_flags; + mutable std::map<std::pair<RowsetId, uint32_t>, std::vector<vectorized::UInt8>> + _segment_id_to_value_in_dict_flags; T _min_value; T _max_value; static constexpr PrimitiveType EvalType = (Type == TYPE_CHAR ? TYPE_STRING : Type); diff --git a/be/src/olap/rowset/beta_rowset.cpp b/be/src/olap/rowset/beta_rowset.cpp index fdb7b90496..8ed48fdf70 100644 --- a/be/src/olap/rowset/beta_rowset.cpp +++ b/be/src/olap/rowset/beta_rowset.cpp @@ -118,7 +118,8 @@ Status BetaRowset::load_segments(std::vector<segment_v2::SegmentSharedPtr>* segm auto seg_path = segment_file_path(seg_id); auto cache_path = segment_cache_path(seg_id); std::shared_ptr<segment_v2::Segment> segment; - auto s = segment_v2::Segment::open(fs, seg_path, cache_path, seg_id, _schema, &segment); + auto s = segment_v2::Segment::open(fs, seg_path, cache_path, seg_id, rowset_id(), _schema, + &segment); if (!s.ok()) { LOG(WARNING) << "failed to open segment. " << seg_path << " under rowset " << unique_id() << " : " << s.to_string(); @@ -137,7 +138,8 @@ Status BetaRowset::load_segment(int64_t seg_id, segment_v2::SegmentSharedPtr* se } auto seg_path = segment_file_path(seg_id); auto cache_path = segment_cache_path(seg_id); - auto s = segment_v2::Segment::open(fs, seg_path, cache_path, seg_id, _schema, segment); + auto s = segment_v2::Segment::open(fs, seg_path, cache_path, seg_id, rowset_id(), _schema, + segment); if (!s.ok()) { LOG(WARNING) << "failed to open segment. " << seg_path << " under rowset " << unique_id() << " : " << s.to_string(); @@ -304,7 +306,8 @@ bool BetaRowset::check_current_rowset_segment() { auto seg_path = segment_file_path(seg_id); auto cache_path = segment_cache_path(seg_id); std::shared_ptr<segment_v2::Segment> segment; - auto s = segment_v2::Segment::open(fs, seg_path, cache_path, seg_id, _schema, &segment); + auto s = segment_v2::Segment::open(fs, seg_path, cache_path, seg_id, rowset_id(), _schema, + &segment); if (!s.ok()) { LOG(WARNING) << "segment can not be opened. file=" << seg_path; return false; diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp index 056665d295..baec276993 100644 --- a/be/src/olap/rowset/segment_v2/segment.cpp +++ b/be/src/olap/rowset/segment_v2/segment.cpp @@ -43,9 +43,9 @@ namespace segment_v2 { using io::FileCacheManager; Status Segment::open(io::FileSystemSPtr fs, const std::string& path, const std::string& cache_path, - uint32_t segment_id, TabletSchemaSPtr tablet_schema, + uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr tablet_schema, std::shared_ptr<Segment>* output) { - std::shared_ptr<Segment> segment(new Segment(segment_id, tablet_schema)); + std::shared_ptr<Segment> segment(new Segment(segment_id, rowset_id, tablet_schema)); io::FileReaderSPtr file_reader; #ifndef BE_TEST RETURN_IF_ERROR(fs->open_file(path, &file_reader)); @@ -71,8 +71,11 @@ Status Segment::open(io::FileSystemSPtr fs, const std::string& path, const std:: return Status::OK(); } -Segment::Segment(uint32_t segment_id, TabletSchemaSPtr tablet_schema) - : _segment_id(segment_id), _tablet_schema(tablet_schema), _meta_mem_usage(0) {} +Segment::Segment(uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr tablet_schema) + : _segment_id(segment_id), + _rowset_id(rowset_id), + _tablet_schema(tablet_schema), + _meta_mem_usage(0) {} Segment::~Segment() { #ifndef BE_TEST diff --git a/be/src/olap/rowset/segment_v2/segment.h b/be/src/olap/rowset/segment_v2/segment.h index a3dcd8c6c6..96fbb60335 100644 --- a/be/src/olap/rowset/segment_v2/segment.h +++ b/be/src/olap/rowset/segment_v2/segment.h @@ -62,7 +62,7 @@ using SegmentSharedPtr = std::shared_ptr<Segment>; class Segment : public std::enable_shared_from_this<Segment> { public: static Status open(io::FileSystemSPtr fs, const std::string& path, - const std::string& cache_path, uint32_t segment_id, + const std::string& cache_path, uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr tablet_schema, std::shared_ptr<Segment>* output); ~Segment(); @@ -72,6 +72,8 @@ public: uint64_t id() const { return _segment_id; } + RowsetId rowset_id() const { return _rowset_id; } + uint32_t num_rows() const { return _footer.num_rows(); } Status new_column_iterator(const TabletColumn& tablet_column, ColumnIterator** iter); @@ -108,7 +110,7 @@ public: private: DISALLOW_COPY_AND_ASSIGN(Segment); - Segment(uint32_t segment_id, TabletSchemaSPtr tablet_schema); + Segment(uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr tablet_schema); // open segment file and read the minimum amount of necessary information (footer) Status _open(); Status _parse_footer(); @@ -120,6 +122,7 @@ private: io::FileReaderSPtr _file_reader; uint32_t _segment_id; + RowsetId _rowset_id; TabletSchemaSPtr _tablet_schema; int64_t _meta_mem_usage; diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 1f8e24a2b2..a85c0e35db 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -1229,6 +1229,8 @@ void SegmentIterator::_convert_dict_code_for_predicate_if_necessary_impl( ColumnPredicate* predicate) { auto& column = _current_return_columns[predicate->column_id()]; auto* col_ptr = column.get(); + column->set_rowset_segment_id({_segment->rowset_id(), _segment->id()}); + if (PredicateTypeTraits::is_range(predicate->type())) { col_ptr->convert_dict_codes_if_necessary(); } else if (PredicateTypeTraits::is_bloom_filter(predicate->type())) { diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h index 4b89a002af..c5eeb18d8f 100644 --- a/be/src/vec/columns/column.h +++ b/be/src/vec/columns/column.h @@ -20,6 +20,7 @@ #pragma once +#include "olap/olap_common.h" #include "runtime/define_primitive_type.h" #include "vec/common/cow.h" #include "vec/common/pod_array_fwd.h" @@ -129,6 +130,11 @@ public: return nullptr; } + // Only used on ColumnDictionary + virtual void set_rowset_segment_id(std::pair<RowsetId, uint32_t> rowset_segment_id) {} + + virtual std::pair<RowsetId, uint32_t> get_rowset_segment_id() const { return {}; } + /// Returns number of values in column. virtual size_t size() const = 0; diff --git a/be/src/vec/columns/column_dictionary.h b/be/src/vec/columns/column_dictionary.h index b8976e77c5..681ed20852 100644 --- a/be/src/vec/columns/column_dictionary.h +++ b/be/src/vec/columns/column_dictionary.h @@ -262,6 +262,14 @@ public: return _dict.find_codes(values, selected); } + void set_rowset_segment_id(std::pair<RowsetId, uint32_t> rowset_segment_id) override { + _rowset_segment_id = rowset_segment_id; + } + + std::pair<RowsetId, uint32_t> get_rowset_segment_id() const override { + return _rowset_segment_id; + } + bool is_dict_sorted() const { return _dict_sorted; } bool is_dict_code_converted() const { return _dict_code_converted; } @@ -451,6 +459,7 @@ private: Dictionary _dict; Container _codes; FieldType _type; + std::pair<RowsetId, uint32_t> _rowset_segment_id; }; template class ColumnDictionary<int32_t>; diff --git a/be/test/io/cache/remote_file_cache_test.cpp b/be/test/io/cache/remote_file_cache_test.cpp index a88cf2fbf8..5de0a5f955 100644 --- a/be/test/io/cache/remote_file_cache_test.cpp +++ b/be/test/io/cache/remote_file_cache_test.cpp @@ -142,7 +142,7 @@ protected: EXPECT_EQ("", writer.min_encoded_key().to_string()); EXPECT_EQ("", writer.max_encoded_key().to_string()); - st = segment_v2::Segment::open(fs, path, "", 0, query_schema, res); + st = segment_v2::Segment::open(fs, path, "", 0, {}, query_schema, res); EXPECT_TRUE(st.ok()); EXPECT_EQ(nrows, (*res)->num_rows()); } diff --git a/be/test/olap/rowset/segment_v2/segment_test.cpp b/be/test/olap/rowset/segment_v2/segment_test.cpp index ac2d8febfd..e793d4acd3 100644 --- a/be/test/olap/rowset/segment_v2/segment_test.cpp +++ b/be/test/olap/rowset/segment_v2/segment_test.cpp @@ -175,7 +175,7 @@ protected: EXPECT_EQ("", writer.max_encoded_key().to_string()); } - st = Segment::open(fs, path, "", 0, query_schema, res); + st = Segment::open(fs, path, "", 0, {}, query_schema, res); EXPECT_TRUE(st.ok()); EXPECT_EQ(nrows, (*res)->num_rows()); } @@ -774,7 +774,7 @@ TEST_F(SegmentReaderWriterTest, TestStringDict) { { std::shared_ptr<Segment> segment; - st = Segment::open(fs, fname, "", 0, tablet_schema, &segment); + st = Segment::open(fs, fname, "", 0, {}, tablet_schema, &segment); EXPECT_TRUE(st.ok()); EXPECT_EQ(4096, segment->num_rows()); Schema schema(tablet_schema); diff --git a/be/test/tools/benchmark_tool.cpp b/be/test/tools/benchmark_tool.cpp index b5c1280796..7b6a296e13 100644 --- a/be/test/tools/benchmark_tool.cpp +++ b/be/test/tools/benchmark_tool.cpp @@ -364,7 +364,7 @@ public: writer.finalize(&file_size, &index_size); file_writer->close(); - Segment::open(fs, path, "", seg_id, &_tablet_schema, res); + Segment::open(fs, path, "", seg_id, {}, &_tablet_schema, res); } std::vector<std::vector<std::string>> generate_dataset(int rows_number) { --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org