This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-1.2-unstable
in repository https://gitbox.apache.org/repos/asf/doris.git

commit f4b4e603490aaeb28635e532a0de51e03965ade3
Author: Pxl <pxl...@qq.com>
AuthorDate: Tue Nov 8 10:59:35 2022 +0800

    [Enhancement](Dictionary-codec) update dict once on same segment (#13936)
    
    update dict once on same segment
---
 be/src/olap/in_list_predicate.h                    | 17 ++++++++++++++---
 be/src/olap/rowset/beta_rowset.cpp                 |  9 ++++++---
 be/src/olap/rowset/segment_v2/segment.cpp          | 11 +++++++----
 be/src/olap/rowset/segment_v2/segment.h            |  7 +++++--
 be/src/olap/rowset/segment_v2/segment_iterator.cpp |  2 ++
 be/src/vec/columns/column.h                        |  6 ++++++
 be/src/vec/columns/column_dictionary.h             |  9 +++++++++
 be/test/io/cache/remote_file_cache_test.cpp        |  2 +-
 be/test/olap/rowset/segment_v2/segment_test.cpp    |  4 ++--
 be/test/tools/benchmark_tool.cpp                   |  2 +-
 10 files changed, 53 insertions(+), 16 deletions(-)

diff --git a/be/src/olap/in_list_predicate.h b/be/src/olap/in_list_predicate.h
index 32b0dc2fb0..b85deeb39c 100644
--- a/be/src/olap/in_list_predicate.h
+++ b/be/src/olap/in_list_predicate.h
@@ -20,11 +20,13 @@
 #include <parallel_hashmap/phmap.h>
 #include <stdint.h>
 
+#include <cstdint>
 #include <roaring/roaring.hh>
 #include <type_traits>
 
 #include "decimal12.h"
 #include "olap/column_predicate.h"
+#include "olap/olap_common.h"
 #include "olap/rowset/segment_v2/bloom_filter.h"
 #include "olap/wrapper_field.h"
 #include "runtime/define_primitive_type.h"
@@ -400,7 +402,11 @@ private:
                 auto* nested_col_ptr = vectorized::check_and_get_column<
                         vectorized::ColumnDictionary<vectorized::Int32>>(column);
                 auto& data_array = nested_col_ptr->get_data();
-                nested_col_ptr->find_codes(_values, _value_in_dict_flags);
+                auto& _value_in_dict_flags =
+                        _segment_id_to_value_in_dict_flags[column->get_rowset_segment_id()];
+                if (_value_in_dict_flags.empty()) {
+                    nested_col_ptr->find_codes(_values, _value_in_dict_flags);
+                }
 
                 for (uint16_t i = 0; i < size; i++) {
                     uint16_t idx = sel[i];
@@ -469,7 +475,11 @@ private:
                 auto* nested_col_ptr = vectorized::check_and_get_column<
                         vectorized::ColumnDictionary<vectorized::Int32>>(column);
                 auto& data_array = nested_col_ptr->get_data();
-                nested_col_ptr->find_codes(_values, _value_in_dict_flags);
+                auto& _value_in_dict_flags =
+                        _segment_id_to_value_in_dict_flags[column->get_rowset_segment_id()];
+                if (_value_in_dict_flags.empty()) {
+                    nested_col_ptr->find_codes(_values, _value_in_dict_flags);
+                }
 
                 for (uint16_t i = 0; i < size; i++) {
                     if (is_and ^ flags[i]) {
@@ -543,7 +553,8 @@ private:
     }
 
     phmap::flat_hash_set<T> _values;
-    mutable std::vector<vectorized::UInt8> _value_in_dict_flags;
+    mutable std::map<std::pair<RowsetId, uint32_t>, std::vector<vectorized::UInt8>>
+            _segment_id_to_value_in_dict_flags;
     T _min_value;
     T _max_value;
     static constexpr PrimitiveType EvalType = (Type == TYPE_CHAR ? TYPE_STRING : Type);
diff --git a/be/src/olap/rowset/beta_rowset.cpp b/be/src/olap/rowset/beta_rowset.cpp
index fdb7b90496..8ed48fdf70 100644
--- a/be/src/olap/rowset/beta_rowset.cpp
+++ b/be/src/olap/rowset/beta_rowset.cpp
@@ -118,7 +118,8 @@ Status BetaRowset::load_segments(std::vector<segment_v2::SegmentSharedPtr>* segm
         auto seg_path = segment_file_path(seg_id);
         auto cache_path = segment_cache_path(seg_id);
         std::shared_ptr<segment_v2::Segment> segment;
-        auto s = segment_v2::Segment::open(fs, seg_path, cache_path, seg_id, _schema, &segment);
+        auto s = segment_v2::Segment::open(fs, seg_path, cache_path, seg_id, rowset_id(), _schema,
+                                           &segment);
         if (!s.ok()) {
             LOG(WARNING) << "failed to open segment. " << seg_path << " under rowset "
                          << unique_id() << " : " << s.to_string();
@@ -137,7 +138,8 @@ Status BetaRowset::load_segment(int64_t seg_id, segment_v2::SegmentSharedPtr* se
     }
     auto seg_path = segment_file_path(seg_id);
     auto cache_path = segment_cache_path(seg_id);
-    auto s = segment_v2::Segment::open(fs, seg_path, cache_path, seg_id, _schema, segment);
+    auto s = segment_v2::Segment::open(fs, seg_path, cache_path, seg_id, rowset_id(), _schema,
+                                       segment);
     if (!s.ok()) {
         LOG(WARNING) << "failed to open segment. " << seg_path << " under rowset " << unique_id()
                      << " : " << s.to_string();
@@ -304,7 +306,8 @@ bool BetaRowset::check_current_rowset_segment() {
         auto seg_path = segment_file_path(seg_id);
         auto cache_path = segment_cache_path(seg_id);
         std::shared_ptr<segment_v2::Segment> segment;
-        auto s = segment_v2::Segment::open(fs, seg_path, cache_path, seg_id, _schema, &segment);
+        auto s = segment_v2::Segment::open(fs, seg_path, cache_path, seg_id, rowset_id(), _schema,
+                                           &segment);
         if (!s.ok()) {
             LOG(WARNING) << "segment can not be opened. file=" << seg_path;
             return false;
diff --git a/be/src/olap/rowset/segment_v2/segment.cpp b/be/src/olap/rowset/segment_v2/segment.cpp
index 056665d295..baec276993 100644
--- a/be/src/olap/rowset/segment_v2/segment.cpp
+++ b/be/src/olap/rowset/segment_v2/segment.cpp
@@ -43,9 +43,9 @@ namespace segment_v2 {
 using io::FileCacheManager;
 
 Status Segment::open(io::FileSystemSPtr fs, const std::string& path, const std::string& cache_path,
-                     uint32_t segment_id, TabletSchemaSPtr tablet_schema,
+                     uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr tablet_schema,
                      std::shared_ptr<Segment>* output) {
-    std::shared_ptr<Segment> segment(new Segment(segment_id, tablet_schema));
+    std::shared_ptr<Segment> segment(new Segment(segment_id, rowset_id, tablet_schema));
     io::FileReaderSPtr file_reader;
 #ifndef BE_TEST
     RETURN_IF_ERROR(fs->open_file(path, &file_reader));
@@ -71,8 +71,11 @@ Status Segment::open(io::FileSystemSPtr fs, const std::string& path, const std::
     return Status::OK();
 }
 
-Segment::Segment(uint32_t segment_id, TabletSchemaSPtr tablet_schema)
-        : _segment_id(segment_id), _tablet_schema(tablet_schema), _meta_mem_usage(0) {}
+Segment::Segment(uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr tablet_schema)
+        : _segment_id(segment_id),
+          _rowset_id(rowset_id),
+          _tablet_schema(tablet_schema),
+          _meta_mem_usage(0) {}
 
 Segment::~Segment() {
 #ifndef BE_TEST
diff --git a/be/src/olap/rowset/segment_v2/segment.h b/be/src/olap/rowset/segment_v2/segment.h
index a3dcd8c6c6..96fbb60335 100644
--- a/be/src/olap/rowset/segment_v2/segment.h
+++ b/be/src/olap/rowset/segment_v2/segment.h
@@ -62,7 +62,7 @@ using SegmentSharedPtr = std::shared_ptr<Segment>;
 class Segment : public std::enable_shared_from_this<Segment> {
 public:
     static Status open(io::FileSystemSPtr fs, const std::string& path,
-                       const std::string& cache_path, uint32_t segment_id,
+                       const std::string& cache_path, uint32_t segment_id, RowsetId rowset_id,
                        TabletSchemaSPtr tablet_schema, std::shared_ptr<Segment>* output);
 
     ~Segment();
@@ -72,6 +72,8 @@ public:
 
     uint64_t id() const { return _segment_id; }
 
+    RowsetId rowset_id() const { return _rowset_id; }
+
     uint32_t num_rows() const { return _footer.num_rows(); }
 
     Status new_column_iterator(const TabletColumn& tablet_column, ColumnIterator** iter);
@@ -108,7 +110,7 @@ public:
 
 private:
     DISALLOW_COPY_AND_ASSIGN(Segment);
-    Segment(uint32_t segment_id, TabletSchemaSPtr tablet_schema);
+    Segment(uint32_t segment_id, RowsetId rowset_id, TabletSchemaSPtr tablet_schema);
     // open segment file and read the minimum amount of necessary information (footer)
     Status _open();
     Status _parse_footer();
@@ -120,6 +122,7 @@ private:
     io::FileReaderSPtr _file_reader;
 
     uint32_t _segment_id;
+    RowsetId _rowset_id;
     TabletSchemaSPtr _tablet_schema;
 
     int64_t _meta_mem_usage;
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index 1f8e24a2b2..a85c0e35db 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -1229,6 +1229,8 @@ void SegmentIterator::_convert_dict_code_for_predicate_if_necessary_impl(
         ColumnPredicate* predicate) {
     auto& column = _current_return_columns[predicate->column_id()];
     auto* col_ptr = column.get();
+    column->set_rowset_segment_id({_segment->rowset_id(), _segment->id()});
+
     if (PredicateTypeTraits::is_range(predicate->type())) {
         col_ptr->convert_dict_codes_if_necessary();
     } else if (PredicateTypeTraits::is_bloom_filter(predicate->type())) {
diff --git a/be/src/vec/columns/column.h b/be/src/vec/columns/column.h
index 4b89a002af..c5eeb18d8f 100644
--- a/be/src/vec/columns/column.h
+++ b/be/src/vec/columns/column.h
@@ -20,6 +20,7 @@
 
 #pragma once
 
+#include "olap/olap_common.h"
 #include "runtime/define_primitive_type.h"
 #include "vec/common/cow.h"
 #include "vec/common/pod_array_fwd.h"
@@ -129,6 +130,11 @@ public:
         return nullptr;
     }
 
+    // Only used on ColumnDictionary
+    virtual void set_rowset_segment_id(std::pair<RowsetId, uint32_t> rowset_segment_id) {}
+
+    virtual std::pair<RowsetId, uint32_t> get_rowset_segment_id() const { return {}; }
+
     /// Returns number of values in column.
     virtual size_t size() const = 0;
 
diff --git a/be/src/vec/columns/column_dictionary.h b/be/src/vec/columns/column_dictionary.h
index b8976e77c5..681ed20852 100644
--- a/be/src/vec/columns/column_dictionary.h
+++ b/be/src/vec/columns/column_dictionary.h
@@ -262,6 +262,14 @@ public:
         return _dict.find_codes(values, selected);
     }
 
+    void set_rowset_segment_id(std::pair<RowsetId, uint32_t> rowset_segment_id) override {
+        _rowset_segment_id = rowset_segment_id;
+    }
+
+    std::pair<RowsetId, uint32_t> get_rowset_segment_id() const override {
+        return _rowset_segment_id;
+    }
+
     bool is_dict_sorted() const { return _dict_sorted; }
 
     bool is_dict_code_converted() const { return _dict_code_converted; }
@@ -451,6 +459,7 @@ private:
     Dictionary _dict;
     Container _codes;
     FieldType _type;
+    std::pair<RowsetId, uint32_t> _rowset_segment_id;
 };
 
 template class ColumnDictionary<int32_t>;
diff --git a/be/test/io/cache/remote_file_cache_test.cpp b/be/test/io/cache/remote_file_cache_test.cpp
index a88cf2fbf8..5de0a5f955 100644
--- a/be/test/io/cache/remote_file_cache_test.cpp
+++ b/be/test/io/cache/remote_file_cache_test.cpp
@@ -142,7 +142,7 @@ protected:
         EXPECT_EQ("", writer.min_encoded_key().to_string());
         EXPECT_EQ("", writer.max_encoded_key().to_string());
 
-        st = segment_v2::Segment::open(fs, path, "", 0, query_schema, res);
+        st = segment_v2::Segment::open(fs, path, "", 0, {}, query_schema, res);
         EXPECT_TRUE(st.ok());
         EXPECT_EQ(nrows, (*res)->num_rows());
     }
diff --git a/be/test/olap/rowset/segment_v2/segment_test.cpp b/be/test/olap/rowset/segment_v2/segment_test.cpp
index ac2d8febfd..e793d4acd3 100644
--- a/be/test/olap/rowset/segment_v2/segment_test.cpp
+++ b/be/test/olap/rowset/segment_v2/segment_test.cpp
@@ -175,7 +175,7 @@ protected:
             EXPECT_EQ("", writer.max_encoded_key().to_string());
         }
 
-        st = Segment::open(fs, path, "", 0, query_schema, res);
+        st = Segment::open(fs, path, "", 0, {}, query_schema, res);
         EXPECT_TRUE(st.ok());
         EXPECT_EQ(nrows, (*res)->num_rows());
     }
@@ -774,7 +774,7 @@ TEST_F(SegmentReaderWriterTest, TestStringDict) {
 
     {
         std::shared_ptr<Segment> segment;
-        st = Segment::open(fs, fname, "", 0, tablet_schema, &segment);
+        st = Segment::open(fs, fname, "", 0, {}, tablet_schema, &segment);
         EXPECT_TRUE(st.ok());
         EXPECT_EQ(4096, segment->num_rows());
         Schema schema(tablet_schema);
diff --git a/be/test/tools/benchmark_tool.cpp b/be/test/tools/benchmark_tool.cpp
index b5c1280796..7b6a296e13 100644
--- a/be/test/tools/benchmark_tool.cpp
+++ b/be/test/tools/benchmark_tool.cpp
@@ -364,7 +364,7 @@ public:
         writer.finalize(&file_size, &index_size);
         file_writer->close();
 
-        Segment::open(fs, path, "", seg_id, &_tablet_schema, res);
+        Segment::open(fs, path, "", seg_id, {}, &_tablet_schema, res);
     }
 
     std::vector<std::vector<std::string>> generate_dataset(int rows_number) {


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to