This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new 8d5b621021a [improvement](inverted index) Change inverted index field_name from column_name to id in format v2 #36470 (#36516)
8d5b621021a is described below

commit 8d5b621021a05591314ded58882a762f50642c5d
Author: qiye <jianliang5...@gmail.com>
AuthorDate: Wed Jun 19 17:29:26 2024 +0800

    [improvement](inverted index) Change inverted index field_name from column_name to id in format v2 #36470 (#36516)
    
    pick from master #36470
---
 be/src/olap/accept_null_predicate.h                |  2 +-
 be/src/olap/column_predicate.h                     |  2 +-
 be/src/olap/comparison_predicate.h                 |  2 +-
 be/src/olap/field.h                                |  8 ++++++++
 be/src/olap/in_list_predicate.h                    |  2 +-
 be/src/olap/match_predicate.cpp                    |  2 +-
 be/src/olap/match_predicate.h                      |  2 +-
 be/src/olap/null_predicate.cpp                     |  2 +-
 be/src/olap/null_predicate.h                       |  2 +-
 .../rowset/segment_v2/inverted_index_file_writer.h |  1 +
 .../rowset/segment_v2/inverted_index_writer.cpp    | 15 ++++++++++++++-
 be/src/olap/rowset/segment_v2/segment_iterator.cpp | 22 +++++++++++++++++++++-
 be/src/olap/rowset/segment_v2/segment_iterator.h   |  2 +-
 be/src/olap/shared_predicate.h                     |  2 +-
 be/src/vec/core/columns_with_type_and_name.h       | 12 ++++++++++--
 15 files changed, 64 insertions(+), 14 deletions(-)

diff --git a/be/src/olap/accept_null_predicate.h 
b/be/src/olap/accept_null_predicate.h
index 1f6f9f6ec35..81e0c57b1e8 100644
--- a/be/src/olap/accept_null_predicate.h
+++ b/be/src/olap/accept_null_predicate.h
@@ -51,7 +51,7 @@ public:
         return _nested->evaluate(iterator, num_rows, roaring);
     }
 
-    Status evaluate(const vectorized::NameAndTypePair& name_with_type,
+    Status evaluate(const vectorized::IndexFieldNameAndTypePair& 
name_with_type,
                     InvertedIndexIterator* iterator, uint32_t num_rows,
                     roaring::Roaring* bitmap) const override {
         return _nested->evaluate(name_with_type, iterator, num_rows, bitmap);
diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h
index b6b419f8ccf..d5b5abe1501 100644
--- a/be/src/olap/column_predicate.h
+++ b/be/src/olap/column_predicate.h
@@ -176,7 +176,7 @@ public:
                             roaring::Roaring* roaring) const = 0;
 
     //evaluate predicate on inverted
-    virtual Status evaluate(const vectorized::NameAndTypePair& name_with_type,
+    virtual Status evaluate(const vectorized::IndexFieldNameAndTypePair& 
name_with_type,
                             InvertedIndexIterator* iterator, uint32_t num_rows,
                             roaring::Roaring* bitmap) const {
         return Status::NotSupported(
diff --git a/be/src/olap/comparison_predicate.h 
b/be/src/olap/comparison_predicate.h
index 6a5f27bd326..2e0c4db4ba0 100644
--- a/be/src/olap/comparison_predicate.h
+++ b/be/src/olap/comparison_predicate.h
@@ -67,7 +67,7 @@ public:
                                bitmap);
     }
 
-    Status evaluate(const vectorized::NameAndTypePair& name_with_type,
+    Status evaluate(const vectorized::IndexFieldNameAndTypePair& 
name_with_type,
                     InvertedIndexIterator* iterator, uint32_t num_rows,
                     roaring::Roaring* bitmap) const override {
         if (iterator == nullptr) {
diff --git a/be/src/olap/field.h b/be/src/olap/field.h
index 6a2d407ff6c..3e26a453d81 100644
--- a/be/src/olap/field.h
+++ b/be/src/olap/field.h
@@ -49,6 +49,8 @@ public:
               _index_size(column.index_length()),
               _is_nullable(column.is_nullable()),
               _unique_id(column.unique_id()),
+              _parent_unique_id(column.parent_unique_id()),
+              _is_extracted_column(column.is_extracted_column()),
               _path(column.path_info_ptr()) {}
 
     virtual ~Field() = default;
@@ -58,6 +60,8 @@ public:
     size_t field_size() const { return size() + 1; }
     size_t index_size() const { return _index_size; }
     int32_t unique_id() const { return _unique_id; }
+    int32_t parent_unique_id() const { return _parent_unique_id; }
+    bool is_extracted_column() const { return _is_extracted_column; }
     const std::string& name() const { return _name; }
     const vectorized::PathInDataPtr& path() const { return _path; }
 
@@ -241,6 +245,8 @@ protected:
         other->_precision = this->_precision;
         other->_scale = this->_scale;
         other->_unique_id = this->_unique_id;
+        other->_parent_unique_id = this->_parent_unique_id;
+        other->_is_extracted_column = this->_is_extracted_column;
         for (const auto& f : _sub_fields) {
             Field* item = f->clone();
             other->add_sub_field(std::unique_ptr<Field>(item));
@@ -258,6 +264,8 @@ private:
     int32_t _precision;
     int32_t _scale;
     int32_t _unique_id;
+    int32_t _parent_unique_id;
+    bool _is_extracted_column = false;
     vectorized::PathInDataPtr _path;
 };
 
diff --git a/be/src/olap/in_list_predicate.h b/be/src/olap/in_list_predicate.h
index 4a1a10f898f..dfb3d4bf5e6 100644
--- a/be/src/olap/in_list_predicate.h
+++ b/be/src/olap/in_list_predicate.h
@@ -180,7 +180,7 @@ public:
         return Status::OK();
     }
 
-    Status evaluate(const vectorized::NameAndTypePair& name_with_type,
+    Status evaluate(const vectorized::IndexFieldNameAndTypePair& 
name_with_type,
                     InvertedIndexIterator* iterator, uint32_t num_rows,
                     roaring::Roaring* result) const override {
         if (iterator == nullptr) {
diff --git a/be/src/olap/match_predicate.cpp b/be/src/olap/match_predicate.cpp
index 3751716df90..5bdfdfd9cac 100644
--- a/be/src/olap/match_predicate.cpp
+++ b/be/src/olap/match_predicate.cpp
@@ -45,7 +45,7 @@ PredicateType MatchPredicate::type() const {
     return PredicateType::MATCH;
 }
 
-Status MatchPredicate::evaluate(const vectorized::NameAndTypePair& 
name_with_type,
+Status MatchPredicate::evaluate(const vectorized::IndexFieldNameAndTypePair& 
name_with_type,
                                 InvertedIndexIterator* iterator, uint32_t 
num_rows,
                                 roaring::Roaring* bitmap) const {
     if (iterator == nullptr) {
diff --git a/be/src/olap/match_predicate.h b/be/src/olap/match_predicate.h
index 862bc4a0f59..17d8e76ac88 100644
--- a/be/src/olap/match_predicate.h
+++ b/be/src/olap/match_predicate.h
@@ -60,7 +60,7 @@ public:
     }
 
     //evaluate predicate on inverted
-    Status evaluate(const vectorized::NameAndTypePair& name_with_type,
+    Status evaluate(const vectorized::IndexFieldNameAndTypePair& 
name_with_type,
                     InvertedIndexIterator* iterator, uint32_t num_rows,
                     roaring::Roaring* bitmap) const override;
 
diff --git a/be/src/olap/null_predicate.cpp b/be/src/olap/null_predicate.cpp
index 0b184707d8f..06ab85324ef 100644
--- a/be/src/olap/null_predicate.cpp
+++ b/be/src/olap/null_predicate.cpp
@@ -53,7 +53,7 @@ Status NullPredicate::evaluate(BitmapIndexIterator* iterator, 
uint32_t num_rows,
     return Status::OK();
 }
 
-Status NullPredicate::evaluate(const vectorized::NameAndTypePair& 
name_with_type,
+Status NullPredicate::evaluate(const vectorized::IndexFieldNameAndTypePair& 
name_with_type,
                                InvertedIndexIterator* iterator, uint32_t 
num_rows,
                                roaring::Roaring* bitmap) const {
     if (iterator->has_null()) {
diff --git a/be/src/olap/null_predicate.h b/be/src/olap/null_predicate.h
index ccca5c51027..59480264b46 100644
--- a/be/src/olap/null_predicate.h
+++ b/be/src/olap/null_predicate.h
@@ -52,7 +52,7 @@ public:
     Status evaluate(BitmapIndexIterator* iterator, uint32_t num_rows,
                     roaring::Roaring* roaring) const override;
 
-    Status evaluate(const vectorized::NameAndTypePair& name_with_type,
+    Status evaluate(const vectorized::IndexFieldNameAndTypePair& 
name_with_type,
                     InvertedIndexIterator* iterator, uint32_t num_rows,
                     roaring::Roaring* bitmap) const override;
 
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h 
b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h
index 7e819b0dd75..03306110a28 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h
+++ b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h
@@ -83,6 +83,7 @@ public:
     std::string get_index_file_path(const TabletIndex* index_meta) const;
     size_t get_index_file_size() const { return _file_size; }
     const io::FileSystemSPtr& get_fs() const { return _fs; }
+    InvertedIndexStorageFormatPB get_storage_format() const { return 
_storage_format; }
 
 private:
     InvertedIndexDirectoryMap _indices_dirs;
diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp 
b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
index c838621f92d..e2815dfa108 100644
--- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
+++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp
@@ -26,6 +26,7 @@
 #include <memory>
 #include <ostream>
 #include <roaring/roaring.hh>
+#include <string>
 #include <vector>
 
 #ifdef __clang__
@@ -657,7 +658,19 @@ Status InvertedIndexColumnWriter::create(const Field* 
field,
                                          const TabletIndex* index_meta) {
     const auto* typeinfo = field->type_info();
     FieldType type = typeinfo->type();
-    std::string field_name = field->name();
+    std::string field_name;
+    auto storage_format = index_file_writer->get_storage_format();
+    if (storage_format == InvertedIndexStorageFormatPB::V1) {
+        field_name = field->name();
+    } else {
+        if (field->is_extracted_column()) {
+            // variant sub col
+            // field_name format: parent_unique_id.sub_col_name
+            field_name = std::to_string(field->parent_unique_id()) + "." + 
field->name();
+        } else {
+            field_name = std::to_string(field->unique_id());
+        }
+    }
     bool single_field = true;
     if (type == FieldType::OLAP_FIELD_TYPE_ARRAY) {
         const auto* array_typeinfo = dynamic_cast<const 
ArrayTypeInfo*>(typeinfo);
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp 
b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
index c3097440e08..c95f0610562 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp
@@ -327,6 +327,7 @@ Status SegmentIterator::_init_impl(const 
StorageReadOptions& opts) {
     }
 
     _storage_name_and_type.resize(_schema->columns().size());
+    auto storage_format = 
_opts.tablet_schema->get_inverted_index_storage_format();
     for (int i = 0; i < _schema->columns().size(); ++i) {
         const Field* col = _schema->column(i);
         if (col) {
@@ -336,7 +337,26 @@ Status SegmentIterator::_init_impl(const 
StorageReadOptions& opts) {
             if (storage_type == nullptr) {
                 storage_type = 
vectorized::DataTypeFactory::instance().create_data_type(*col);
             }
-            _storage_name_and_type[i] = std::make_pair(col->name(), 
storage_type);
+            // Currently, when writing a lucene index, the field of the document is column_name, and the column name is
+            // bound to the index field. Since version 1.2, the data file storage has been changed from column_name to
+            // column_unique_id, allowing the column name to be changed. Due to current limitations, previous inverted
+            // index data cannot be used after Doris changes the column name. Column names also support Unicode
+            // characters, which may cause other problems with indexing in non-ASCII characters.
+            // After consideration, it was decided to change the field name from column_name to column_unique_id in
+            // format V2, while format V1 continues to use column_name.
+            std::string field_name;
+            if (storage_format == InvertedIndexStorageFormatPB::V1) {
+                field_name = col->name();
+            } else {
+                if (col->is_extracted_column()) {
+                    // variant sub col
+                    // field_name format: parent_unique_id.sub_col_name
+                    field_name = std::to_string(col->parent_unique_id()) + "." 
+ col->name();
+                } else {
+                    field_name = std::to_string(col->unique_id());
+                }
+            }
+            _storage_name_and_type[i] = std::make_pair(field_name, 
storage_type);
         }
     }
     return Status::OK();
diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h 
b/be/src/olap/rowset/segment_v2/segment_iterator.h
index b03ed04603a..a0ef11ece48 100644
--- a/be/src/olap/rowset/segment_v2/segment_iterator.h
+++ b/be/src/olap/rowset/segment_v2/segment_iterator.h
@@ -392,7 +392,7 @@ private:
     // read schema from scanner
     SchemaSPtr _schema;
     // storage type schema related to _schema, since column in segment may be 
different with type in _schema
-    std::vector<vectorized::NameAndTypePair> _storage_name_and_type;
+    std::vector<vectorized::IndexFieldNameAndTypePair> _storage_name_and_type;
     // vector idx -> column iterarator
     std::vector<std::unique_ptr<ColumnIterator>> _column_iterators;
     std::vector<std::unique_ptr<BitmapIndexIterator>> _bitmap_index_iterators;
diff --git a/be/src/olap/shared_predicate.h b/be/src/olap/shared_predicate.h
index 83c4ae62515..41b18e99ba4 100644
--- a/be/src/olap/shared_predicate.h
+++ b/be/src/olap/shared_predicate.h
@@ -61,7 +61,7 @@ public:
         return _nested->evaluate(iterator, num_rows, roaring);
     }
 
-    Status evaluate(const vectorized::NameAndTypePair& name_with_type,
+    Status evaluate(const vectorized::IndexFieldNameAndTypePair& 
name_with_type,
                     InvertedIndexIterator* iterator, uint32_t num_rows,
                     roaring::Roaring* bitmap) const override {
         std::shared_lock<std::shared_mutex> lock(_mtx);
diff --git a/be/src/vec/core/columns_with_type_and_name.h 
b/be/src/vec/core/columns_with_type_and_name.h
index 82eae3158ab..f3a329150e4 100644
--- a/be/src/vec/core/columns_with_type_and_name.h
+++ b/be/src/vec/core/columns_with_type_and_name.h
@@ -30,6 +30,14 @@
 namespace doris::vectorized {
 
 using ColumnsWithTypeAndName = std::vector<ColumnWithTypeAndName>;
-using NameAndTypePair = std::pair<std::string, DataTypePtr>;
-using NameAndTypePairs = std::vector<NameAndTypePair>;
+// only used in inverted index
+// <field_name, storage_type>
+// field_name is the name of inverted index document's field
+//     1. for inverted_index_storage_format_v1, field_name is the `column_name` in Doris
+//     2. for inverted_index_storage_format_v2
+//         2.1 for normal column, field_name is the `column_unique_id` in Doris
+//         2.2 for variant column, field_name is the `parent_column_unique_id.sub_column_name` in Doris
+// storage_type is the data type in Doris
+using IndexFieldNameAndTypePair = std::pair<std::string, DataTypePtr>;
+using NameAndTypePairs = std::vector<std::pair<std::string, DataTypePtr>>;
 } // namespace doris::vectorized


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to