This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new 8d5b621021a [improvement](inverted index) Change inverted index field_name from column_name to id in format v2 #36470 (#36516) 8d5b621021a is described below commit 8d5b621021a05591314ded58882a762f50642c5d Author: qiye <jianliang5...@gmail.com> AuthorDate: Wed Jun 19 17:29:26 2024 +0800 [improvement](inverted index) Change inverted index field_name from column_name to id in format v2 #36470 (#36516) pick from master #36470 --- be/src/olap/accept_null_predicate.h | 2 +- be/src/olap/column_predicate.h | 2 +- be/src/olap/comparison_predicate.h | 2 +- be/src/olap/field.h | 8 ++++++++ be/src/olap/in_list_predicate.h | 2 +- be/src/olap/match_predicate.cpp | 2 +- be/src/olap/match_predicate.h | 2 +- be/src/olap/null_predicate.cpp | 2 +- be/src/olap/null_predicate.h | 2 +- .../rowset/segment_v2/inverted_index_file_writer.h | 1 + .../rowset/segment_v2/inverted_index_writer.cpp | 15 ++++++++++++++- be/src/olap/rowset/segment_v2/segment_iterator.cpp | 22 +++++++++++++++++++++- be/src/olap/rowset/segment_v2/segment_iterator.h | 2 +- be/src/olap/shared_predicate.h | 2 +- be/src/vec/core/columns_with_type_and_name.h | 12 ++++++++++-- 15 files changed, 64 insertions(+), 14 deletions(-) diff --git a/be/src/olap/accept_null_predicate.h b/be/src/olap/accept_null_predicate.h index 1f6f9f6ec35..81e0c57b1e8 100644 --- a/be/src/olap/accept_null_predicate.h +++ b/be/src/olap/accept_null_predicate.h @@ -51,7 +51,7 @@ public: return _nested->evaluate(iterator, num_rows, roaring); } - Status evaluate(const vectorized::NameAndTypePair& name_with_type, + Status evaluate(const vectorized::IndexFieldNameAndTypePair& name_with_type, InvertedIndexIterator* iterator, uint32_t num_rows, roaring::Roaring* bitmap) const override { return _nested->evaluate(name_with_type, iterator, num_rows, bitmap); diff --git a/be/src/olap/column_predicate.h b/be/src/olap/column_predicate.h index b6b419f8ccf..d5b5abe1501 100644 --- 
a/be/src/olap/column_predicate.h +++ b/be/src/olap/column_predicate.h @@ -176,7 +176,7 @@ public: roaring::Roaring* roaring) const = 0; //evaluate predicate on inverted - virtual Status evaluate(const vectorized::NameAndTypePair& name_with_type, + virtual Status evaluate(const vectorized::IndexFieldNameAndTypePair& name_with_type, InvertedIndexIterator* iterator, uint32_t num_rows, roaring::Roaring* bitmap) const { return Status::NotSupported( diff --git a/be/src/olap/comparison_predicate.h b/be/src/olap/comparison_predicate.h index 6a5f27bd326..2e0c4db4ba0 100644 --- a/be/src/olap/comparison_predicate.h +++ b/be/src/olap/comparison_predicate.h @@ -67,7 +67,7 @@ public: bitmap); } - Status evaluate(const vectorized::NameAndTypePair& name_with_type, + Status evaluate(const vectorized::IndexFieldNameAndTypePair& name_with_type, InvertedIndexIterator* iterator, uint32_t num_rows, roaring::Roaring* bitmap) const override { if (iterator == nullptr) { diff --git a/be/src/olap/field.h b/be/src/olap/field.h index 6a2d407ff6c..3e26a453d81 100644 --- a/be/src/olap/field.h +++ b/be/src/olap/field.h @@ -49,6 +49,8 @@ public: _index_size(column.index_length()), _is_nullable(column.is_nullable()), _unique_id(column.unique_id()), + _parent_unique_id(column.parent_unique_id()), + _is_extracted_column(column.is_extracted_column()), _path(column.path_info_ptr()) {} virtual ~Field() = default; @@ -58,6 +60,8 @@ public: size_t field_size() const { return size() + 1; } size_t index_size() const { return _index_size; } int32_t unique_id() const { return _unique_id; } + int32_t parent_unique_id() const { return _parent_unique_id; } + bool is_extracted_column() const { return _is_extracted_column; } const std::string& name() const { return _name; } const vectorized::PathInDataPtr& path() const { return _path; } @@ -241,6 +245,8 @@ protected: other->_precision = this->_precision; other->_scale = this->_scale; other->_unique_id = this->_unique_id; + other->_parent_unique_id = 
this->_parent_unique_id; + other->_is_extracted_column = this->_is_extracted_column; for (const auto& f : _sub_fields) { Field* item = f->clone(); other->add_sub_field(std::unique_ptr<Field>(item)); @@ -258,6 +264,8 @@ private: int32_t _precision; int32_t _scale; int32_t _unique_id; + int32_t _parent_unique_id; + bool _is_extracted_column = false; vectorized::PathInDataPtr _path; }; diff --git a/be/src/olap/in_list_predicate.h b/be/src/olap/in_list_predicate.h index 4a1a10f898f..dfb3d4bf5e6 100644 --- a/be/src/olap/in_list_predicate.h +++ b/be/src/olap/in_list_predicate.h @@ -180,7 +180,7 @@ public: return Status::OK(); } - Status evaluate(const vectorized::NameAndTypePair& name_with_type, + Status evaluate(const vectorized::IndexFieldNameAndTypePair& name_with_type, InvertedIndexIterator* iterator, uint32_t num_rows, roaring::Roaring* result) const override { if (iterator == nullptr) { diff --git a/be/src/olap/match_predicate.cpp b/be/src/olap/match_predicate.cpp index 3751716df90..5bdfdfd9cac 100644 --- a/be/src/olap/match_predicate.cpp +++ b/be/src/olap/match_predicate.cpp @@ -45,7 +45,7 @@ PredicateType MatchPredicate::type() const { return PredicateType::MATCH; } -Status MatchPredicate::evaluate(const vectorized::NameAndTypePair& name_with_type, +Status MatchPredicate::evaluate(const vectorized::IndexFieldNameAndTypePair& name_with_type, InvertedIndexIterator* iterator, uint32_t num_rows, roaring::Roaring* bitmap) const { if (iterator == nullptr) { diff --git a/be/src/olap/match_predicate.h b/be/src/olap/match_predicate.h index 862bc4a0f59..17d8e76ac88 100644 --- a/be/src/olap/match_predicate.h +++ b/be/src/olap/match_predicate.h @@ -60,7 +60,7 @@ public: } //evaluate predicate on inverted - Status evaluate(const vectorized::NameAndTypePair& name_with_type, + Status evaluate(const vectorized::IndexFieldNameAndTypePair& name_with_type, InvertedIndexIterator* iterator, uint32_t num_rows, roaring::Roaring* bitmap) const override; diff --git 
a/be/src/olap/null_predicate.cpp b/be/src/olap/null_predicate.cpp index 0b184707d8f..06ab85324ef 100644 --- a/be/src/olap/null_predicate.cpp +++ b/be/src/olap/null_predicate.cpp @@ -53,7 +53,7 @@ Status NullPredicate::evaluate(BitmapIndexIterator* iterator, uint32_t num_rows, return Status::OK(); } -Status NullPredicate::evaluate(const vectorized::NameAndTypePair& name_with_type, +Status NullPredicate::evaluate(const vectorized::IndexFieldNameAndTypePair& name_with_type, InvertedIndexIterator* iterator, uint32_t num_rows, roaring::Roaring* bitmap) const { if (iterator->has_null()) { diff --git a/be/src/olap/null_predicate.h b/be/src/olap/null_predicate.h index ccca5c51027..59480264b46 100644 --- a/be/src/olap/null_predicate.h +++ b/be/src/olap/null_predicate.h @@ -52,7 +52,7 @@ public: Status evaluate(BitmapIndexIterator* iterator, uint32_t num_rows, roaring::Roaring* roaring) const override; - Status evaluate(const vectorized::NameAndTypePair& name_with_type, + Status evaluate(const vectorized::IndexFieldNameAndTypePair& name_with_type, InvertedIndexIterator* iterator, uint32_t num_rows, roaring::Roaring* bitmap) const override; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h index 7e819b0dd75..03306110a28 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_file_writer.h @@ -83,6 +83,7 @@ public: std::string get_index_file_path(const TabletIndex* index_meta) const; size_t get_index_file_size() const { return _file_size; } const io::FileSystemSPtr& get_fs() const { return _fs; } + InvertedIndexStorageFormatPB get_storage_format() const { return _storage_format; } private: InvertedIndexDirectoryMap _indices_dirs; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index c838621f92d..e2815dfa108 100644 --- 
a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -26,6 +26,7 @@ #include <memory> #include <ostream> #include <roaring/roaring.hh> +#include <string> #include <vector> #ifdef __clang__ @@ -657,7 +658,19 @@ Status InvertedIndexColumnWriter::create(const Field* field, const TabletIndex* index_meta) { const auto* typeinfo = field->type_info(); FieldType type = typeinfo->type(); - std::string field_name = field->name(); + std::string field_name; + auto storage_format = index_file_writer->get_storage_format(); + if (storage_format == InvertedIndexStorageFormatPB::V1) { + field_name = field->name(); + } else { + if (field->is_extracted_column()) { + // variant sub col + // field_name format: parent_unique_id.sub_col_name + field_name = std::to_string(field->parent_unique_id()) + "." + field->name(); + } else { + field_name = std::to_string(field->unique_id()); + } + } bool single_field = true; if (type == FieldType::OLAP_FIELD_TYPE_ARRAY) { const auto* array_typeinfo = dynamic_cast<const ArrayTypeInfo*>(typeinfo); diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index c3097440e08..c95f0610562 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -327,6 +327,7 @@ Status SegmentIterator::_init_impl(const StorageReadOptions& opts) { } _storage_name_and_type.resize(_schema->columns().size()); + auto storage_format = _opts.tablet_schema->get_inverted_index_storage_format(); for (int i = 0; i < _schema->columns().size(); ++i) { const Field* col = _schema->column(i); if (col) { @@ -336,7 +337,26 @@ Status SegmentIterator::_init_impl(const StorageReadOptions& opts) { if (storage_type == nullptr) { storage_type = vectorized::DataTypeFactory::instance().create_data_type(*col); } - _storage_name_and_type[i] = std::make_pair(col->name(), storage_type); + // Currently, when 
writing a lucene index, the field of the document is column_name, and the column name is + // bound to the index field. Since version 1.2, the data file storage has been changed from column_name to + // column_unique_id, allowing the column name to be changed. Due to current limitations, previous inverted + // index data cannot be used after Doris changes the column name. Column names also support Unicode + // characters, which may cause other problems with indexing in non-ASCII characters. + // After consideration, it was decided to change the field name from column_name to column_unique_id in + // format V2, while format V1 continues to use column_name. + std::string field_name; + if (storage_format == InvertedIndexStorageFormatPB::V1) { + field_name = col->name(); + } else { + if (col->is_extracted_column()) { + // variant sub col + // field_name format: parent_unique_id.sub_col_name + field_name = std::to_string(col->parent_unique_id()) + "." + col->name(); + } else { + field_name = std::to_string(col->unique_id()); + } + } + _storage_name_and_type[i] = std::make_pair(field_name, storage_type); } } return Status::OK(); diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.h b/be/src/olap/rowset/segment_v2/segment_iterator.h index b03ed04603a..a0ef11ece48 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.h +++ b/be/src/olap/rowset/segment_v2/segment_iterator.h @@ -392,7 +392,7 @@ private: // read schema from scanner SchemaSPtr _schema; // storage type schema related to _schema, since column in segment may be different with type in _schema - std::vector<vectorized::NameAndTypePair> _storage_name_and_type; + std::vector<vectorized::IndexFieldNameAndTypePair> _storage_name_and_type; // vector idx -> column iterarator std::vector<std::unique_ptr<ColumnIterator>> _column_iterators; std::vector<std::unique_ptr<BitmapIndexIterator>> _bitmap_index_iterators; diff --git a/be/src/olap/shared_predicate.h b/be/src/olap/shared_predicate.h index 
83c4ae62515..41b18e99ba4 100644 --- a/be/src/olap/shared_predicate.h +++ b/be/src/olap/shared_predicate.h @@ -61,7 +61,7 @@ public: return _nested->evaluate(iterator, num_rows, roaring); } - Status evaluate(const vectorized::NameAndTypePair& name_with_type, + Status evaluate(const vectorized::IndexFieldNameAndTypePair& name_with_type, InvertedIndexIterator* iterator, uint32_t num_rows, roaring::Roaring* bitmap) const override { std::shared_lock<std::shared_mutex> lock(_mtx); diff --git a/be/src/vec/core/columns_with_type_and_name.h b/be/src/vec/core/columns_with_type_and_name.h index 82eae3158ab..f3a329150e4 100644 --- a/be/src/vec/core/columns_with_type_and_name.h +++ b/be/src/vec/core/columns_with_type_and_name.h @@ -30,6 +30,14 @@ namespace doris::vectorized { using ColumnsWithTypeAndName = std::vector<ColumnWithTypeAndName>; -using NameAndTypePair = std::pair<std::string, DataTypePtr>; -using NameAndTypePairs = std::vector<NameAndTypePair>; +// only used in inverted index +// <field_name, storage_type> +// field_name is the name of inverted index document's field +// 1. for inverted_index_storage_format_v1, field_name is the `column_name` in Doris +// 2. for inverted_index_storage_format_v2 +// 2.1 for normal column, field_name is the `column_unique_id` in Doris +// 2.2 for variant column, field_name is the `parent_column_unique_id.sub_column_name` in Doris +// storage_type is the data type in Doris +using IndexFieldNameAndTypePair = std::pair<std::string, DataTypePtr>; +using NameAndTypePairs = std::vector<std::pair<std::string, DataTypePtr>>; } // namespace doris::vectorized --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org