This is an automated email from the ASF dual-hosted git repository. dataroaring pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 24a994eb9f [Feature-WIP](inverted) add inverted index writer api for be (#14207) 24a994eb9f is described below commit 24a994eb9fb9e9b9eee2b79ced4bea55b9a98d8d Author: airborne12 <airborn...@gmail.com> AuthorDate: Mon Dec 26 15:02:12 2022 +0800 [Feature-WIP](inverted) add inverted index writer api for be (#14207) --- be/src/io/fs/file_writer.h | 3 ++ be/src/io/fs/local_file_system.cpp | 2 +- be/src/io/fs/local_file_writer.cpp | 6 +++ be/src/io/fs/local_file_writer.h | 7 +++ be/src/io/fs/s3_file_writer.h | 6 +++ be/src/olap/rowset/rowset_writer_context.h | 1 + be/src/olap/rowset/segment_v2/column_writer.cpp | 49 +++++++++++++++++- be/src/olap/rowset/segment_v2/column_writer.h | 11 ++++ .../olap/rowset/segment_v2/inverted_index_writer.h | 58 ++++++++++++++++++++++ be/src/olap/rowset/segment_v2/segment_writer.cpp | 23 +++++++++ be/src/olap/rowset/segment_v2/segment_writer.h | 4 ++ 11 files changed, 168 insertions(+), 2 deletions(-) diff --git a/be/src/io/fs/file_writer.h b/be/src/io/fs/file_writer.h index 804a9f329d..7c96560397 100644 --- a/be/src/io/fs/file_writer.h +++ b/be/src/io/fs/file_writer.h @@ -26,6 +26,7 @@ namespace doris { namespace io { +class FileSystem; class FileWriter { public: @@ -52,6 +53,8 @@ public: virtual size_t bytes_appended() const = 0; + virtual FileSystem* fs() const = 0; + const Path& path() const { return _path; } protected: diff --git a/be/src/io/fs/local_file_system.cpp b/be/src/io/fs/local_file_system.cpp index d9e43c21a8..c1b062601c 100644 --- a/be/src/io/fs/local_file_system.cpp +++ b/be/src/io/fs/local_file_system.cpp @@ -42,7 +42,7 @@ Status LocalFileSystem::create_file(const Path& path, FileWriterPtr* writer) { if (-1 == fd) { return Status::IOError("cannot open {}: {}", fs_path.native(), std::strerror(errno)); } - *writer = std::make_unique<LocalFileWriter>(std::move(fs_path), fd); + *writer = std::make_unique<LocalFileWriter>(std::move(fs_path), fd, this); return Status::OK(); } diff --git a/be/src/io/fs/local_file_writer.cpp b/be/src/io/fs/local_file_writer.cpp index 2b28b501f7..bbc208916f 100644 --- a/be/src/io/fs/local_file_writer.cpp +++ b/be/src/io/fs/local_file_writer.cpp @@ -56,6 +56,12 @@ Status sync_dir(const io::Path& dirname) { namespace io { +LocalFileWriter::LocalFileWriter(Path path, int fd, FileSystem* fs) + : FileWriter(std::move(path)), _fd(fd), _fs(fs) { + DorisMetrics::instance()->local_file_open_writing->increment(1); + DorisMetrics::instance()->local_file_writer_total->increment(1); +} + LocalFileWriter::LocalFileWriter(Path path, int fd) : FileWriter(std::move(path)), _fd(fd) { DorisMetrics::instance()->local_file_open_writing->increment(1); DorisMetrics::instance()->local_file_writer_total->increment(1); diff --git a/be/src/io/fs/local_file_writer.h b/be/src/io/fs/local_file_writer.h index 8ea548ffcc..ecac1a6f76 100644 --- a/be/src/io/fs/local_file_writer.h +++ b/be/src/io/fs/local_file_writer.h @@ -19,6 +19,7 @@ #include <cstddef> +#include "io/fs/file_system.h" #include "io/fs/file_writer.h" namespace doris { @@ -26,7 +27,10 @@ namespace io { class LocalFileWriter final : public FileWriter { public: + LocalFileWriter(Path path, int fd, FileSystem* fs); + LocalFileWriter(Path path, int fd); + ~LocalFileWriter() override; Status close() override; @@ -43,11 +47,14 @@ public: size_t bytes_appended() const override { return _bytes_appended; } + FileSystem* fs() const override { return _fs; } + private: Status _close(bool sync); private: int _fd; // owned + FileSystem* _fs; size_t _bytes_appended = 0; bool _dirty = false; diff --git a/be/src/io/fs/s3_file_writer.h b/be/src/io/fs/s3_file_writer.h index d3abc19ba8..8c917c2d26 100644 --- a/be/src/io/fs/s3_file_writer.h +++ b/be/src/io/fs/s3_file_writer.h @@ -21,6 +21,7 @@ #include <list> #include "io/fs/file_writer.h" +#include "io/fs/s3_file_system.h" #include "util/s3_util.h" namespace Aws::S3 { @@ -52,6 +53,11 @@ public: size_t bytes_appended() const override { return _bytes_appended; } + FileSystem* fs() const override { return _fs; } + +private: + S3FileSystem* _fs; + private: Status _close(); diff --git a/be/src/olap/rowset/rowset_writer_context.h b/be/src/olap/rowset/rowset_writer_context.h index 8fef7bb16a..53ef8e8d1b 100644 --- a/be/src/olap/rowset/rowset_writer_context.h +++ b/be/src/olap/rowset/rowset_writer_context.h @@ -99,6 +99,7 @@ struct RowsetWriterContext { int64_t oldest_write_timestamp; int64_t newest_write_timestamp; bool enable_unique_key_merge_on_write = false; + std::set<int32_t> skip_inverted_index; }; } // namespace doris diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp index 2f1696a435..146a29443c 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -22,10 +22,12 @@ #include "common/logging.h" #include "env/env.h" #include "gutil/strings/substitute.h" +#include "io/fs/file_writer.h" #include "olap/rowset/segment_v2/bitmap_index_writer.h" #include "olap/rowset/segment_v2/bloom_filter.h" #include "olap/rowset/segment_v2/bloom_filter_index_writer.h" #include "olap/rowset/segment_v2/encoding_info.h" +#include "olap/rowset/segment_v2/inverted_index_writer.h" #include "olap/rowset/segment_v2/options.h" #include "olap/rowset/segment_v2/ordinal_page_index.h" #include "olap/rowset/segment_v2/page_builder.h" @@ -96,6 +98,7 @@ Status ColumnWriter::create(const ColumnWriterOptions& opts, const TabletColumn* item_options.need_zone_map = false; item_options.need_bloom_filter = item_column.is_bf_column(); item_options.need_bitmap_index = item_column.has_bitmap_index(); + item_options.inverted_index = nullptr; if (item_column.type() == FieldType::OLAP_FIELD_TYPE_ARRAY) { if (item_options.need_bloom_filter) { return Status::NotSupported("Do not support bloom filter for array type"); @@ -296,6 +299,13 @@ Status ScalarColumnWriter::init() { RETURN_IF_ERROR( BitmapIndexWriter::create(get_field()->type_info(), &_bitmap_index_builder)); } + if (_opts.inverted_index) { + RETURN_IF_ERROR(InvertedIndexColumnWriter::create( + get_field(), &_inverted_index_builder, _opts.meta->unique_id(), + _file_writer->path().filename().native(), + _file_writer->path().parent_path().native(), _opts.inverted_index, + _file_writer->fs())); + } if (_opts.need_bloom_filter) { RETURN_IF_ERROR(BloomFilterIndexWriter::create( BloomFilterOptions(), get_field()->type_info(), &_bloom_filter_index_builder)); @@ -312,6 +322,9 @@ Status ScalarColumnWriter::append_nulls(size_t num_rows) { if (_opts.need_bitmap_index) { _bitmap_index_builder->add_nulls(num_rows); } + if (_opts.inverted_index) { + _inverted_index_builder->add_nulls(num_rows); + } if (_opts.need_bloom_filter) { _bloom_filter_index_builder->add_nulls(num_rows); } @@ -344,6 +357,9 @@ Status ScalarColumnWriter::append_data_in_current_page(const uint8_t* data, size if (_opts.need_bitmap_index) { _bitmap_index_builder->add_values(data, *num_written); } + if (_opts.inverted_index) { + _inverted_index_builder->add_values(get_field()->name(), data, *num_written); + } if (_opts.need_bloom_filter) { _bloom_filter_index_builder->add_values(data, *num_written); } @@ -432,6 +448,13 @@ Status ScalarColumnWriter::write_bitmap_index() { return Status::OK(); } +Status ScalarColumnWriter::write_inverted_index() { + if (_opts.inverted_index) { + return _inverted_index_builder->finish(); + } + return Status::OK(); +} + Status ScalarColumnWriter::write_bloom_filter_index() { if (_opts.need_bloom_filter) { return _bloom_filter_index_builder->finish(_file_writer, _opts.meta->add_indexes()); @@ -532,7 +555,16 @@ Status ArrayColumnWriter::init() { } RETURN_IF_ERROR(_item_writer->init()); _offset_writer->register_flush_page_callback(this); - + if (_opts.inverted_index) { + auto writer = dynamic_cast<ScalarColumnWriter*>(_item_writer.get()); + if (writer != nullptr) { + RETURN_IF_ERROR(InvertedIndexColumnWriter::create( + get_field(), &_inverted_index_builder, _opts.meta->unique_id(), + writer->_file_writer->path().filename().native(), + writer->_file_writer->path().parent_path().native(), _opts.inverted_index, + writer->_file_writer->fs())); + } + } return Status::OK(); } @@ -541,6 +573,13 @@ Status ArrayColumnWriter::put_extra_info_in_page(DataPageFooterPB* footer) { return Status::OK(); } +Status ArrayColumnWriter::write_inverted_index() { + if (_opts.inverted_index) { + return _inverted_index_builder->finish(); + } + return Status::OK(); +} + // Now we can only write data one by one. Status ArrayColumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { size_t remaining = num_rows; @@ -567,6 +606,14 @@ Status ArrayColumnWriter::append_data(const uint8_t** ptr, size_t num_rows) { RETURN_IF_ERROR(_item_writer->append_data(reinterpret_cast<const uint8_t**>(&data), col_cursor->length())); } + if (_opts.inverted_index) { + auto writer = dynamic_cast<ScalarColumnWriter*>(_item_writer.get()); + if (writer != nullptr) { + //NOTE: use array field name as index field, but item_writer size should be used when moving item_data_ptr + _inverted_index_builder->add_array_values(_item_writer->get_field()->size(), + col_cursor, 1); + } + } } remaining -= num_written; col_cursor += num_written; diff --git a/be/src/olap/rowset/segment_v2/column_writer.h b/be/src/olap/rowset/segment_v2/column_writer.h index 45829b37d9..748b8d725e 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.h +++ b/be/src/olap/rowset/segment_v2/column_writer.h @@ -21,6 +21,7 @@ #include "common/status.h" // for Status #include "gen_cpp/segment_v2.pb.h" // for EncodingTypePB +#include "olap/inverted_index_parser.h" #include "olap/rowset/segment_v2/common.h" #include "olap/rowset/segment_v2/page_pointer.h" // for PagePointer #include "olap/tablet_schema.h" // for TabletColumn @@ -50,6 +51,8 @@ struct ColumnWriterOptions { bool need_zone_map = false; bool need_bitmap_index = false; bool need_bloom_filter = false; + std::vector<const TabletIndex*> indexes; + const TabletIndex* inverted_index = nullptr; std::string to_string() const { std::stringstream ss; ss << std::boolalpha << "meta=" << meta->DebugString() @@ -62,6 +65,7 @@ struct ColumnWriterOptions { }; class BitmapIndexWriter; +class InvertedIndexColumnWriter; class EncodingInfo; class NullBitmapBuilder; class OrdinalIndexWriter; @@ -126,6 +130,8 @@ public: virtual Status write_bitmap_index() = 0; + virtual Status write_inverted_index() = 0; + virtual Status write_bloom_filter_index() = 0; virtual ordinal_t get_next_rowid() const = 0; @@ -174,6 +180,7 @@ public: Status write_ordinal_index() override; Status write_zone_map() override; Status write_bitmap_index() override; + Status write_inverted_index() override; Status write_bloom_filter_index() override; ordinal_t get_next_rowid() const override { return _next_rowid; } @@ -186,6 +193,7 @@ public: Status append_data_in_current_page(const uint8_t** ptr, size_t* num_written); Status append_data_in_current_page(const uint8_t* ptr, size_t* num_written); + friend class ArrayColumnWriter; private: std::unique_ptr<PageBuilder> _page_builder; @@ -247,6 +255,7 @@ private: std::unique_ptr<OrdinalIndexWriter> _ordinal_index_builder; std::unique_ptr<ZoneMapIndexWriter> _zone_map_index_builder; std::unique_ptr<BitmapIndexWriter> _bitmap_index_builder; + std::unique_ptr<InvertedIndexColumnWriter> _inverted_index_builder; std::unique_ptr<BloomFilterIndexWriter> _bloom_filter_index_builder; // call before flush data page. @@ -286,6 +295,7 @@ public: } return Status::OK(); } + Status write_inverted_index() override; Status write_bloom_filter_index() override { if (_opts.need_bloom_filter) { return Status::NotSupported("array not support bloom filter index"); @@ -303,6 +313,7 @@ private: std::unique_ptr<ScalarColumnWriter> _offset_writer; std::unique_ptr<ScalarColumnWriter> _null_writer; std::unique_ptr<ColumnWriter> _item_writer; + std::unique_ptr<InvertedIndexColumnWriter> _inverted_index_builder; ColumnWriterOptions _opts; }; diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.h b/be/src/olap/rowset/segment_v2/inverted_index_writer.h new file mode 100644 index 0000000000..2554c42c79 --- /dev/null +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.h @@ -0,0 +1,58 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "common/status.h" +#include "olap/inverted_index_parser.h" +#include "olap/olap_common.h" +#include "olap/tablet_schema.h" + +namespace doris { +class CollectionValue; + +namespace segment_v2 { + +class InvertedIndexColumnWriter { +public: + static Status create(const Field* field, std::unique_ptr<InvertedIndexColumnWriter>* res, + uint32_t uuid, const std::string& segment_file_name, + const std::string& dir, const TabletIndex* inverted_index, + io::FileSystem* fs) { + return Status::OK(); + } + virtual Status init() = 0; + + InvertedIndexColumnWriter() = default; + virtual ~InvertedIndexColumnWriter() = default; + + virtual Status add_values(const std::string name, const void* values, size_t count) = 0; + virtual Status add_array_values(size_t field_size, const CollectionValue* values, + size_t count) = 0; + + virtual Status add_nulls(uint32_t count) = 0; + + virtual Status finish() = 0; + + virtual uint64_t size() const = 0; + +private: + DISALLOW_COPY_AND_ASSIGN(InvertedIndexColumnWriter); +}; + +} // namespace segment_v2 +} // namespace doris \ No newline at end of file diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index 1efe66f97a..9776761aa6 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -24,6 +24,7 @@ #include "olap/primary_key_index.h" #include "olap/row.h" // ContiguousRow #include "olap/row_cursor.h" // RowCursor +#include "olap/rowset/rowset_writer_context.h" // RowsetWriterContext #include "olap/rowset/segment_v2/column_writer.h" // ColumnWriter #include "olap/rowset/segment_v2/page_io.h" #include "olap/schema.h" @@ -112,6 +113,20 @@ Status SegmentWriter::init(const std::vector<uint32_t>& col_ids, bool has_key) { opts.need_zone_map = column.is_key() || _tablet_schema->keys_type() != KeysType::AGG_KEYS; opts.need_bloom_filter = column.is_bf_column(); opts.need_bitmap_index = column.has_bitmap_index(); + bool skip_inverted_index = false; + if (_opts.rowset_ctx != nullptr) { + skip_inverted_index = + _opts.rowset_ctx->skip_inverted_index.count(column.unique_id()) > 0; + } + // indexes for this column + opts.indexes = _tablet_schema->get_indexes_for_column(column.unique_id()); + for (auto index : opts.indexes) { + if (!skip_inverted_index && index && index->index_type() == IndexType::INVERTED) { + opts.inverted_index = index; + // TODO support multiple inverted index + break; + } + } if (column.type() == FieldType::OLAP_FIELD_TYPE_ARRAY) { opts.need_zone_map = false; if (opts.need_bloom_filter) { @@ -362,6 +377,7 @@ Status SegmentWriter::finalize_columns(uint64_t* index_size) { RETURN_IF_ERROR(_write_ordinal_index()); RETURN_IF_ERROR(_write_zone_map()); RETURN_IF_ERROR(_write_bitmap_index()); + RETURN_IF_ERROR(_write_inverted_index()); RETURN_IF_ERROR(_write_bloom_filter_index()); *index_size = _file_writer->bytes_appended() - index_offset; @@ -437,6 +453,13 @@ Status SegmentWriter::_write_bitmap_index() { return Status::OK(); } +Status SegmentWriter::_write_inverted_index() { + for (auto& column_writer : _column_writers) { + RETURN_IF_ERROR(column_writer->write_inverted_index()); + } + return Status::OK(); +} + Status SegmentWriter::_write_bloom_filter_index() { for (auto& column_writer : _column_writers) { RETURN_IF_ERROR(column_writer->write_bloom_filter_index()); diff --git a/be/src/olap/rowset/segment_v2/segment_writer.h b/be/src/olap/rowset/segment_v2/segment_writer.h index e49b9f03fd..6e18a0735b 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.h +++ b/be/src/olap/rowset/segment_v2/segment_writer.h @@ -43,6 +43,7 @@ class TabletColumn; class ShortKeyIndexBuilder; class PrimaryKeyIndexBuilder; class KeyCoder; +struct RowsetWriterContext; namespace io { class FileWriter; @@ -58,6 +59,8 @@ extern const uint32_t k_segment_magic_length; struct SegmentWriterOptions { uint32_t num_rows_per_block = 1024; bool enable_unique_key_merge_on_write = false; + + RowsetWriterContext* rowset_ctx = nullptr; }; class SegmentWriter { @@ -105,6 +108,7 @@ private: Status _write_ordinal_index(); Status _write_zone_map(); Status _write_bitmap_index(); + Status _write_inverted_index(); Status _write_bloom_filter_index(); Status _write_short_key_index(); Status _write_primary_key_index(); --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org