This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-3.0.4 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0.4 by this push: new 39f9074cec7 branch-3.0.4: [fix](array index) Correct null bitmap writing for inverted index #47846 (#48180) 39f9074cec7 is described below commit 39f9074cec769a10a0a93658b35d16ce59630e1e Author: airborne12 <jiang...@selectdb.com> AuthorDate: Fri Feb 21 15:37:56 2025 +0800 branch-3.0.4: [fix](array index) Correct null bitmap writing for inverted index #47846 (#48180) cherry pick from #47846 --- be/src/olap/rowset/segment_v2/column_writer.cpp | 10 +- .../rowset/segment_v2/inverted_index_writer.cpp | 52 +- .../olap/rowset/segment_v2/inverted_index_writer.h | 2 +- be/src/olap/task/index_builder.cpp | 41 +- .../segment_v2/inverted_index_array_test.cpp | 879 ++++++++++++++++++++- 5 files changed, 905 insertions(+), 79 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp index 2637017b78d..f506cb24fce 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -469,7 +469,9 @@ Status ScalarColumnWriter::init() { return Status::OK(); } Status add_nulls(uint32_t count) override { return Status::OK(); } - Status add_array_nulls(uint32_t row_id) override { return Status::OK(); } + Status add_array_nulls(const uint8_t* null_map, size_t num_rows) override { + return Status::OK(); + } Status finish() override { return Status::OK(); } int64_t size() const override { return 0; } void close_on_error() override {} @@ -951,11 +953,7 @@ Status ArrayColumnWriter::append_nullable(const uint8_t* null_map, const uint8_t RETURN_IF_ERROR(append_data(ptr, num_rows)); if (is_nullable()) { if (_opts.need_inverted_index) { - for (int row_id = 0; row_id < num_rows; row_id++) { - if (null_map[row_id] == 1) { - RETURN_IF_ERROR(_inverted_index_builder->add_array_nulls(row_id)); - } - } + RETURN_IF_ERROR(_inverted_index_builder->add_array_nulls(null_map, num_rows)); } RETURN_IF_ERROR(_null_writer->append_data(&null_map, num_rows)); } diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index d85511722ec..093d460ae43 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -303,8 +303,26 @@ public: return Status::OK(); } - Status add_array_nulls(uint32_t row_id) override { - _null_bitmap.add(row_id); + Status add_array_nulls(const uint8_t* null_map, size_t num_rows) override { + DCHECK(_rid >= num_rows); + if (num_rows == 0 || null_map == nullptr) { + return Status::OK(); + } + std::vector<uint32_t> null_indices; + null_indices.reserve(num_rows / 8); + + // because _rid is the row id in block, not segment, and we add data before we add nulls, + // so we need to subtract num_rows to get the row id in segment + for (size_t i = 0; i < num_rows; i++) { + if (null_map[i] == 1) { + null_indices.push_back(_rid - num_rows + static_cast<uint32_t>(i)); + } + } + + if (!null_indices.empty()) { + _null_bitmap.addMany(null_indices.size(), null_indices.data()); + } + return Status::OK(); } @@ -378,8 +396,9 @@ public: return Status::OK(); } - Status add_array_values(size_t field_size, const void* value_ptr, const uint8_t* null_map, - const uint8_t* offsets_ptr, size_t count) override { + Status add_array_values(size_t field_size, const void* value_ptr, + const uint8_t* nested_null_map, const uint8_t* offsets_ptr, + size_t count) override { DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::add_array_values_count_is_zero", { count = 0; }) if (count == 0) { @@ -404,7 +423,7 @@ public: lucene::document::Field* new_field = nullptr; CL_NS(analysis)::TokenStream* ts = nullptr; for (auto j = start_off; j < start_off + array_elem_size; ++j) { - if (null_map[j] == 1) { + if (nested_null_map && nested_null_map[j] == 1) { continue; } auto* v = (Slice*)((const uint8_t*)value_ptr + j * field_size); @@ -500,7 +519,7 @@ public: for (int i = 0; i < count; ++i) { auto array_elem_size = offsets[i + 1] - offsets[i]; for (size_t j = start_off; j < start_off + array_elem_size; ++j) { - if (null_map[j] == 1) { + if (nested_null_map && nested_null_map[j] == 1) { continue; } const CppType* p = &reinterpret_cast<const CppType*>(value_ptr)[j]; @@ -520,7 +539,8 @@ public: DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::add_array_values_field_is_nullptr", { _field = nullptr; }) DBUG_EXECUTE_IF( - "InvertedIndexColumnWriterImpl::add_array_values_index_writer_is_nullptr", + "InvertedIndexColumnWriterImpl::add_array_values_index_writer_is_" + "nullptr", { _index_writer = nullptr; }) if (_field == nullptr || _index_writer == nullptr) { LOG(ERROR) << "field or index writer is null in inverted index writer."; @@ -582,9 +602,10 @@ public: std::string new_value; size_t value_length = sizeof(CppType); - DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::add_value_bkd_writer_add_throw_error", { - _CLTHROWA(CL_ERR_IllegalArgument, ("packedValue should be length=xxx")); - }); + DBUG_EXECUTE_IF( + "InvertedIndexColumnWriterImpl::add_value_bkd_writer_add_throw_" + "error", + { _CLTHROWA(CL_ERR_IllegalArgument, ("packedValue should be length=xxx")); }); _value_key_coder->full_encode_ascending(&value, &new_value); _bkd_writer->add((const uint8_t*)new_value.c_str(), value_length, _rid); @@ -643,8 +664,8 @@ public: _bkd_writer->finish(data_out.get(), index_out.get()), int(field_type)); } else { - LOG(WARNING) - << "Inverted index writer create output error occurred: nullptr"; + LOG(WARNING) << "Inverted index writer create output error " + "occurred: nullptr"; _CLTHROWA(CL_ERR_IO, "Create output error with nullptr"); } } else if constexpr (field_is_slice_type(field_type)) { @@ -653,9 +674,12 @@ public: InvertedIndexDescriptor::get_temporary_null_bitmap_file_name())); write_null_bitmap(null_bitmap_out.get()); DBUG_EXECUTE_IF( - "InvertedIndexWriter._throw_clucene_error_in_fulltext_writer_close", { + "InvertedIndexWriter._throw_clucene_error_in_fulltext_" + "writer_close", + { _CLTHROWA(CL_ERR_IO, - "debug point: test throw error in fulltext index writer"); + "debug point: test throw error in fulltext " + "index writer"); }); } } catch (CLuceneError& e) { diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.h b/be/src/olap/rowset/segment_v2/inverted_index_writer.h index da90752db09..a8f719ee126 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.h @@ -64,7 +64,7 @@ public: size_t count) = 0; virtual Status add_nulls(uint32_t count) = 0; - virtual Status add_array_nulls(uint32_t row_id) = 0; + virtual Status add_array_nulls(const uint8_t* null_map, size_t num_rows) = 0; virtual Status finish() = 0; diff --git a/be/src/olap/task/index_builder.cpp b/be/src/olap/task/index_builder.cpp index bc677ea6f5c..84f2345bb83 100644 --- a/be/src/olap/task/index_builder.cpp +++ b/be/src/olap/task/index_builder.cpp @@ -589,9 +589,9 @@ Status IndexBuilder::_write_inverted_index_data(TabletSchemaSPtr tablet_schema, return converted_result.first; } const auto* ptr = (const uint8_t*)converted_result.second->get_data(); - if (converted_result.second->get_nullmap()) { - RETURN_IF_ERROR(_add_nullable(column_name, writer_sign, field.get(), - converted_result.second->get_nullmap(), &ptr, + const auto* null_map = converted_result.second->get_nullmap(); + if (null_map) { + RETURN_IF_ERROR(_add_nullable(column_name, writer_sign, field.get(), null_map, &ptr, block->rows())); } else { RETURN_IF_ERROR(_add_data(column_name, writer_sign, field.get(), &ptr, block->rows())); @@ -606,18 +606,6 @@ Status IndexBuilder::_add_nullable(const std::string& column_name, const std::pair<int64_t, int64_t>& index_writer_sign, Field* field, const uint8_t* null_map, const uint8_t** ptr, size_t num_rows) { - size_t offset = 0; - auto next_run_step = [&]() { - size_t step = 1; - for (auto i = offset + 1; i < num_rows; ++i) { - if (null_map[offset] == null_map[i]) { - step++; - } else { - break; - } - } - return step; - }; // TODO: need to process null data for inverted index if (field->type() == FieldType::OLAP_FIELD_TYPE_ARRAY) { DCHECK(field->get_sub_field_count() == 1); @@ -638,20 +626,27 @@ Status IndexBuilder::_add_nullable(const std::string& column_name, DBUG_EXECUTE_IF("IndexBuilder::_add_nullable_add_array_values_error", { _CLTHROWA(CL_ERR_IO, "debug point: _add_nullable_add_array_values_error"); }) + RETURN_IF_ERROR(_inverted_index_builders[index_writer_sign]->add_array_nulls(null_map, + num_rows)); } catch (const std::exception& e) { return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>( "CLuceneError occured: {}", e.what()); } - // we should refresh nullmap for array - for (int row_id = 0; row_id < num_rows; row_id++) { - if (null_map && null_map[row_id] == 1) { - RETURN_IF_ERROR( - _inverted_index_builders[index_writer_sign]->add_array_nulls(row_id)); - } - } + return Status::OK(); } - + size_t offset = 0; + auto next_run_step = [&]() { + size_t step = 1; + for (auto i = offset + 1; i < num_rows; ++i) { + if (null_map[offset] == null_map[i]) { + step++; + } else { + break; + } + } + return step; + }; try { do { auto step = next_run_step(); diff --git a/be/test/olap/rowset/segment_v2/inverted_index_array_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index_array_test.cpp index c576097aa5d..c1a24456955 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index_array_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index_array_test.cpp @@ -18,10 +18,13 @@ #include <CLucene.h> #include <CLucene/config/repl_wchar.h> #include <CLucene/index/IndexReader.h> +#include <gen_cpp/olap_file.pb.h> #include <gtest/gtest-message.h> #include <gtest/gtest-test-part.h> +#include <gtest/gtest.h> #include <string.h> +#include <map> #include <memory> #include <string> @@ -30,6 +33,7 @@ #include "io/fs/local_file_system.h" #include "olap/rowset/segment_v2/inverted_index_compound_reader.h" #include "olap/rowset/segment_v2/inverted_index_desc.h" +#include "olap/rowset/segment_v2/inverted_index_file_reader.h" #include "olap/rowset/segment_v2/inverted_index_file_writer.h" #include "olap/rowset/segment_v2/inverted_index_fs_directory.h" #include "olap/rowset/segment_v2/inverted_index_writer.h" @@ -37,6 +41,7 @@ #include "olap/tablet_schema.h" #include "olap/tablet_schema_helper.h" #include "runtime/exec_env.h" +#include "util/faststring.h" #include "util/slice.h" #include "vec/columns/column_array.h" #include "vec/columns/column_nullable.h" @@ -55,11 +60,29 @@ namespace doris { namespace segment_v2 { class InvertedIndexArrayTest : public testing::Test { + using ExpectedDocMap = std::map<std::string, std::vector<int>>; + public: const std::string kTestDir = "./ut_dir/inverted_index_array_test"; - void check_terms_stats(string file_str) { - std::unique_ptr<DorisCompoundReader> reader; + void check_terms_stats(std::string index_prefix, ExpectedDocMap* expected, + std::vector<int> expected_null_bitmap = {}, + InvertedIndexStorageFormatPB format = InvertedIndexStorageFormatPB::V1, + const TabletIndex* index_meta = nullptr) { + std::string file_str; + if (format == InvertedIndexStorageFormatPB::V1) { + file_str = InvertedIndexDescriptor::get_index_file_path_v1(index_prefix, + index_meta->index_id(), ""); + } else if (format == InvertedIndexStorageFormatPB::V2) { + file_str = InvertedIndexDescriptor::get_index_file_path_v2(index_prefix); + } + std::unique_ptr<InvertedIndexFileReader> reader = std::make_unique<InvertedIndexFileReader>( + io::global_local_filesystem(), index_prefix, format); + auto st = reader->init(); + EXPECT_EQ(st, Status::OK()); + auto result = reader->open(index_meta); + EXPECT_TRUE(result.has_value()) << "Failed to open compound reader" << result.error(); + auto compound_reader = std::move(result.value()); try { CLuceneError err; CL_NS(store)::IndexInput* index_input = nullptr; @@ -68,14 +91,33 @@ public: if (!ok) { throw err; } - reader = std::make_unique<DorisCompoundReader>(index_input, 4096); - } catch (...) { - EXPECT_TRUE(false); + + std::shared_ptr<roaring::Roaring> null_bitmap = std::make_shared<roaring::Roaring>(); + const char* null_bitmap_file_name = + InvertedIndexDescriptor::get_temporary_null_bitmap_file_name(); + if (compound_reader->fileExists(null_bitmap_file_name)) { + std::unique_ptr<lucene::store::IndexInput> null_bitmap_in; + assert(compound_reader->openInput(null_bitmap_file_name, null_bitmap_in, err, + 4096)); + size_t null_bitmap_size = null_bitmap_in->length(); + doris::faststring buf; + buf.resize(null_bitmap_size); + null_bitmap_in->readBytes(reinterpret_cast<uint8_t*>(buf.data()), null_bitmap_size); + *null_bitmap = roaring::Roaring::read(reinterpret_cast<char*>(buf.data()), false); + EXPECT_TRUE(expected_null_bitmap.size() == null_bitmap->cardinality()); + for (int i : expected_null_bitmap) { + EXPECT_TRUE(null_bitmap->contains(i)); + } + } + index_input->close(); + _CLLDELETE(index_input); + } catch (const CLuceneError& e) { + EXPECT_TRUE(false) << "CLuceneError: " << e.what(); } std::cout << "Term statistics for " << file_str << std::endl; std::cout << "==================================" << std::endl; - lucene::store::Directory* dir = reader.get(); + lucene::store::Directory* dir = compound_reader.get(); IndexReader* r = IndexReader::open(dir); @@ -90,15 +132,31 @@ public: lucene_wcstoutf8string(te->term(false)->text(), te->term(false)->textLength()); printf("Term: %s ", token.c_str()); + if (expected) { + auto it = expected->find(token); + if (it != expected->end()) { + TermDocs* td = r->termDocs(te->term(false)); + std::vector<int> actual_docs; + while (td->next()) { + actual_docs.push_back(td->doc()); + } + td->close(); + _CLLDELETE(td); + EXPECT_EQ(actual_docs, it->second) << "Term: " << token; + } + } printf("Freq: %d\n", te->docFreq()); } printf("Term count: %d\n\n", nterms); + if (expected) { + ASSERT_EQ(nterms, expected->size()); + } te->close(); _CLLDELETE(te); r->close(); _CLLDELETE(r); - reader->close(); + compound_reader->close(); } void SetUp() override { @@ -121,6 +179,114 @@ public: EXPECT_TRUE(io::global_local_filesystem()->delete_directory(kTestDir).ok()); } + // create a TabletSchema with an array column (and a normal int column as key) + TabletSchemaSPtr create_schema_with_array(KeysType keys_type = DUP_KEYS) { + TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>(); + TabletSchemaPB tablet_schema_pb; + tablet_schema_pb.set_keys_type(keys_type); + + tablet_schema->init_from_pb(tablet_schema_pb); + TabletColumn array; + array.set_name("arr1"); + array.set_type(FieldType::OLAP_FIELD_TYPE_ARRAY); + array.set_length(0); + array.set_index_length(0); + array.set_is_nullable(false); + array.set_is_bf_column(false); + TabletColumn child; + child.set_name("arr_sub_string"); + child.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + child.set_length(INT_MAX); + array.add_sub_column(child); + tablet_schema->append_column(array); + return tablet_schema; + } + + void test_non_null_string(std::string_view rowset_id, int seg_id, Field* field) { + EXPECT_TRUE(field->type() == FieldType::OLAP_FIELD_TYPE_ARRAY); + std::string index_path_prefix {InvertedIndexDescriptor::get_index_file_path_prefix( + local_segment_path(kTestDir, rowset_id, seg_id))}; + int index_id = 26033; + std::string index_path = + InvertedIndexDescriptor::get_index_file_path_v1(index_path_prefix, index_id, ""); + auto fs = io::global_local_filesystem(); + + auto index_meta_pb = std::make_unique<TabletIndexPB>(); + index_meta_pb->set_index_type(IndexType::INVERTED); + index_meta_pb->set_index_id(index_id); + index_meta_pb->set_index_name("index_inverted_arr1"); + index_meta_pb->clear_col_unique_id(); + index_meta_pb->add_col_unique_id(0); + + TabletIndex idx_meta; + idx_meta.index_type(); + idx_meta.init_from_pb(*index_meta_pb.get()); + auto index_file_writer = std::make_unique<InvertedIndexFileWriter>( + fs, index_path_prefix, std::string {rowset_id}, seg_id, + InvertedIndexStorageFormatPB::V1); + std::unique_ptr<segment_v2::InvertedIndexColumnWriter> _inverted_index_builder = nullptr; + EXPECT_EQ(InvertedIndexColumnWriter::create(field, &_inverted_index_builder, + index_file_writer.get(), &idx_meta), + Status::OK()); + + // Construct two arrays: The first row is ["amory","doris"], and the second row is ["amory", "commiter"] + vectorized::Array a1, a2; + a1.push_back("amory"); + a1.push_back("doris"); + a2.push_back("amory"); + a2.push_back("commiter"); + + // Construct array type: DataTypeArray(DataTypeString) + vectorized::DataTypePtr s1 = std::make_shared<vectorized::DataTypeString>(); + vectorized::DataTypePtr array_type = std::make_shared<vectorized::DataTypeArray>(s1); + vectorized::MutableColumnPtr col = array_type->create_column(); + col->insert(a1); + col->insert(a2); + vectorized::ColumnPtr column_array = std::move(col); + vectorized::ColumnWithTypeAndName type_and_name(column_array, array_type, "arr1"); + + // Put the array column into the Block (assuming only this column) + vectorized::Block block; + block.insert(type_and_name); + // block.rows() should be 2 + + // Use OlapBlockDataConvertor to convert + // Note: Here we need a TabletSchema object, in this example we construct a simple schema, + // Assuming that the 0th column in the schema is our array column (the actual UT has the corresponding TabletColumn) + TabletSchemaSPtr tablet_schema = create_schema_with_array(); + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + // The conversion result is actually an array of 4 pointers: + // [0]: Total number of elements (elem_cnt) + // [1]: Offsets array pointer + // [2]: Nested item data pointer + // [3]: Nested nullmap pointer + const auto* data_ptr = reinterpret_cast<const uint64_t*>(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast<const uint8_t*>(data_ptr[1]); + const void* item_data = reinterpret_cast<const void*>(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast<const uint8_t*>(data_ptr[3]); + + // Get the length of the subfield, used for inverted index writing + auto field_size = field->get_sub_field(0)->size(); + // Call the inverted index writing interface, passing in item_data, item_nullmap, offsets_ptr, and the number of rows (the number of array rows in the Block) + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, block.rows()); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, block.rows()); + EXPECT_EQ(st, Status::OK()); + + EXPECT_EQ(_inverted_index_builder->finish(), Status::OK()); + EXPECT_EQ(index_file_writer->close(), Status::OK()); + + ExpectedDocMap expected = {{"amory", {0, 1}}, {"doris", {0}}, {"commiter", {1}}}; + check_terms_stats(index_path_prefix, &expected, {}, InvertedIndexStorageFormatPB::V1, + &idx_meta); + } + void test_string(std::string_view rowset_id, int seg_id, Field* field) { EXPECT_TRUE(field->type() == FieldType::OLAP_FIELD_TYPE_ARRAY); std::string index_path_prefix {InvertedIndexDescriptor::get_index_file_path_prefix( @@ -147,9 +313,8 @@ public: EXPECT_EQ(InvertedIndexColumnWriter::create(field, &_inverted_index_builder, index_file_writer.get(), &idx_meta), Status::OK()); - vectorized::PaddedPODArray<Slice> _slice; - _slice.resize(5); + // Construct two arrays: The first row is ["amory","doris"], and the second row is [NULL, "amory", "commiter"] vectorized::Array a1, a2; a1.push_back("amory"); a1.push_back("doris"); @@ -157,36 +322,644 @@ public: a2.push_back("amory"); a2.push_back("commiter"); + // Construct array type: DataTypeArray(DataTypeNullable(DataTypeString)) vectorized::DataTypePtr s1 = std::make_shared<vectorized::DataTypeNullable>( std::make_shared<vectorized::DataTypeString>()); - vectorized::DataTypePtr au = std::make_shared<vectorized::DataTypeArray>(s1); - vectorized::MutableColumnPtr col = au->create_column(); + vectorized::DataTypePtr array_type = std::make_shared<vectorized::DataTypeArray>(s1); + vectorized::MutableColumnPtr col = array_type->create_column(); col->insert(a1); col->insert(a2); vectorized::ColumnPtr column_array = std::move(col); - vectorized::ColumnWithTypeAndName type_and_name(column_array, au, "arr1"); + vectorized::ColumnWithTypeAndName type_and_name(column_array, array_type, "arr1"); + + // Put the array column into the Block (assuming only this column) + vectorized::Block block; + block.insert(type_and_name); + // block.rows() should be 2 + + // Use OlapBlockDataConvertor to convert + // Note: Here we need a TabletSchema object, in this example we construct a simple schema, + // Assuming that the 0th column in the schema is our array column (the actual UT has the corresponding TabletColumn) + TabletSchemaSPtr tablet_schema = create_schema_with_array(); + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + // The conversion result is actually an array of 4 pointers: + // [0]: Total number of elements (elem_cnt) + // [1]: Offsets array pointer + // [2]: Nested item data pointer + // [3]: Nested nullmap pointer + const auto* data_ptr = reinterpret_cast<const uint64_t*>(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast<const uint8_t*>(data_ptr[1]); + const void* item_data = reinterpret_cast<const void*>(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast<const uint8_t*>(data_ptr[3]); + + // Get the length of the subfield, used for inverted index writing + auto field_size = field->get_sub_field(0)->size(); + // Call the inverted index writing interface, passing in item_data, item_nullmap, offsets_ptr, and the number of rows (the number of array rows in the Block) + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, block.rows()); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, block.rows()); + EXPECT_EQ(st, Status::OK()); + EXPECT_EQ(_inverted_index_builder->finish(), Status::OK()); + EXPECT_EQ(index_file_writer->close(), Status::OK()); + + ExpectedDocMap expected = {{"amory", {0, 1}}, {"doris", {0}}, {"commiter", {1}}}; + check_terms_stats(index_path_prefix, &expected, {}, InvertedIndexStorageFormatPB::V1, + &idx_meta); + } + + void test_null_write_v2(std::string_view rowset_id, int seg_id, Field* field) { + EXPECT_TRUE(field->type() == FieldType::OLAP_FIELD_TYPE_ARRAY); + std::string index_path_prefix {InvertedIndexDescriptor::get_index_file_path_prefix( + local_segment_path(kTestDir, rowset_id, seg_id))}; + int index_id = 26033; + std::string index_path = InvertedIndexDescriptor::get_index_file_path_v2(index_path_prefix); + auto fs = io::global_local_filesystem(); + + auto index_meta_pb = std::make_unique<TabletIndexPB>(); + index_meta_pb->set_index_type(IndexType::INVERTED); + index_meta_pb->set_index_id(index_id); + index_meta_pb->set_index_name("index_inverted_arr1"); + index_meta_pb->clear_col_unique_id(); + index_meta_pb->add_col_unique_id(0); + + TabletIndex idx_meta; + idx_meta.index_type(); + idx_meta.init_from_pb(*index_meta_pb.get()); + io::FileWriterPtr file_writer; + io::FileWriterOptions opts; + Status sts = fs->create_file(index_path, &file_writer, &opts); + ASSERT_TRUE(sts.ok()); + auto index_file_writer = std::make_unique<InvertedIndexFileWriter>( + fs, index_path_prefix, std::string {rowset_id}, seg_id, + InvertedIndexStorageFormatPB::V2, std::move(file_writer)); + std::unique_ptr<segment_v2::InvertedIndexColumnWriter> _inverted_index_builder = nullptr; + EXPECT_EQ(InvertedIndexColumnWriter::create(field, &_inverted_index_builder, + index_file_writer.get(), &idx_meta), + Status::OK()); + + // Simulate outer null cases: 5 rows, outer null map = {1, 0, 0, 1, 0}, i.e., rows 0 and 3 are null + std::vector<uint8_t> outer_null_map = {1, 0, 0, 1, 0}; + + // Construct inner array type: DataTypeArray(DataTypeNullable(DataTypeString)) + vectorized::DataTypePtr inner_string_type = std::make_shared<vectorized::DataTypeNullable>( + std::make_shared<vectorized::DataTypeString>()); + vectorized::DataTypePtr array_type = + std::make_shared<vectorized::DataTypeArray>(inner_string_type); + // To support outer array null values, wrap it in a Nullable type + vectorized::DataTypePtr final_type = + std::make_shared<vectorized::DataTypeNullable>(array_type); + + // Construct 5 rows of data: + // Row 0: null + // Row 1: a2 = [Null, "test"] + // Row 2: a3 = ["mixed", Null, "data"] + // Row 3: null + // Row 4: a5 = ["non-null"] + vectorized::MutableColumnPtr col = final_type->create_column(); + // Row 0: insert null + col->insert(vectorized::Null()); + // Row 1: insert a2 + vectorized::Array a2; + a2.push_back(vectorized::Null()); + a2.push_back("test"); + col->insert(a2); + // Row 2: insert a3 + vectorized::Array a3; + a3.push_back("mixed"); + a3.push_back(vectorized::Null()); + a3.push_back("data"); + col->insert(a3); + // Row 3: insert null + col->insert(vectorized::Null()); + // Row 4: insert a5 + vectorized::Array a5; + a5.push_back("non-null"); + col->insert(a5); + + vectorized::ColumnPtr column_array = std::move(col); + vectorized::ColumnWithTypeAndName type_and_name(column_array, final_type, "arr1"); + + // Construct Block, containing only the array column, with 5 rows + vectorized::Block block; + block.insert(type_and_name); + + // Construct TabletSchema (containing the array column) - reference the existing helper function + TabletSchemaSPtr tablet_schema = create_schema_with_array(); + // In this schema, assume the 0th column is the key, and the arr1 column is the non-key column with index 1 + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + + // Convert array column data + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + // OlapColumnDataConvertorArray conversion result is a 4-tuple: + // [0]: element total count (elem_cnt, not used directly) + // [1]: offsets array pointer + // [2]: nested item data conversion result pointer + // [3]: nested nullmap pointer + const auto* data_ptr = reinterpret_cast<const uint64_t*>(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast<const uint8_t*>(data_ptr[1]); + const void* item_data = reinterpret_cast<const void*>(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast<const uint8_t*>(data_ptr[3]); + + // Call the inverted index writing interface, passing in the converted nested data, nullmap, and offsets + auto field_size = field->get_sub_field(0)->size(); + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, block.rows()); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, block.rows()); + EXPECT_EQ(st, Status::OK()); + EXPECT_EQ(_inverted_index_builder->finish(), Status::OK()); + EXPECT_EQ(index_file_writer->close(), Status::OK()); + + // Expected inverted index result: only index non-null elements + // Row 1: non-null in a2 is "test" + // Row 2: non-null in a3 is "mixed" and "data" + // Row 4: non-null in a5 is "non-null" + ExpectedDocMap expected = {{"test", {1}}, {"mixed", {2}}, {"data", {2}}, {"non-null", {4}}}; + std::vector<int> expected_null_bitmap = {0, 3}; + check_terms_stats(index_path_prefix, &expected, expected_null_bitmap, + InvertedIndexStorageFormatPB::V2, &idx_meta); + } + + void test_null_write(std::string_view rowset_id, int seg_id, Field* field) { + EXPECT_TRUE(field->type() == FieldType::OLAP_FIELD_TYPE_ARRAY); + std::string index_path_prefix {InvertedIndexDescriptor::get_index_file_path_prefix( + local_segment_path(kTestDir, rowset_id, seg_id))}; + int index_id = 26033; + std::string index_path = + InvertedIndexDescriptor::get_index_file_path_v1(index_path_prefix, index_id, ""); + auto fs = io::global_local_filesystem(); + + auto index_meta_pb = std::make_unique<TabletIndexPB>(); + index_meta_pb->set_index_type(IndexType::INVERTED); + index_meta_pb->set_index_id(index_id); + index_meta_pb->set_index_name("index_inverted_arr1"); + index_meta_pb->clear_col_unique_id(); + index_meta_pb->add_col_unique_id(0); + + TabletIndex idx_meta; + idx_meta.index_type(); + idx_meta.init_from_pb(*index_meta_pb.get()); + auto index_file_writer = std::make_unique<InvertedIndexFileWriter>( + fs, index_path_prefix, std::string {rowset_id}, seg_id, + InvertedIndexStorageFormatPB::V1); + std::unique_ptr<segment_v2::InvertedIndexColumnWriter> _inverted_index_builder = nullptr; + EXPECT_EQ(InvertedIndexColumnWriter::create(field, &_inverted_index_builder, + index_file_writer.get(), &idx_meta), + Status::OK()); + + // Simulate outer null cases: 5 rows, outer null map = {1, 0, 0, 1, 0}, i.e., rows 0 and 3 are null + std::vector<uint8_t> outer_null_map = {1, 0, 0, 1, 0}; + + // Construct inner array type: DataTypeArray(DataTypeNullable(DataTypeString)) + vectorized::DataTypePtr inner_string_type = std::make_shared<vectorized::DataTypeNullable>( + std::make_shared<vectorized::DataTypeString>()); + vectorized::DataTypePtr array_type = + std::make_shared<vectorized::DataTypeArray>(inner_string_type); + // To support outer array null values, wrap it in a Nullable type + vectorized::DataTypePtr final_type = + std::make_shared<vectorized::DataTypeNullable>(array_type); + + // Construct 5 rows of data: + // Row 0: null + // Row 1: a2 = [Null, "test"] + // Row 2: a3 = ["mixed", Null, "data"] + // Row 3: null + // Row 4: a5 = ["non-null"] + vectorized::MutableColumnPtr col = final_type->create_column(); + // Row 0: insert null + col->insert(vectorized::Null()); + // Row 1: insert a2 + vectorized::Array a2; + a2.push_back(vectorized::Null()); + a2.push_back("test"); + col->insert(a2); + // Row 2: insert a3 + vectorized::Array a3; + a3.push_back("mixed"); + a3.push_back(vectorized::Null()); + a3.push_back("data"); + col->insert(a3); + // Row 3: insert null + col->insert(vectorized::Null()); + // Row 4: insert a5 + vectorized::Array a5; + a5.push_back("non-null"); + col->insert(a5); + + vectorized::ColumnPtr column_array = std::move(col); + vectorized::ColumnWithTypeAndName type_and_name(column_array, final_type, "arr1"); + + // Construct Block, containing only the array column, with 5 rows + vectorized::Block block; + block.insert(type_and_name); - vectorized::PaddedPODArray<vectorized::UInt64> _offsets; - _offsets.reserve(3); - _offsets.emplace_back(0); - _offsets.emplace_back(2); - _offsets.emplace_back(5); - const uint8_t* offsets_ptr = (const uint8_t*)(_offsets.data()); + // Construct TabletSchema (containing the array column) - reference the existing helper function + TabletSchemaSPtr tablet_schema = create_schema_with_array(); + // In this schema, assume the 0th column is the key, and the arr1 column is the non-key column with index 1 + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); - auto* col_arr = assert_cast<const vectorized::ColumnArray*>(column_array.get()); + // Convert array column data + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + // OlapColumnDataConvertorArray conversion result is a 4-tuple: + // [0]: element total count (elem_cnt, not used directly) + // [1]: offsets array pointer + // [2]: nested item data conversion result pointer + // [3]: nested nullmap pointer + const auto* data_ptr = reinterpret_cast<const uint64_t*>(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast<const uint8_t*>(data_ptr[1]); + const void* item_data = reinterpret_cast<const void*>(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast<const uint8_t*>(data_ptr[3]); + + // Call the inverted index writing interface, passing in the converted nested data, nullmap, and offsets + auto field_size = field->get_sub_field(0)->size(); + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, block.rows()); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, block.rows()); + EXPECT_EQ(st, Status::OK()); + EXPECT_EQ(_inverted_index_builder->finish(), Status::OK()); + EXPECT_EQ(index_file_writer->close(), Status::OK()); + + // Expected inverted index result: only index non-null elements + // Row 1: non-null in a2 is "test" + // Row 2: non-null in a3 is "mixed" and "data" + // Row 4: non-null in a5 is "non-null" + ExpectedDocMap expected = {{"test", {1}}, {"mixed", {2}}, {"data", {2}}, {"non-null", {4}}}; + std::vector<int> expected_null_bitmap = {0, 3}; + check_terms_stats(index_path_prefix, &expected, expected_null_bitmap, + InvertedIndexStorageFormatPB::V1, &idx_meta); + } + + void test_multi_block_write(std::string_view rowset_id, int seg_id, Field* field) { + EXPECT_TRUE(field->type() == FieldType::OLAP_FIELD_TYPE_ARRAY); + std::string index_path_prefix {InvertedIndexDescriptor::get_index_file_path_prefix( + local_segment_path(kTestDir, rowset_id, seg_id))}; + int index_id = 26033; + std::string index_path = + InvertedIndexDescriptor::get_index_file_path_v1(index_path_prefix, index_id, ""); + auto fs = io::global_local_filesystem(); + + auto index_meta_pb = std::make_unique<TabletIndexPB>(); + index_meta_pb->set_index_type(IndexType::INVERTED); + index_meta_pb->set_index_id(index_id); + index_meta_pb->set_index_name("index_inverted_arr1"); + index_meta_pb->clear_col_unique_id(); + index_meta_pb->add_col_unique_id(0); + + TabletIndex idx_meta; + idx_meta.init_from_pb(*index_meta_pb.get()); + auto index_file_writer = std::make_unique<InvertedIndexFileWriter>( + fs, index_path_prefix, "multi_block", 0, InvertedIndexStorageFormatPB::V1); + std::unique_ptr<segment_v2::InvertedIndexColumnWriter> _inverted_index_builder = nullptr; + EXPECT_EQ(InvertedIndexColumnWriter::create(field, &_inverted_index_builder, + index_file_writer.get(), &idx_meta), + Status::OK()); + + ExpectedDocMap merged_expected; + + // --- Block 1 --- + { + const int row_num = 4; + // construct data type: Nullable( Array( Nullable(String) ) ) + vectorized::DataTypePtr inner_string = std::make_shared<vectorized::DataTypeNullable>( + std::make_shared<vectorized::DataTypeString>()); + vectorized::DataTypePtr array_type = + std::make_shared<vectorized::DataTypeArray>(inner_string); + vectorized::DataTypePtr final_type = + std::make_shared<vectorized::DataTypeNullable>(array_type); + + // construct MutableColumn + vectorized::MutableColumnPtr col = final_type->create_column(); + // simulate outer null: row0 and row3 are null, the rest are non-null + col->insert(vectorized::Null()); // row0: null + { + // row1: non-null, array with 1 element: "block1_data1" + vectorized::Array arr; + arr.push_back("block1_data1"); + col->insert(arr); + } + { + // row2: non-null, array with 1 element: "block1_data2" + vectorized::Array arr; + arr.push_back("block1_data2"); + col->insert(arr); + } + col->insert(vectorized::Null()); // row3: null + + vectorized::ColumnPtr column_array = std::move(col); + vectorized::ColumnWithTypeAndName type_and_name(column_array, final_type, "arr1"); + + // construct Block (containing only the arr1 column) + vectorized::Block block; + block.insert(type_and_name); + + // use TabletSchema containing the array column (arr1 is the non-key column with index 1 in the schema) + TabletSchemaSPtr tablet_schema = create_schema_with_array(); + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + + // convert the arr1 column in the block + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + // the conversion result is a 4-tuple: [0]: element count, [1]: offsets pointer, [2]: item data, [3]: item nullmap + const auto* data_ptr = reinterpret_cast<const uint64_t*>(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast<const uint8_t*>(data_ptr[1]); + const void* item_data = reinterpret_cast<const void*>(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast<const uint8_t*>(data_ptr[3]); + auto field_size = field->get_sub_field(0)->size(); + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, row_num); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, row_num); + EXPECT_EQ(st, Status::OK()); + + // for Block1, the expected non-null behavior is row1 and row2 + ExpectedDocMap expected = {{"block1_data1", {1}}, {"block1_data2", {2}}}; + merged_expected.insert(expected.begin(), expected.end()); + } + + // --- Block 2 --- + { + const int row_num = 2; + vectorized::DataTypePtr inner_string = std::make_shared<vectorized::DataTypeNullable>( + std::make_shared<vectorized::DataTypeString>()); + vectorized::DataTypePtr array_type = + std::make_shared<vectorized::DataTypeArray>(inner_string); + vectorized::DataTypePtr final_type = + std::make_shared<vectorized::DataTypeNullable>(array_type); + + vectorized::MutableColumnPtr col = final_type->create_column(); + // row0: non-null, array with 1 element: "block2_data1" + { + vectorized::Array arr; + arr.push_back("block2_data1"); + col->insert(arr); + } + // row1: null + col->insert(vectorized::Null()); + + vectorized::ColumnPtr column_array = std::move(col); + vectorized::ColumnWithTypeAndName type_and_name(column_array, final_type, "arr1"); + + vectorized::Block block; + block.insert(type_and_name); + + TabletSchemaSPtr tablet_schema = create_schema_with_array(); + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + const auto* data_ptr = reinterpret_cast<const uint64_t*>(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast<const uint8_t*>(data_ptr[1]); + const void* item_data = reinterpret_cast<const void*>(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast<const uint8_t*>(data_ptr[3]); + + auto field_size = field->get_sub_field(0)->size(); + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, row_num); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, row_num); + EXPECT_EQ(st, Status::OK()); + + ExpectedDocMap expected = {{"block2_data1", {4}}}; + merged_expected.insert(expected.begin(), expected.end()); + } + + // --- Block 3 --- + { + const int row_num = 2; + vectorized::DataTypePtr inner_string = std::make_shared<vectorized::DataTypeNullable>( + std::make_shared<vectorized::DataTypeString>()); + vectorized::DataTypePtr array_type = + std::make_shared<vectorized::DataTypeArray>(inner_string); + vectorized::DataTypePtr final_type = + std::make_shared<vectorized::DataTypeNullable>(array_type); + + vectorized::MutableColumnPtr col = final_type->create_column(); + // row0: non-null, array with 1 element: "block3_data1" + { + vectorized::Array arr; + arr.push_back("block3_data1"); + col->insert(arr); + } + // row1: null + col->insert(vectorized::Null()); + + vectorized::ColumnPtr column_array = std::move(col); + vectorized::ColumnWithTypeAndName type_and_name(column_array, final_type, "arr1"); + + vectorized::Block block; + block.insert(type_and_name); + + TabletSchemaSPtr tablet_schema = create_schema_with_array(); + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + const auto* data_ptr = reinterpret_cast<const uint64_t*>(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast<const uint8_t*>(data_ptr[1]); + const void* item_data = reinterpret_cast<const void*>(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast<const uint8_t*>(data_ptr[3]); + auto field_size = field->get_sub_field(0)->size(); + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, row_num); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, row_num); + EXPECT_EQ(st, Status::OK()); + + ExpectedDocMap expected = {{"block3_data1", {6}}}; + merged_expected.insert(expected.begin(), expected.end()); + } + + EXPECT_EQ(_inverted_index_builder->finish(), Status::OK()); + EXPECT_EQ(index_file_writer->close(), Status::OK()); + + std::vector<int> expected_null_bitmap = {0, 3, 5, 7}; + check_terms_stats(index_path_prefix, &merged_expected, expected_null_bitmap, + InvertedIndexStorageFormatPB::V1, &idx_meta); + } + + void test_array_numeric(std::string_view rowset_id, int seg_id, Field* field) { + EXPECT_TRUE(field->type() == FieldType::OLAP_FIELD_TYPE_ARRAY); + std::string index_path_prefix {InvertedIndexDescriptor::get_index_file_path_prefix( + local_segment_path(kTestDir, rowset_id, seg_id))}; + int index_id = 26033; + std::string index_path = + InvertedIndexDescriptor::get_index_file_path_v1(index_path_prefix, index_id, ""); + auto fs = io::global_local_filesystem(); + + auto index_meta_pb = std::make_unique<TabletIndexPB>(); + index_meta_pb->set_index_type(IndexType::INVERTED); + index_meta_pb->set_index_id(index_id); + index_meta_pb->set_index_name("index_inverted_arr_numeric"); + index_meta_pb->clear_col_unique_id(); + index_meta_pb->add_col_unique_id(0); + + TabletIndex idx_meta; + idx_meta.init_from_pb(*index_meta_pb.get()); + auto index_file_writer = std::make_unique<InvertedIndexFileWriter>( + fs, index_path_prefix, std::string {rowset_id}, seg_id, + InvertedIndexStorageFormatPB::V1); + std::unique_ptr<segment_v2::InvertedIndexColumnWriter> _inverted_index_builder = nullptr; + EXPECT_EQ(InvertedIndexColumnWriter::create(field, &_inverted_index_builder, + index_file_writer.get(), &idx_meta), + Status::OK()); + + vectorized::DataTypePtr inner_int = std::make_shared<vectorized::DataTypeInt32>(); + vectorized::DataTypePtr array_type = std::make_shared<vectorized::DataTypeArray>(inner_int); + vectorized::DataTypePtr final_type = + std::make_shared<vectorized::DataTypeNullable>(array_type); + + // create a MutableColumnPtr + vectorized::MutableColumnPtr col = final_type->create_column(); + // row0: non-null, array [123, 456] + { + vectorized::Array arr; + arr.push_back(123); + arr.push_back(456); + col->insert(arr); + } + // row1: null + col->insert(vectorized::Null()); + // row2: non-null, array [789, 101112] + { + vectorized::Array arr; + arr.push_back(789); + arr.push_back(101112); + col->insert(arr); + } + // wrap the constructed column into a ColumnWithTypeAndName + vectorized::ColumnPtr column_array = std::move(col); + vectorized::ColumnWithTypeAndName type_and_name(column_array, final_type, "arr_num"); + + // construct Block (containing only this column), with 3 rows + vectorized::Block block; + block.insert(type_and_name); + + TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>(); + TabletSchemaPB tablet_schema_pb; + tablet_schema_pb.set_keys_type(KeysType::DUP_KEYS); + + tablet_schema->init_from_pb(tablet_schema_pb); + TabletColumn array; + array.set_name("arr1"); + array.set_type(FieldType::OLAP_FIELD_TYPE_ARRAY); + array.set_length(0); + array.set_index_length(0); + array.set_is_nullable(false); + array.set_is_bf_column(false); + TabletColumn child; + child.set_name("arr_sub_int"); + child.set_type(FieldType::OLAP_FIELD_TYPE_INT); + child.set_length(INT_MAX); + array.add_sub_column(child); + tablet_schema->append_column(array); + + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + // the conversion result is a 4-tuple: [0]: element total count, [1]: offsets pointer, [2]: item data, [3]: item nullmap + const auto* data_ptr = reinterpret_cast<const uint64_t*>(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast<const uint8_t*>(data_ptr[1]); + const void* item_data = reinterpret_cast<const void*>(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast<const uint8_t*>(data_ptr[3]); + + // get the size of the sub field (4 bytes for INT type) + auto field_size = field->get_sub_field(0)->size(); + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, block.rows()); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, block.rows()); + EXPECT_EQ(st, Status::OK()); + EXPECT_EQ(_inverted_index_builder->finish(), Status::OK()); + EXPECT_EQ(index_file_writer->close(), Status::OK()); + + // expected inverted index: row0 contains "123" and "456" (doc id 0), row1 is null, row2 contains "789" and "101112" (doc id 2) + ExpectedDocMap expected = {{"123", {0}}, {"456", {0}}, {"789", {2}}, {"101112", {2}}}; + std::vector<int> expected_null_bitmap = {1}; + + std::unique_ptr<InvertedIndexFileReader> reader = std::make_unique<InvertedIndexFileReader>( + io::global_local_filesystem(), index_path_prefix, InvertedIndexStorageFormatPB::V1); + auto sts = reader->init(); + EXPECT_EQ(sts, Status::OK()); + auto result = reader->open(&idx_meta); + EXPECT_TRUE(result.has_value()) << "Failed to open compound reader" << result.error(); + auto compound_reader = std::move(result.value()); + try { + CLuceneError err; + CL_NS(store)::IndexInput* index_input = nullptr; + auto ok = DorisFSDirectory::FSIndexInput::open( + io::global_local_filesystem(), index_path.c_str(), index_input, err, 4096); + if (!ok) { + throw err; + } + + std::shared_ptr<roaring::Roaring> null_bitmap = std::make_shared<roaring::Roaring>(); + const char* null_bitmap_file_name = + InvertedIndexDescriptor::get_temporary_null_bitmap_file_name(); + if (compound_reader->fileExists(null_bitmap_file_name)) { + std::unique_ptr<lucene::store::IndexInput> null_bitmap_in; + assert(compound_reader->openInput(null_bitmap_file_name, null_bitmap_in, err, + 4096)); + size_t null_bitmap_size = null_bitmap_in->length(); + doris::faststring buf; + buf.resize(null_bitmap_size); + null_bitmap_in->readBytes(reinterpret_cast<uint8_t*>(buf.data()), null_bitmap_size); + *null_bitmap = roaring::Roaring::read(reinterpret_cast<char*>(buf.data()), false); + assert(expected_null_bitmap.size() == null_bitmap->cardinality()); + for (int i : expected_null_bitmap) { + EXPECT_TRUE(null_bitmap->contains(i)); + } + } + index_input->close(); + _CLLDELETE(index_input); + } catch (const CLuceneError& e) { + EXPECT_TRUE(false) << "CLuceneError: " << e.what(); + } + } + +private: + static void build_slices(vectorized::PaddedPODArray<Slice>& slices, + const vectorized::ColumnPtr& column_array, size_t num_strings) { + const auto* col_arr = assert_cast<const vectorized::ColumnArray*>(column_array.get()); const vectorized::UInt8* nested_null_map = assert_cast<const vectorized::ColumnNullable*>(col_arr->get_data_ptr().get()) - ->get_null_map_data() + ->get_null_map_column() + .get_data() .data(); - auto* col_arr_str = assert_cast<const vectorized::ColumnString*>( + const auto* col_arr_str = assert_cast<const vectorized::ColumnString*>( assert_cast<const vectorized::ColumnNullable*>(col_arr->get_data_ptr().get()) ->get_nested_column_ptr() .get()); const char* char_data = (const char*)(col_arr_str->get_chars().data()); const vectorized::ColumnString::Offset* offset_cur = col_arr_str->get_offsets().data(); - const vectorized::ColumnString::Offset* offset_end = offset_cur + 5; - - Slice* slice = _slice.data(); + const vectorized::ColumnString::Offset* offset_end = offset_cur + num_strings; + Slice* slice = slices.data(); size_t string_offset = *(offset_cur - 1); const vectorized::UInt8* nullmap_cur = nested_null_map; while (offset_cur != offset_end) { @@ -202,16 +975,6 @@ public: ++slice; ++offset_cur; } - - auto field_size = field->get_sub_field(0)->size(); - Status st = _inverted_index_builder->add_array_values( - field_size, reinterpret_cast<const void*>(_slice.data()), - reinterpret_cast<const uint8_t*>(nested_null_map), offsets_ptr, 2); - EXPECT_EQ(st, Status::OK()); - EXPECT_EQ(_inverted_index_builder->finish(), Status::OK()); - EXPECT_EQ(index_file_writer->close(), Status::OK()); - - check_terms_stats(index_path); } }; @@ -227,8 +990,54 @@ TEST_F(InvertedIndexArrayTest, ArrayString) { arrayTabletColumn.add_sub_column(arraySubColumn); Field* field = FieldFactory::create(arrayTabletColumn); test_string("rowset_id", 0, field); + test_non_null_string("rowset_id_non_null", 0, field); + delete field; +} + +TEST_F(InvertedIndexArrayTest, ComplexNullCases) { + TabletColumn arrayTabletColumn; + arrayTabletColumn.set_unique_id(0); + arrayTabletColumn.set_name("arr1"); + arrayTabletColumn.set_type(FieldType::OLAP_FIELD_TYPE_ARRAY); + TabletColumn arraySubColumn; + arraySubColumn.set_unique_id(1); + arraySubColumn.set_name("arr_sub_string"); + arraySubColumn.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + arrayTabletColumn.add_sub_column(arraySubColumn); + Field* field = FieldFactory::create(arrayTabletColumn); + test_null_write("complex_null", 0, field); + test_null_write_v2("complex_null_v2", 0, field); + delete field; +} + +TEST_F(InvertedIndexArrayTest, MultiBlockWrite) { + TabletColumn arrayTabletColumn; + arrayTabletColumn.set_unique_id(0); + arrayTabletColumn.set_name("arr1"); + arrayTabletColumn.set_type(FieldType::OLAP_FIELD_TYPE_ARRAY); + TabletColumn arraySubColumn; + arraySubColumn.set_unique_id(1); + arraySubColumn.set_name("arr_sub_string"); + arraySubColumn.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + arrayTabletColumn.add_sub_column(arraySubColumn); + Field* field = FieldFactory::create(arrayTabletColumn); + test_multi_block_write("multi_block", 0, field); delete field; } +TEST_F(InvertedIndexArrayTest, ArrayInt) { + TabletColumn arrayTabletColumn; + arrayTabletColumn.set_unique_id(0); + arrayTabletColumn.set_name("arr1"); + arrayTabletColumn.set_type(FieldType::OLAP_FIELD_TYPE_ARRAY); + TabletColumn arraySubColumn; + arraySubColumn.set_unique_id(1); + arraySubColumn.set_name("arr_sub_int"); + arraySubColumn.set_type(FieldType::OLAP_FIELD_TYPE_INT); + arrayTabletColumn.add_sub_column(arraySubColumn); + Field* field = FieldFactory::create(arrayTabletColumn); + test_array_numeric("int_test", 0, field); + delete field; +} } // namespace segment_v2 -} // namespace doris +} // namespace doris \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org