This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new 1aa57a3b130 branch-2.1: [fix](array index) Correct null bitmap writing for inverted index #47846 (#48214) 1aa57a3b130 is described below commit 1aa57a3b130795b21def9a122ab00f2ff843cfcd Author: airborne12 <jiang...@selectdb.com> AuthorDate: Tue Feb 25 20:31:18 2025 +0800 branch-2.1: [fix](array index) Correct null bitmap writing for inverted index #47846 (#48214) cherry pick from #47846 #48231 --- be/src/olap/rowset/segment_v2/column_writer.cpp | 10 +- .../rowset/segment_v2/inverted_index_writer.cpp | 57 +- .../olap/rowset/segment_v2/inverted_index_writer.h | 2 +- be/src/olap/task/index_builder.cpp | 57 +- .../segment_v2/inverted_index_array_test.cpp | 1005 +++++++++++++++++++- .../inverted_index_p0/test_add_index_for_arr.out | Bin 0 -> 187 bytes .../test_add_index_for_arr.groovy | 43 +- 7 files changed, 1065 insertions(+), 109 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/column_writer.cpp b/be/src/olap/rowset/segment_v2/column_writer.cpp index bdbfcdc2d41..7c71c55598f 100644 --- a/be/src/olap/rowset/segment_v2/column_writer.cpp +++ b/be/src/olap/rowset/segment_v2/column_writer.cpp @@ -507,7 +507,9 @@ Status ScalarColumnWriter::init() { return Status::OK(); } Status add_nulls(uint32_t count) override { return Status::OK(); } - Status add_array_nulls(uint32_t row_id) override { return Status::OK(); } + Status add_array_nulls(const uint8_t* null_map, size_t num_rows) override { + return Status::OK(); + } Status finish() override { return Status::OK(); } int64_t size() const override { return 0; } int64_t file_size() const override { return 0; } @@ -1018,11 +1020,7 @@ Status ArrayColumnWriter::append_nullable(const uint8_t* null_map, const uint8_t RETURN_IF_ERROR(append_data(ptr, num_rows)); if (is_nullable()) { if (_opts.need_inverted_index) { - for (int row_id = 0; row_id < num_rows; row_id++) { - if (null_map[row_id] == 1) { - RETURN_IF_ERROR(_inverted_index_builder->add_array_nulls(row_id)); - } - } + RETURN_IF_ERROR(_inverted_index_builder->add_array_nulls(null_map, num_rows)); } RETURN_IF_ERROR(_null_writer->append_data(&null_map, num_rows)); } diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp index 4e503685e68..64c373db166 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.cpp @@ -323,8 +323,26 @@ public: return Status::OK(); } - Status add_array_nulls(uint32_t row_id) override { - _null_bitmap.add(row_id); + Status add_array_nulls(const uint8_t* null_map, size_t num_rows) override { + DCHECK(_rid >= num_rows); + if (num_rows == 0 || null_map == nullptr) { + return Status::OK(); + } + std::vector<uint32_t> null_indices; + null_indices.reserve(num_rows / 8); + + // because _rid is the row id in block, not segment, and we add data before we add nulls, + // so we need to subtract num_rows to get the row id in segment + for (size_t i = 0; i < num_rows; i++) { + if (null_map[i] == 1) { + null_indices.push_back(_rid - num_rows + static_cast<uint32_t>(i)); + } + } + + if (!null_indices.empty()) { + _null_bitmap.addMany(null_indices.size(), null_indices.data()); + } + return Status::OK(); } @@ -384,8 +402,11 @@ public: return Status::OK(); } - Status add_array_values(size_t field_size, const void* value_ptr, const uint8_t* null_map, - const uint8_t* offsets_ptr, size_t count) override { + Status add_array_values(size_t field_size, const void* value_ptr, + const uint8_t* nested_null_map, const uint8_t* offsets_ptr, + size_t count) override { + DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::add_array_values_count_is_zero", + { count = 0; }) if (count == 0) { // no values to add inverted index return Status::OK(); @@ -408,7 +429,7 @@ public: lucene::document::Field* new_field = nullptr; CL_NS(analysis)::TokenStream* ts = nullptr; for (auto j = start_off; j < start_off + array_elem_size; ++j) { - if (null_map[j] == 1) { + if (nested_null_map && nested_null_map[j] == 1) { continue; } auto* v = (Slice*)((const uint8_t*)value_ptr + j * field_size); @@ -471,7 +492,7 @@ public: for (int i = 0; i < count; ++i) { auto array_elem_size = offsets[i + 1] - offsets[i]; for (size_t j = start_off; j < start_off + array_elem_size; ++j) { - if (null_map[j] == 1) { + if (nested_null_map && nested_null_map[j] == 1) { continue; } const CppType* p = &reinterpret_cast<const CppType*>(value_ptr)[j]; @@ -488,6 +509,12 @@ public: Status add_array_values(size_t field_size, const CollectionValue* values, size_t count) override { if constexpr (field_is_slice_type(field_type)) { + DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::add_array_values_field_is_nullptr", + { _field = nullptr; }) + DBUG_EXECUTE_IF( + "InvertedIndexColumnWriterImpl::add_array_values_index_writer_is_" + "nullptr", + { _index_writer = nullptr; }) if (_field == nullptr || _index_writer == nullptr) { LOG(ERROR) << "field or index writer is null in inverted index writer."; return Status::InternalError( @@ -548,9 +575,10 @@ public: std::string new_value; size_t value_length = sizeof(CppType); - DBUG_EXECUTE_IF("InvertedIndexColumnWriterImpl::add_value_bkd_writer_add_throw_error", { - _CLTHROWA(CL_ERR_IllegalArgument, ("packedValue should be length=xxx")); - }); + DBUG_EXECUTE_IF( + "InvertedIndexColumnWriterImpl::add_value_bkd_writer_add_throw_" + "error", + { _CLTHROWA(CL_ERR_IllegalArgument, ("packedValue should be length=xxx")); }); _value_key_coder->full_encode_ascending(&value, &new_value); _bkd_writer->add((const uint8_t*)new_value.c_str(), value_length, _rid); @@ -614,8 +642,8 @@ public: _bkd_writer->finish(data_out.get(), index_out.get()), int(field_type)); } else { - LOG(WARNING) - << "Inverted index writer create output error occurred: nullptr"; + LOG(WARNING) << "Inverted index writer create output error " + "occurred: nullptr"; _CLTHROWA(CL_ERR_IO, "Create output error with nullptr"); } meta_out->close(); @@ -630,9 +658,12 @@ public: write_null_bitmap(null_bitmap_out.get()); close(); DBUG_EXECUTE_IF( - "InvertedIndexWriter._throw_clucene_error_in_fulltext_writer_close", { + "InvertedIndexWriter._throw_clucene_error_in_fulltext_" + "writer_close", + { _CLTHROWA(CL_ERR_IO, - "debug point: test throw error in fulltext index writer"); + "debug point: test throw error in fulltext " + "index writer"); }); } } catch (CLuceneError& e) { diff --git a/be/src/olap/rowset/segment_v2/inverted_index_writer.h b/be/src/olap/rowset/segment_v2/inverted_index_writer.h index 134dc32287c..45b19263bca 100644 --- a/be/src/olap/rowset/segment_v2/inverted_index_writer.h +++ b/be/src/olap/rowset/segment_v2/inverted_index_writer.h @@ -64,7 +64,7 @@ public: size_t count) = 0; virtual Status add_nulls(uint32_t count) = 0; - virtual Status add_array_nulls(uint32_t row_id) = 0; + virtual Status add_array_nulls(const uint8_t* null_map, size_t num_rows) = 0; virtual Status finish() = 0; diff --git a/be/src/olap/task/index_builder.cpp b/be/src/olap/task/index_builder.cpp index 68232326b0b..94feffcc059 100644 --- a/be/src/olap/task/index_builder.cpp +++ b/be/src/olap/task/index_builder.cpp @@ -509,9 +509,9 @@ Status IndexBuilder::_write_inverted_index_data(TabletSchemaSPtr tablet_schema, return converted_result.first; } const auto* ptr = (const uint8_t*)converted_result.second->get_data(); - if (converted_result.second->get_nullmap()) { - RETURN_IF_ERROR(_add_nullable(column_name, writer_sign, field.get(), - converted_result.second->get_nullmap(), &ptr, + const auto* null_map = converted_result.second->get_nullmap(); + if (null_map) { + RETURN_IF_ERROR(_add_nullable(column_name, writer_sign, field.get(), null_map, &ptr, block->rows())); } else { RETURN_IF_ERROR(_add_data(column_name, writer_sign, field.get(), &ptr, block->rows())); @@ -526,49 +526,44 @@ Status IndexBuilder::_add_nullable(const std::string& column_name, const std::pair<int64_t, int64_t>& index_writer_sign, Field* field, const uint8_t* null_map, const uint8_t** ptr, size_t num_rows) { - size_t offset = 0; - auto next_run_step = [&]() { - size_t step = 1; - for (auto i = offset + 1; i < num_rows; ++i) { - if (null_map[offset] == null_map[i]) { - step++; - } else { - break; - } - } - return step; - }; // TODO: need to process null data for inverted index if (field->type() == FieldType::OLAP_FIELD_TYPE_ARRAY) { DCHECK(field->get_sub_field_count() == 1); // [size, offset_ptr, item_data_ptr, item_nullmap_ptr] const auto* data_ptr = reinterpret_cast<const uint64_t*>(*ptr); // total number length - auto element_cnt = size_t((unsigned long)(*data_ptr)); auto offset_data = *(data_ptr + 1); const auto* offsets_ptr = (const uint8_t*)offset_data; try { - if (element_cnt > 0) { - auto data = *(data_ptr + 2); - auto nested_null_map = *(data_ptr + 3); - RETURN_IF_ERROR(_inverted_index_builders[index_writer_sign]->add_array_values( - field->get_sub_field(0)->size(), reinterpret_cast<const void*>(data), - reinterpret_cast<const uint8_t*>(nested_null_map), offsets_ptr, num_rows)); - } + auto data = *(data_ptr + 2); + auto nested_null_map = *(data_ptr + 3); + RETURN_IF_ERROR(_inverted_index_builders[index_writer_sign]->add_array_values( + field->get_sub_field(0)->size(), reinterpret_cast<const void*>(data), + reinterpret_cast<const uint8_t*>(nested_null_map), offsets_ptr, num_rows)); + DBUG_EXECUTE_IF("IndexBuilder::_add_nullable_add_array_values_error", { + _CLTHROWA(CL_ERR_IO, "debug point: _add_nullable_add_array_values_error"); + }) + RETURN_IF_ERROR(_inverted_index_builders[index_writer_sign]->add_array_nulls(null_map, + num_rows)); } catch (const std::exception& e) { return Status::Error<ErrorCode::INVERTED_INDEX_CLUCENE_ERROR>( "CLuceneError occured: {}", e.what()); } - // we should refresh nullmap for array - for (int row_id = 0; row_id < num_rows; row_id++) { - if (null_map && null_map[row_id] == 1) { - RETURN_IF_ERROR( - _inverted_index_builders[index_writer_sign]->add_array_nulls(row_id)); - } - } + return Status::OK(); } - + size_t offset = 0; + auto next_run_step = [&]() { + size_t step = 1; + for (auto i = offset + 1; i < num_rows; ++i) { + if (null_map[offset] == null_map[i]) { + step++; + } else { + break; + } + } + return step; + }; try { do { auto step = next_run_step(); diff --git a/be/test/olap/rowset/segment_v2/inverted_index_array_test.cpp b/be/test/olap/rowset/segment_v2/inverted_index_array_test.cpp index 74e9827db25..aac5f3c1c7d 100644 --- a/be/test/olap/rowset/segment_v2/inverted_index_array_test.cpp +++ b/be/test/olap/rowset/segment_v2/inverted_index_array_test.cpp @@ -18,17 +18,24 @@ #include <CLucene.h> #include <CLucene/config/repl_wchar.h> #include <CLucene/index/IndexReader.h> +#include <gen_cpp/olap_file.pb.h> #include <gtest/gtest-message.h> #include <gtest/gtest-test-part.h> +#include <gtest/gtest.h> #include <string.h> +#include <map> #include <memory> #include <string> #include "gtest/gtest_pred_impl.h" #include "io/fs/file_writer.h" #include "io/fs/local_file_system.h" +#include "io/fs/path.h" +#include "olap/rowset/beta_rowset.h" #include "olap/rowset/segment_v2/inverted_index_compound_reader.h" +#include "olap/rowset/segment_v2/inverted_index_desc.h" +#include "olap/rowset/segment_v2/inverted_index_file_reader.h" #include "olap/rowset/segment_v2/inverted_index_file_writer.h" #include "olap/rowset/segment_v2/inverted_index_fs_directory.h" #include "olap/rowset/segment_v2/inverted_index_writer.h" @@ -36,6 +43,7 @@ #include "olap/tablet_schema.h" #include "olap/tablet_schema_helper.h" #include "runtime/exec_env.h" +#include "util/faststring.h" #include "util/slice.h" #include "vec/columns/column_array.h" #include "vec/columns/column_nullable.h" @@ -50,20 +58,68 @@ using namespace lucene::index; using doris::segment_v2::InvertedIndexFileWriter; -namespace doris { -namespace segment_v2 { +namespace doris::segment_v2 { class InvertedIndexArrayTest : public testing::Test { + using ExpectedDocMap = std::map<std::string, std::vector<int>>; + public: const std::string kTestDir = "./ut_dir/inverted_index_array_test"; - void check_terms_stats(string dir_str, string file_str) { - auto fs = io::global_local_filesystem(); - std::unique_ptr<DorisCompoundReader> reader = std::make_unique<DorisCompoundReader>( - DorisFSDirectoryFactory::getDirectory(fs, dir_str.c_str()), file_str.c_str(), 4096); + void check_terms_stats(std::string index_prefix, ExpectedDocMap* expected, + std::vector<int> expected_null_bitmap = {}, + InvertedIndexStorageFormatPB format = InvertedIndexStorageFormatPB::V1, + const TabletIndex* index_meta = nullptr) { + std::string file_str; + if (format == InvertedIndexStorageFormatPB::V1) { + file_str = InvertedIndexDescriptor::get_index_file_name(index_prefix, + index_meta->index_id(), ""); + } else if (format == InvertedIndexStorageFormatPB::V2) { + file_str = InvertedIndexDescriptor::get_index_file_name(index_prefix); + } + io::Path path(index_prefix); + std::unique_ptr<InvertedIndexFileReader> reader = std::make_unique<InvertedIndexFileReader>( + io::global_local_filesystem(), path.parent_path(), path.filename(), format); + auto st = reader->init(); + EXPECT_EQ(st, Status::OK()); + auto result = reader->open(index_meta); + EXPECT_TRUE(result.has_value()) << "Failed to open compound reader" << result.error(); + auto compound_reader = std::move(result.value()); + try { + CLuceneError err; + CL_NS(store)::IndexInput* index_input = nullptr; + auto ok = DorisFSDirectory::FSIndexInput::open( + io::global_local_filesystem(), file_str.c_str(), index_input, err, 4096); + if (!ok) { + throw err; + } + + std::shared_ptr<roaring::Roaring> null_bitmap = std::make_shared<roaring::Roaring>(); + auto null_bitmap_file_name = + InvertedIndexDescriptor::get_temporary_null_bitmap_file_name(); + if (compound_reader->fileExists(null_bitmap_file_name.c_str())) { + std::unique_ptr<lucene::store::IndexInput> null_bitmap_in; + assert(compound_reader->openInput(null_bitmap_file_name.c_str(), null_bitmap_in, + err, 4096)); + size_t null_bitmap_size = null_bitmap_in->length(); + doris::faststring buf; + buf.resize(null_bitmap_size); + null_bitmap_in->readBytes(reinterpret_cast<uint8_t*>(buf.data()), null_bitmap_size); + *null_bitmap = roaring::Roaring::read(reinterpret_cast<char*>(buf.data()), false); + EXPECT_TRUE(expected_null_bitmap.size() == null_bitmap->cardinality()); + for (int i : expected_null_bitmap) { + EXPECT_TRUE(null_bitmap->contains(i)); + } + } + index_input->close(); + _CLLDELETE(index_input); + } catch (const CLuceneError& e) { + EXPECT_TRUE(false) << "CLuceneError: " << e.what(); + } + std::cout << "Term statistics for " << file_str << std::endl; std::cout << "==================================" << std::endl; - lucene::store::Directory* dir = reader.get(); + lucene::store::Directory* dir = compound_reader.get(); IndexReader* r = IndexReader::open(dir); @@ -78,15 +134,31 @@ public: lucene_wcstoutf8string(te->term(false)->text(), te->term(false)->textLength()); printf("Term: %s ", token.c_str()); + if (expected) { + auto it = expected->find(token); + if (it != expected->end()) { + TermDocs* td = r->termDocs(te->term(false)); + std::vector<int> actual_docs; + while (td->next()) { + actual_docs.push_back(td->doc()); + } + td->close(); + _CLLDELETE(td); + EXPECT_EQ(actual_docs, it->second) << "Term: " << token; + } + } printf("Freq: %d\n", te->docFreq()); } printf("Term count: %d\n\n", nterms); + if (expected) { + ASSERT_EQ(nterms, expected->size()); + } te->close(); _CLLDELETE(te); r->close(); _CLLDELETE(r); - reader->close(); + compound_reader->close(); } void SetUp() override { @@ -99,7 +171,7 @@ public: paths.emplace_back(kTestDir, 1024); auto tmp_file_dirs = std::make_unique<segment_v2::TmpFileDirs>(paths); st = tmp_file_dirs->init(); - if (!st.OK()) { + if (!st.ok()) { std::cout << "init tmp file dirs error:" << st.to_string() << std::endl; return; } @@ -109,16 +181,43 @@ public: EXPECT_TRUE(io::global_local_filesystem()->delete_directory(kTestDir).ok()); } - void test_string(std::string testname, Field* field) { + // create a TabletSchema with an array column (and a normal int column as key) + TabletSchemaSPtr create_schema_with_array(KeysType keys_type = DUP_KEYS) { + TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>(); + TabletSchemaPB tablet_schema_pb; + tablet_schema_pb.set_keys_type(keys_type); + + tablet_schema->init_from_pb(tablet_schema_pb); + TabletColumn array; + array.set_name("arr1"); + array.set_type(FieldType::OLAP_FIELD_TYPE_ARRAY); + array.set_length(0); + array.set_index_length(0); + array.set_is_nullable(false); + array.set_is_bf_column(false); + TabletColumn child; + child.set_name("arr_sub_string"); + child.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + child.set_length(INT_MAX); + array.add_sub_column(child); + tablet_schema->append_column(array); + return tablet_schema; + } + + void test_non_null_string(int64_t rowset_id, int seg_id, Field* field) { EXPECT_TRUE(field->type() == FieldType::OLAP_FIELD_TYPE_ARRAY); - std::string filename = kTestDir + "/" + testname; + RowsetId rowset_id_obj; + rowset_id_obj.init(rowset_id); + std::string index_path_prefix = + BetaRowset::segment_file_path(kTestDir, rowset_id_obj, seg_id); + int index_id = 26033; + std::string index_path = + InvertedIndexDescriptor::get_index_file_name(index_path_prefix, index_id, ""); auto fs = io::global_local_filesystem(); - io::FileWriterPtr file_writer; - EXPECT_TRUE(fs->create_file(filename, &file_writer).ok()); auto index_meta_pb = std::make_unique<TabletIndexPB>(); index_meta_pb->set_index_type(IndexType::INVERTED); - index_meta_pb->set_index_id(26033); + index_meta_pb->set_index_id(index_id); index_meta_pb->set_index_name("index_inverted_arr1"); index_meta_pb->clear_col_unique_id(); index_meta_pb->add_col_unique_id(0); @@ -126,16 +225,100 @@ public: TabletIndex idx_meta; idx_meta.index_type(); idx_meta.init_from_pb(*index_meta_pb.get()); + io::Path path(index_path_prefix); auto index_file_writer = std::make_unique<InvertedIndexFileWriter>( - fs, file_writer->path().parent_path(), file_writer->path().filename(), - InvertedIndexStorageFormatPB::V1); + fs, path.parent_path(), path.filename(), InvertedIndexStorageFormatPB::V1); std::unique_ptr<segment_v2::InvertedIndexColumnWriter> _inverted_index_builder = nullptr; EXPECT_EQ(InvertedIndexColumnWriter::create(field, &_inverted_index_builder, index_file_writer.get(), &idx_meta), Status::OK()); - vectorized::PaddedPODArray<Slice> _slice; - _slice.resize(5); + // Construct two arrays: The first row is ["amory","doris"], and the second row is ["amory", "commiter"] + vectorized::Array a1, a2; + a1.push_back("amory"); + a1.push_back("doris"); + a2.push_back("amory"); + a2.push_back("commiter"); + + // Construct array type: DataTypeArray(DataTypeString) + vectorized::DataTypePtr s1 = std::make_shared<vectorized::DataTypeString>(); + vectorized::DataTypePtr array_type = std::make_shared<vectorized::DataTypeArray>(s1); + vectorized::MutableColumnPtr col = array_type->create_column(); + col->insert(a1); + col->insert(a2); + vectorized::ColumnPtr column_array = std::move(col); + vectorized::ColumnWithTypeAndName type_and_name(column_array, array_type, "arr1"); + + // Put the array column into the Block (assuming only this column) + vectorized::Block block; + block.insert(type_and_name); + // block.rows() should be 2 + + // Use OlapBlockDataConvertor to convert + // Note: Here we need a TabletSchema object, in this example we construct a simple schema, + // Assuming that the 0th column in the schema is our array column (the actual UT has the corresponding TabletColumn) + TabletSchemaSPtr tablet_schema = create_schema_with_array(); + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + // The conversion result is actually an array of 4 pointers: + // [0]: Total number of elements (elem_cnt) + // [1]: Offsets array pointer + // [2]: Nested item data pointer + // [3]: Nested nullmap pointer + const auto* data_ptr = reinterpret_cast<const uint64_t*>(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast<const uint8_t*>(data_ptr[1]); + const void* item_data = reinterpret_cast<const void*>(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast<const uint8_t*>(data_ptr[3]); + + // Get the length of the subfield, used for inverted index writing + auto field_size = field->get_sub_field(0)->size(); + // Call the inverted index writing interface, passing in item_data, item_nullmap, offsets_ptr, and the number of rows (the number of array rows in the Block) + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, block.rows()); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, block.rows()); + EXPECT_EQ(st, Status::OK()); + + EXPECT_EQ(_inverted_index_builder->finish(), Status::OK()); + EXPECT_EQ(index_file_writer->close(), Status::OK()); + + ExpectedDocMap expected = {{"amory", {0, 1}}, {"doris", {0}}, {"commiter", {1}}}; + check_terms_stats(index_path_prefix, &expected, {}, InvertedIndexStorageFormatPB::V1, + &idx_meta); + } + + void test_string(int64_t rowset_id, int seg_id, Field* field) { + EXPECT_TRUE(field->type() == FieldType::OLAP_FIELD_TYPE_ARRAY); + RowsetId rowset_id_obj; + rowset_id_obj.init(rowset_id); + std::string index_path_prefix = + BetaRowset::segment_file_path(kTestDir, rowset_id_obj, seg_id); + int index_id = 26033; + std::string index_path = + InvertedIndexDescriptor::get_index_file_name(index_path_prefix, index_id, ""); + auto fs = io::global_local_filesystem(); + auto index_meta_pb = std::make_unique<TabletIndexPB>(); + index_meta_pb->set_index_type(IndexType::INVERTED); + index_meta_pb->set_index_id(index_id); + index_meta_pb->set_index_name("index_inverted_arr1"); + index_meta_pb->clear_col_unique_id(); + index_meta_pb->add_col_unique_id(0); + TabletIndex idx_meta; + idx_meta.index_type(); + idx_meta.init_from_pb(*index_meta_pb.get()); + io::Path path(index_path_prefix); + auto index_file_writer = std::make_unique<InvertedIndexFileWriter>( + fs, path.parent_path(), path.filename(), InvertedIndexStorageFormatPB::V1); + std::unique_ptr<segment_v2::InvertedIndexColumnWriter> _inverted_index_builder = nullptr; + EXPECT_EQ(InvertedIndexColumnWriter::create(field, &_inverted_index_builder, + index_file_writer.get(), &idx_meta), + Status::OK()); + + // Construct two arrays: The first row is ["amory","doris"], and the second row is [NULL, "amory", "commiter"] vectorized::Array a1, a2; a1.push_back("amory"); a1.push_back("doris"); @@ -143,36 +326,725 @@ public: a2.push_back("amory"); a2.push_back("commiter"); + // Construct array type: DataTypeArray(DataTypeNullable(DataTypeString)) vectorized::DataTypePtr s1 = std::make_shared<vectorized::DataTypeNullable>( std::make_shared<vectorized::DataTypeString>()); - vectorized::DataTypePtr au = std::make_shared<vectorized::DataTypeArray>(s1); - vectorized::MutableColumnPtr col = au->create_column(); + vectorized::DataTypePtr array_type = std::make_shared<vectorized::DataTypeArray>(s1); + vectorized::MutableColumnPtr col = array_type->create_column(); col->insert(a1); col->insert(a2); vectorized::ColumnPtr column_array = std::move(col); - vectorized::ColumnWithTypeAndName type_and_name(column_array, au, "arr1"); + vectorized::ColumnWithTypeAndName type_and_name(column_array, array_type, "arr1"); + + // Put the array column into the Block (assuming only this column) + vectorized::Block block; + block.insert(type_and_name); + // block.rows() should be 2 + + // Use OlapBlockDataConvertor to convert + // Note: Here we need a TabletSchema object, in this example we construct a simple schema, + // Assuming that the 0th column in the schema is our array column (the actual UT has the corresponding TabletColumn) + TabletSchemaSPtr tablet_schema = create_schema_with_array(); + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + // The conversion result is actually an array of 4 pointers: + // [0]: Total number of elements (elem_cnt) + // [1]: Offsets array pointer + // [2]: Nested item data pointer + // [3]: Nested nullmap pointer + const auto* data_ptr = reinterpret_cast<const uint64_t*>(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast<const uint8_t*>(data_ptr[1]); + const void* item_data = reinterpret_cast<const void*>(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast<const uint8_t*>(data_ptr[3]); + + // Get the length of the subfield, used for inverted index writing + auto field_size = field->get_sub_field(0)->size(); + // Call the inverted index writing interface, passing in item_data, item_nullmap, offsets_ptr, and the number of rows (the number of array rows in the Block) + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, block.rows()); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, block.rows()); + EXPECT_EQ(st, Status::OK()); + EXPECT_EQ(_inverted_index_builder->finish(), Status::OK()); + EXPECT_EQ(index_file_writer->close(), Status::OK()); + + ExpectedDocMap expected = {{"amory", {0, 1}}, {"doris", {0}}, {"commiter", {1}}}; + check_terms_stats(index_path_prefix, &expected, {}, InvertedIndexStorageFormatPB::V1, + &idx_meta); + } + + void test_null_write_v2(int64_t rowset_id, int seg_id, Field* field) { + EXPECT_TRUE(field->type() == FieldType::OLAP_FIELD_TYPE_ARRAY); + RowsetId rowset_id_obj; + rowset_id_obj.init(rowset_id); + std::string index_path_prefix = + BetaRowset::segment_file_path(kTestDir, rowset_id_obj, seg_id); + int index_id = 26033; + std::string index_path = InvertedIndexDescriptor::get_index_file_name(index_path_prefix); + auto fs = io::global_local_filesystem(); + + auto index_meta_pb = std::make_unique<TabletIndexPB>(); + index_meta_pb->set_index_type(IndexType::INVERTED); + index_meta_pb->set_index_id(index_id); + index_meta_pb->set_index_name("index_inverted_arr1"); + index_meta_pb->clear_col_unique_id(); + index_meta_pb->add_col_unique_id(0); + + TabletIndex idx_meta; + idx_meta.index_type(); + idx_meta.init_from_pb(*index_meta_pb.get()); + io::Path path(index_path_prefix); + auto index_file_writer = std::make_unique<InvertedIndexFileWriter>( + fs, path.parent_path(), path.filename(), InvertedIndexStorageFormatPB::V2); + std::unique_ptr<segment_v2::InvertedIndexColumnWriter> _inverted_index_builder = nullptr; + EXPECT_EQ(InvertedIndexColumnWriter::create(field, &_inverted_index_builder, + index_file_writer.get(), &idx_meta), + Status::OK()); + + // Simulate outer null cases: 5 rows, outer null map = {1, 0, 0, 1, 0}, i.e., rows 0 and 3 are null + std::vector<uint8_t> outer_null_map = {1, 0, 0, 1, 0}; + + // Construct inner array type: DataTypeArray(DataTypeNullable(DataTypeString)) + vectorized::DataTypePtr inner_string_type = std::make_shared<vectorized::DataTypeNullable>( + std::make_shared<vectorized::DataTypeString>()); + vectorized::DataTypePtr array_type = + std::make_shared<vectorized::DataTypeArray>(inner_string_type); + // To support outer array null values, wrap it in a Nullable type + vectorized::DataTypePtr final_type = + std::make_shared<vectorized::DataTypeNullable>(array_type); + + // Construct 5 rows of data: + // Row 0: null + // Row 1: a2 = [Null, "test"] + // Row 2: a3 = ["mixed", Null, "data"] + // Row 3: null + // Row 4: a5 = ["non-null"] + vectorized::MutableColumnPtr col = final_type->create_column(); + // Row 0: insert null + col->insert(vectorized::Null()); + // Row 1: insert a2 + vectorized::Array a2; + a2.push_back(vectorized::Null()); + a2.push_back("test"); + col->insert(a2); + // Row 2: insert a3 + vectorized::Array a3; + a3.push_back("mixed"); + a3.push_back(vectorized::Null()); + a3.push_back("data"); + col->insert(a3); + // Row 3: insert null + col->insert(vectorized::Null()); + // Row 4: insert a5 + vectorized::Array a5; + a5.push_back("non-null"); + col->insert(a5); + + vectorized::ColumnPtr column_array = std::move(col); + vectorized::ColumnWithTypeAndName type_and_name(column_array, final_type, "arr1"); + + // Construct Block, containing only the array column, with 5 rows + vectorized::Block block; + block.insert(type_and_name); + + // Construct TabletSchema (containing the array column) - reference the existing helper function + TabletSchemaSPtr tablet_schema = create_schema_with_array(); + // In this schema, assume the 0th column is the key, and the arr1 column is the non-key column with index 1 + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + + // Convert array column data + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + // OlapColumnDataConvertorArray conversion result is a 4-tuple: + // [0]: element total count (elem_cnt, not used directly) + // [1]: offsets array pointer + // [2]: nested item data conversion result pointer + // [3]: nested nullmap pointer + const auto* data_ptr = reinterpret_cast<const uint64_t*>(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast<const uint8_t*>(data_ptr[1]); + const void* item_data = reinterpret_cast<const void*>(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast<const uint8_t*>(data_ptr[3]); + + // Call the inverted index writing interface, passing in the converted nested data, nullmap, and offsets + auto field_size = field->get_sub_field(0)->size(); + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, block.rows()); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, block.rows()); + EXPECT_EQ(st, Status::OK()); + EXPECT_EQ(_inverted_index_builder->finish(), Status::OK()); + EXPECT_EQ(index_file_writer->close(), Status::OK()); + + // Expected inverted index result: only index non-null elements + // Row 1: non-null in a2 is "test" + // Row 2: non-null in a3 is "mixed" and "data" + // Row 4: non-null in a5 is "non-null" + ExpectedDocMap expected = {{"test", {1}}, {"mixed", {2}}, {"data", {2}}, {"non-null", {4}}}; + std::vector<int> expected_null_bitmap = {0, 3}; + check_terms_stats(index_path_prefix, &expected, expected_null_bitmap, + InvertedIndexStorageFormatPB::V2, &idx_meta); + } + + void test_null_write(int64_t rowset_id, int seg_id, Field* field) { + EXPECT_TRUE(field->type() == FieldType::OLAP_FIELD_TYPE_ARRAY); + RowsetId rowset_id_obj; + rowset_id_obj.init(rowset_id); + std::string index_path_prefix = + BetaRowset::segment_file_path(kTestDir, rowset_id_obj, seg_id); + int index_id = 26033; + std::string index_path = + InvertedIndexDescriptor::get_index_file_name(index_path_prefix, index_id, ""); + auto fs = io::global_local_filesystem(); + + auto index_meta_pb = std::make_unique<TabletIndexPB>(); + index_meta_pb->set_index_type(IndexType::INVERTED); + index_meta_pb->set_index_id(index_id); + index_meta_pb->set_index_name("index_inverted_arr1"); + index_meta_pb->clear_col_unique_id(); + index_meta_pb->add_col_unique_id(0); - vectorized::PaddedPODArray<vectorized::UInt64> _offsets; - _offsets.reserve(3); - _offsets.emplace_back(0); - _offsets.emplace_back(2); - _offsets.emplace_back(5); - const uint8_t* offsets_ptr = (const uint8_t*)(_offsets.data()); + TabletIndex idx_meta; + idx_meta.index_type(); + idx_meta.init_from_pb(*index_meta_pb.get()); + io::Path path(index_path_prefix); + auto index_file_writer = std::make_unique<InvertedIndexFileWriter>( + fs, path.parent_path(), path.filename(), InvertedIndexStorageFormatPB::V1); + std::unique_ptr<segment_v2::InvertedIndexColumnWriter> _inverted_index_builder = nullptr; + EXPECT_EQ(InvertedIndexColumnWriter::create(field, &_inverted_index_builder, + index_file_writer.get(), &idx_meta), + Status::OK()); - auto* col_arr = assert_cast<const vectorized::ColumnArray*>(column_array.get()); + // Simulate outer null cases: 5 rows, outer null map = {1, 0, 0, 1, 0}, i.e., rows 0 and 3 are null + std::vector<uint8_t> outer_null_map = {1, 0, 0, 1, 0}; + + // Construct inner array type: DataTypeArray(DataTypeNullable(DataTypeString)) + vectorized::DataTypePtr inner_string_type = std::make_shared<vectorized::DataTypeNullable>( + std::make_shared<vectorized::DataTypeString>()); + vectorized::DataTypePtr array_type = + std::make_shared<vectorized::DataTypeArray>(inner_string_type); + // To support outer array null values, wrap it in a Nullable type + vectorized::DataTypePtr final_type = + std::make_shared<vectorized::DataTypeNullable>(array_type); + + // Construct 5 rows of data: + // Row 0: null + // Row 1: a2 = [Null, "test"] + // Row 2: a3 = ["mixed", Null, "data"] + // Row 3: null + // Row 4: a5 = ["non-null"] + vectorized::MutableColumnPtr col = final_type->create_column(); + // Row 0: insert null + col->insert(vectorized::Null()); + // Row 1: insert a2 + vectorized::Array a2; + a2.push_back(vectorized::Null()); + a2.push_back("test"); + col->insert(a2); + // Row 2: insert a3 + vectorized::Array a3; + a3.push_back("mixed"); + a3.push_back(vectorized::Null()); + a3.push_back("data"); + col->insert(a3); + // Row 3: insert null + col->insert(vectorized::Null()); + // Row 4: insert a5 + vectorized::Array a5; + a5.push_back("non-null"); + col->insert(a5); + + vectorized::ColumnPtr column_array = std::move(col); + vectorized::ColumnWithTypeAndName type_and_name(column_array, final_type, "arr1"); + + // Construct Block, containing only the array column, with 5 rows + vectorized::Block block; + block.insert(type_and_name); + + // Construct TabletSchema (containing the array column) - reference the existing helper function + TabletSchemaSPtr tablet_schema = create_schema_with_array(); + // In this schema, assume the 0th column is the key, and the arr1 column is the non-key column with index 1 + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + + // Convert array column data + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + // OlapColumnDataConvertorArray conversion result is a 4-tuple: + // [0]: element total count (elem_cnt, not used directly) + // [1]: offsets array pointer + // [2]: nested item data conversion result pointer + // [3]: nested nullmap pointer + const auto* data_ptr = reinterpret_cast<const uint64_t*>(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast<const uint8_t*>(data_ptr[1]); + const void* item_data = reinterpret_cast<const void*>(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast<const uint8_t*>(data_ptr[3]); + + // Call the inverted index writing interface, passing in the converted nested data, nullmap, and offsets + auto field_size = field->get_sub_field(0)->size(); + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, block.rows()); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, block.rows()); + EXPECT_EQ(st, Status::OK()); + EXPECT_EQ(_inverted_index_builder->finish(), Status::OK()); + EXPECT_EQ(index_file_writer->close(), Status::OK()); + + // Expected inverted index result: only index non-null elements + // Row 1: non-null in a2 is "test" + // Row 2: non-null in a3 is "mixed" and "data" + // Row 4: non-null in a5 is "non-null" + ExpectedDocMap expected = {{"test", {1}}, {"mixed", {2}}, {"data", {2}}, {"non-null", {4}}}; + std::vector<int> expected_null_bitmap = {0, 3}; + check_terms_stats(index_path_prefix, &expected, expected_null_bitmap, + InvertedIndexStorageFormatPB::V1, &idx_meta); + } + + void test_multi_block_write(int64_t rowset_id, int seg_id, Field* field) { + EXPECT_TRUE(field->type() == FieldType::OLAP_FIELD_TYPE_ARRAY); + RowsetId rowset_id_obj; + rowset_id_obj.init(rowset_id); + std::string index_path_prefix = + BetaRowset::segment_file_path(kTestDir, rowset_id_obj, seg_id); + int index_id = 26033; + std::string index_path = + InvertedIndexDescriptor::get_index_file_name(index_path_prefix, index_id, ""); + auto fs = io::global_local_filesystem(); + + auto index_meta_pb = std::make_unique<TabletIndexPB>(); + index_meta_pb->set_index_type(IndexType::INVERTED); + index_meta_pb->set_index_id(index_id); + index_meta_pb->set_index_name("index_inverted_arr1"); + index_meta_pb->clear_col_unique_id(); + index_meta_pb->add_col_unique_id(0); + + TabletIndex idx_meta; + idx_meta.init_from_pb(*index_meta_pb.get()); + io::Path path(index_path_prefix); + auto index_file_writer = std::make_unique<InvertedIndexFileWriter>( + fs, path.parent_path(), path.filename(), InvertedIndexStorageFormatPB::V1); + std::unique_ptr<segment_v2::InvertedIndexColumnWriter> _inverted_index_builder = nullptr; + EXPECT_EQ(InvertedIndexColumnWriter::create(field, &_inverted_index_builder, + index_file_writer.get(), &idx_meta), + Status::OK()); + + ExpectedDocMap merged_expected; + + // --- Block 1 --- + { + const int row_num = 4; + // construct data type: Nullable( Array( Nullable(String) ) ) + vectorized::DataTypePtr inner_string = std::make_shared<vectorized::DataTypeNullable>( + std::make_shared<vectorized::DataTypeString>()); + vectorized::DataTypePtr array_type = + std::make_shared<vectorized::DataTypeArray>(inner_string); + vectorized::DataTypePtr final_type = + std::make_shared<vectorized::DataTypeNullable>(array_type); + + // construct MutableColumn + vectorized::MutableColumnPtr col = final_type->create_column(); + // simulate outer null: row0 and row3 are null, the rest are non-null + col->insert(vectorized::Null()); // row0: null + { + // row1: non-null, array with 1 element: "block1_data1" + vectorized::Array arr; + arr.push_back("block1_data1"); + col->insert(arr); + } + { + // row2: non-null, array with 1 element: "block1_data2" + vectorized::Array arr; + arr.push_back("block1_data2"); + col->insert(arr); + } + col->insert(vectorized::Null()); // row3: null + + vectorized::ColumnPtr column_array = std::move(col); + vectorized::ColumnWithTypeAndName type_and_name(column_array, final_type, "arr1"); + + // construct Block (containing only the arr1 column) + vectorized::Block block; + block.insert(type_and_name); + + // use TabletSchema containing the array column (arr1 is the non-key column with index 1 in the schema) + TabletSchemaSPtr tablet_schema = create_schema_with_array(); + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + + // convert the arr1 column in the block + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + // the conversion result is a 4-tuple: [0]: element count, [1]: offsets pointer, [2]: item data, [3]: item nullmap + const auto* data_ptr = reinterpret_cast<const uint64_t*>(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast<const uint8_t*>(data_ptr[1]); + const void* item_data = reinterpret_cast<const void*>(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast<const uint8_t*>(data_ptr[3]); + auto field_size = field->get_sub_field(0)->size(); + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, row_num); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, row_num); + EXPECT_EQ(st, Status::OK()); + + // for Block1, the expected non-null behavior is row1 and row2 + ExpectedDocMap expected = {{"block1_data1", {1}}, {"block1_data2", {2}}}; + merged_expected.insert(expected.begin(), expected.end()); + } + + // --- Block 2 --- + { + const int row_num = 2; + vectorized::DataTypePtr inner_string = std::make_shared<vectorized::DataTypeNullable>( + std::make_shared<vectorized::DataTypeString>()); + vectorized::DataTypePtr array_type = + std::make_shared<vectorized::DataTypeArray>(inner_string); + vectorized::DataTypePtr final_type = + std::make_shared<vectorized::DataTypeNullable>(array_type); + + vectorized::MutableColumnPtr col = final_type->create_column(); + // row0: non-null, array with 1 element: "block2_data1" + { + vectorized::Array arr; + arr.push_back("block2_data1"); + col->insert(arr); + } + // row1: null + col->insert(vectorized::Null()); + + vectorized::ColumnPtr column_array = std::move(col); + vectorized::ColumnWithTypeAndName type_and_name(column_array, final_type, "arr1"); + + vectorized::Block block; + block.insert(type_and_name); + + TabletSchemaSPtr tablet_schema = create_schema_with_array(); + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + const auto* data_ptr = reinterpret_cast<const uint64_t*>(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast<const uint8_t*>(data_ptr[1]); + const void* item_data = reinterpret_cast<const void*>(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast<const uint8_t*>(data_ptr[3]); + + auto field_size = field->get_sub_field(0)->size(); + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, row_num); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, row_num); + EXPECT_EQ(st, Status::OK()); + + ExpectedDocMap expected = {{"block2_data1", {4}}}; + merged_expected.insert(expected.begin(), expected.end()); + } + + // --- Block 3 --- + { + const int row_num = 2; + vectorized::DataTypePtr inner_string = std::make_shared<vectorized::DataTypeNullable>( + std::make_shared<vectorized::DataTypeString>()); + vectorized::DataTypePtr array_type = + std::make_shared<vectorized::DataTypeArray>(inner_string); + vectorized::DataTypePtr final_type = + std::make_shared<vectorized::DataTypeNullable>(array_type); + + vectorized::MutableColumnPtr col = final_type->create_column(); + // row0: non-null, array with 1 element: "block3_data1" + { + vectorized::Array arr; + arr.push_back("block3_data1"); + col->insert(arr); + } + // row1: null + col->insert(vectorized::Null()); + + vectorized::ColumnPtr column_array = std::move(col); + vectorized::ColumnWithTypeAndName type_and_name(column_array, final_type, "arr1"); + + vectorized::Block block; + block.insert(type_and_name); + + TabletSchemaSPtr tablet_schema = create_schema_with_array(); + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + const auto* data_ptr = reinterpret_cast<const uint64_t*>(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast<const uint8_t*>(data_ptr[1]); + const void* item_data = reinterpret_cast<const void*>(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast<const uint8_t*>(data_ptr[3]); + auto field_size = field->get_sub_field(0)->size(); + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, row_num); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, row_num); + EXPECT_EQ(st, Status::OK()); + + ExpectedDocMap expected = {{"block3_data1", {6}}}; + merged_expected.insert(expected.begin(), expected.end()); + } + + EXPECT_EQ(_inverted_index_builder->finish(), Status::OK()); + EXPECT_EQ(index_file_writer->close(), Status::OK()); + + std::vector<int> expected_null_bitmap = {0, 3, 5, 7}; + check_terms_stats(index_path_prefix, &merged_expected, expected_null_bitmap, + InvertedIndexStorageFormatPB::V1, &idx_meta); + } + + void test_array_all_null(int64_t rowset_id, int seg_id, Field* field) { + EXPECT_TRUE(field->type() == FieldType::OLAP_FIELD_TYPE_ARRAY); + RowsetId rowset_id_obj; + rowset_id_obj.init(rowset_id); + std::string index_path_prefix = + BetaRowset::segment_file_path(kTestDir, rowset_id_obj, seg_id); + int index_id = 26034; + std::string index_path = + InvertedIndexDescriptor::get_index_file_name(index_path_prefix, index_id, ""); + auto fs = io::global_local_filesystem(); + + auto index_meta_pb = std::make_unique<TabletIndexPB>(); + index_meta_pb->set_index_type(IndexType::INVERTED); + index_meta_pb->set_index_id(index_id); + index_meta_pb->set_index_name("index_inverted_arr_all_null"); + index_meta_pb->clear_col_unique_id(); + index_meta_pb->add_col_unique_id(0); + + TabletIndex idx_meta; + idx_meta.init_from_pb(*index_meta_pb.get()); + io::Path path(index_path_prefix); + auto index_file_writer = std::make_unique<InvertedIndexFileWriter>( + fs, path.parent_path(), path.filename(), InvertedIndexStorageFormatPB::V1); + std::unique_ptr<segment_v2::InvertedIndexColumnWriter> _inverted_index_builder = nullptr; + EXPECT_EQ(InvertedIndexColumnWriter::create(field, &_inverted_index_builder, + index_file_writer.get(), &idx_meta), + Status::OK()); + + // Construct inner array type: DataTypeArray(DataTypeNullable(DataTypeString)) + vectorized::DataTypePtr inner_string_type = std::make_shared<vectorized::DataTypeNullable>( + std::make_shared<vectorized::DataTypeString>()); + vectorized::DataTypePtr array_type = + std::make_shared<vectorized::DataTypeArray>(inner_string_type); + // To support outer array null values, wrap it in a Nullable type + vectorized::DataTypePtr final_type = + std::make_shared<vectorized::DataTypeNullable>(array_type); + + vectorized::MutableColumnPtr col = final_type->create_column(); + col->insert(vectorized::Null()); + col->insert(vectorized::Null()); + + vectorized::ColumnPtr column_array = std::move(col); + vectorized::ColumnWithTypeAndName type_and_name(column_array, final_type, "arr1"); + + vectorized::Block block; + block.insert(type_and_name); + + TabletSchemaSPtr tablet_schema = create_schema_with_array(); + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + const auto* data_ptr = reinterpret_cast<const uint64_t*>(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast<const uint8_t*>(data_ptr[1]); + const void* item_data = reinterpret_cast<const void*>(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast<const uint8_t*>(data_ptr[3]); + const auto* null_map = accessor->get_nullmap(); + + auto field_size = field->get_sub_field(0)->size(); + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, block.rows()); + EXPECT_EQ(st, Status::OK()); + st = _inverted_index_builder->add_array_nulls(null_map, block.rows()); + EXPECT_EQ(st, Status::OK()); + + EXPECT_EQ(_inverted_index_builder->finish(), Status::OK()); + EXPECT_EQ(index_file_writer->close(), Status::OK()); + + std::vector<int> expected_null_bitmap = {0, 1}; + ExpectedDocMap expected {}; + check_terms_stats(index_path_prefix, &expected, expected_null_bitmap, + InvertedIndexStorageFormatPB::V1, &idx_meta); + } + + void test_array_numeric(int64_t rowset_id, int seg_id, Field* field) { + EXPECT_TRUE(field->type() == FieldType::OLAP_FIELD_TYPE_ARRAY); + RowsetId rowset_id_obj; + rowset_id_obj.init(rowset_id); + std::string index_path_prefix = + BetaRowset::segment_file_path(kTestDir, rowset_id_obj, seg_id); + int index_id = 26033; + std::string index_path = + InvertedIndexDescriptor::get_index_file_name(index_path_prefix, index_id, ""); + auto fs = io::global_local_filesystem(); + + auto index_meta_pb = std::make_unique<TabletIndexPB>(); + index_meta_pb->set_index_type(IndexType::INVERTED); + index_meta_pb->set_index_id(index_id); + index_meta_pb->set_index_name("index_inverted_arr_numeric"); + index_meta_pb->clear_col_unique_id(); + index_meta_pb->add_col_unique_id(0); + + TabletIndex idx_meta; + idx_meta.init_from_pb(*index_meta_pb.get()); + io::Path path(index_path_prefix); + auto index_file_writer = std::make_unique<InvertedIndexFileWriter>( + fs, path.parent_path(), path.filename(), InvertedIndexStorageFormatPB::V1); + std::unique_ptr<segment_v2::InvertedIndexColumnWriter> _inverted_index_builder = nullptr; + EXPECT_EQ(InvertedIndexColumnWriter::create(field, &_inverted_index_builder, + index_file_writer.get(), &idx_meta), + Status::OK()); + + vectorized::DataTypePtr inner_int = std::make_shared<vectorized::DataTypeInt32>(); + vectorized::DataTypePtr array_type = std::make_shared<vectorized::DataTypeArray>(inner_int); + vectorized::DataTypePtr final_type = + std::make_shared<vectorized::DataTypeNullable>(array_type); + + // create a MutableColumnPtr + vectorized::MutableColumnPtr col = final_type->create_column(); + // row0: non-null, array [123, 456] + { + vectorized::Array arr; + arr.push_back(123); + arr.push_back(456); + col->insert(arr); + } + // row1: null + col->insert(vectorized::Null()); + // row2: non-null, array [789, 101112] + { + vectorized::Array arr; + arr.push_back(789); + arr.push_back(101112); + col->insert(arr); + } + // wrap the constructed column into a ColumnWithTypeAndName + vectorized::ColumnPtr column_array = std::move(col); + vectorized::ColumnWithTypeAndName type_and_name(column_array, final_type, "arr_num"); + + // construct Block (containing only this column), with 3 rows + vectorized::Block block; + block.insert(type_and_name); + + TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>(); + TabletSchemaPB tablet_schema_pb; + tablet_schema_pb.set_keys_type(KeysType::DUP_KEYS); + + tablet_schema->init_from_pb(tablet_schema_pb); + TabletColumn array; + array.set_name("arr1"); + array.set_type(FieldType::OLAP_FIELD_TYPE_ARRAY); + array.set_length(0); + array.set_index_length(0); + array.set_is_nullable(false); + array.set_is_bf_column(false); + TabletColumn child; + child.set_name("arr_sub_int"); + child.set_type(FieldType::OLAP_FIELD_TYPE_INT); + child.set_length(INT_MAX); + array.add_sub_column(child); + tablet_schema->append_column(array); + + vectorized::OlapBlockDataConvertor convertor(tablet_schema.get(), {0}); + convertor.set_source_content(&block, 0, block.rows()); + auto [st, accessor] = convertor.convert_column_data(0); + EXPECT_EQ(st, Status::OK()); + // the conversion result is a 4-tuple: [0]: element total count, [1]: offsets pointer, [2]: item data, [3]: item nullmap + const auto* data_ptr = reinterpret_cast<const uint64_t*>(accessor->get_data()); + const auto* offsets_ptr = reinterpret_cast<const uint8_t*>(data_ptr[1]); + const void* item_data = reinterpret_cast<const void*>(data_ptr[2]); + const auto* item_nullmap = reinterpret_cast<const uint8_t*>(data_ptr[3]); + + // get the size of the sub field (4 bytes for INT type) + auto field_size = field->get_sub_field(0)->size(); + st = _inverted_index_builder->add_array_values(field_size, item_data, item_nullmap, + offsets_ptr, block.rows()); + EXPECT_EQ(st, Status::OK()); + const auto* null_map = accessor->get_nullmap(); + // add nulls + st = _inverted_index_builder->add_array_nulls(null_map, block.rows()); + EXPECT_EQ(st, Status::OK()); + EXPECT_EQ(_inverted_index_builder->finish(), Status::OK()); + EXPECT_EQ(index_file_writer->close(), Status::OK()); + + // expected inverted index: row0 contains "123" and "456" (doc id 0), row1 is null, row2 contains "789" and "101112" (doc id 2) + ExpectedDocMap expected = {{"123", {0}}, {"456", {0}}, {"789", {2}}, {"101112", {2}}}; + std::vector<int> expected_null_bitmap = {1}; + + std::unique_ptr<InvertedIndexFileReader> reader = std::make_unique<InvertedIndexFileReader>( + io::global_local_filesystem(), path.parent_path(), path.filename(), + InvertedIndexStorageFormatPB::V1); + auto sts = reader->init(); + EXPECT_EQ(sts, Status::OK()); + auto result = reader->open(&idx_meta); + EXPECT_TRUE(result.has_value()) << "Failed to open compound reader" << result.error(); + auto compound_reader = std::move(result.value()); + try { + CLuceneError err; + CL_NS(store)::IndexInput* index_input = nullptr; + auto ok = DorisFSDirectory::FSIndexInput::open( + io::global_local_filesystem(), index_path.c_str(), index_input, err, 4096); + if (!ok) { + throw err; + } + + std::shared_ptr<roaring::Roaring> null_bitmap = std::make_shared<roaring::Roaring>(); + auto null_bitmap_file_name = + InvertedIndexDescriptor::get_temporary_null_bitmap_file_name(); + if (compound_reader->fileExists(null_bitmap_file_name.c_str())) { + std::unique_ptr<lucene::store::IndexInput> null_bitmap_in; + assert(compound_reader->openInput(null_bitmap_file_name.c_str(), null_bitmap_in, + err, 4096)); + size_t null_bitmap_size = null_bitmap_in->length(); + doris::faststring buf; + buf.resize(null_bitmap_size); + null_bitmap_in->readBytes(reinterpret_cast<uint8_t*>(buf.data()), null_bitmap_size); + *null_bitmap = roaring::Roaring::read(reinterpret_cast<char*>(buf.data()), false); + assert(expected_null_bitmap.size() == null_bitmap->cardinality()); + for (int i : expected_null_bitmap) { + EXPECT_TRUE(null_bitmap->contains(i)); + } + } + index_input->close(); + _CLLDELETE(index_input); + } catch (const CLuceneError& e) { + EXPECT_TRUE(false) << "CLuceneError: " << e.what(); + } + } + +private: + static void build_slices(vectorized::PaddedPODArray<Slice>& slices, + const vectorized::ColumnPtr& column_array, size_t num_strings) { + const auto* col_arr = assert_cast<const vectorized::ColumnArray*>(column_array.get()); const vectorized::UInt8* nested_null_map = assert_cast<const vectorized::ColumnNullable*>(col_arr->get_data_ptr().get()) - ->get_null_map_data() + ->get_null_map_column() + .get_data() .data(); - auto* col_arr_str = assert_cast<const vectorized::ColumnString*>( + const auto* col_arr_str = assert_cast<const vectorized::ColumnString*>( assert_cast<const vectorized::ColumnNullable*>(col_arr->get_data_ptr().get()) ->get_nested_column_ptr() .get()); const char* char_data = (const char*)(col_arr_str->get_chars().data()); const vectorized::ColumnString::Offset* offset_cur = col_arr_str->get_offsets().data(); - const vectorized::ColumnString::Offset* offset_end = offset_cur + 5; - - Slice* slice = _slice.data(); + const vectorized::ColumnString::Offset* offset_end = offset_cur + num_strings; + Slice* slice = slices.data(); size_t string_offset = *(offset_cur - 1); const vectorized::UInt8* nullmap_cur = nested_null_map; while (offset_cur != offset_end) { @@ -188,21 +1060,6 @@ public: ++slice; ++offset_cur; } - - auto field_size = field->get_sub_field(0)->size(); - Status st = _inverted_index_builder->add_array_values( - field_size, reinterpret_cast<const void*>(_slice.data()), - reinterpret_cast<const uint8_t*>(nested_null_map), offsets_ptr, 2); - EXPECT_EQ(st, Status::OK()); - EXPECT_EQ(_inverted_index_builder->finish(), Status::OK()); - EXPECT_EQ(index_file_writer->close(), Status::OK()); - - { - std::cout << "dir: " << file_writer->path().parent_path().string() << std::endl; - string idx_file_name = file_writer->path().filename().string() + "_26033.idx"; - std::cout << "file: " << file_writer->path().filename().string() << std::endl; - check_terms_stats(file_writer->path().parent_path().string(), idx_file_name); - } } }; @@ -217,9 +1074,55 @@ TEST_F(InvertedIndexArrayTest, ArrayString) { arraySubColumn.set_type(FieldType::OLAP_FIELD_TYPE_STRING); arrayTabletColumn.add_sub_column(arraySubColumn); Field* field = FieldFactory::create(arrayTabletColumn); - test_string("InvertedIndexArray", field); + test_string(0, 0, field); + test_non_null_string(1, 0, field); + delete field; +} + +TEST_F(InvertedIndexArrayTest, ComplexNullCases) { + TabletColumn arrayTabletColumn; + arrayTabletColumn.set_unique_id(0); + arrayTabletColumn.set_name("arr1"); + arrayTabletColumn.set_type(FieldType::OLAP_FIELD_TYPE_ARRAY); + TabletColumn arraySubColumn; + arraySubColumn.set_unique_id(1); + arraySubColumn.set_name("arr_sub_string"); + arraySubColumn.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + arrayTabletColumn.add_sub_column(arraySubColumn); + Field* field = FieldFactory::create(arrayTabletColumn); + test_null_write(2, 0, field); + test_null_write_v2(3, 0, field); + test_array_all_null(4, 0, field); delete field; } -} // namespace segment_v2 -} // namespace doris +TEST_F(InvertedIndexArrayTest, MultiBlockWrite) { + TabletColumn arrayTabletColumn; + arrayTabletColumn.set_unique_id(0); + arrayTabletColumn.set_name("arr1"); + arrayTabletColumn.set_type(FieldType::OLAP_FIELD_TYPE_ARRAY); + TabletColumn arraySubColumn; + arraySubColumn.set_unique_id(1); + arraySubColumn.set_name("arr_sub_string"); + arraySubColumn.set_type(FieldType::OLAP_FIELD_TYPE_STRING); + arrayTabletColumn.add_sub_column(arraySubColumn); + Field* field = FieldFactory::create(arrayTabletColumn); + test_multi_block_write(5, 0, field); + delete field; +} + +TEST_F(InvertedIndexArrayTest, ArrayInt) { + TabletColumn arrayTabletColumn; + arrayTabletColumn.set_unique_id(0); + arrayTabletColumn.set_name("arr1"); + arrayTabletColumn.set_type(FieldType::OLAP_FIELD_TYPE_ARRAY); + TabletColumn arraySubColumn; + arraySubColumn.set_unique_id(1); + arraySubColumn.set_name("arr_sub_int"); + arraySubColumn.set_type(FieldType::OLAP_FIELD_TYPE_INT); + arrayTabletColumn.add_sub_column(arraySubColumn); + Field* field = FieldFactory::create(arrayTabletColumn); + test_array_numeric(6, 0, field); + delete field; +} +} // namespace doris::segment_v2 diff --git a/regression-test/data/inverted_index_p0/test_add_index_for_arr.out b/regression-test/data/inverted_index_p0/test_add_index_for_arr.out new file mode 100644 index 00000000000..9bb146c0df5 Binary files /dev/null and b/regression-test/data/inverted_index_p0/test_add_index_for_arr.out differ diff --git a/regression-test/suites/inverted_index_p0/test_add_index_for_arr.groovy b/regression-test/suites/inverted_index_p0/test_add_index_for_arr.groovy index 6f3e772dd08..78bec2d11b0 100644 --- a/regression-test/suites/inverted_index_p0/test_add_index_for_arr.groovy +++ b/regression-test/suites/inverted_index_p0/test_add_index_for_arr.groovy @@ -106,9 +106,9 @@ suite("test_add_index_for_arr") { // query without inverted index // query rows with array_contains - def sql_query_name1 = sql "select id, name[1], description[1] from my_test_array where array_contains(name,'text7')" + def sql_query_name1 = sql "select id, name[1], description[1] from my_test_array where array_contains(name,'text7') order by id" // query rows with !array_contains - def sql_query_name2 = sql "select id, name[1], description[1] from my_test_array where !array_contains(name,'text7')" + def sql_query_name2 = sql "select id, name[1], description[1] from my_test_array where !array_contains(name,'text7') order by id" // add index for name sql "ALTER TABLE my_test_array ADD INDEX name_idx (name) USING INVERTED;" @@ -122,9 +122,9 @@ suite("test_add_index_for_arr") { // query with inverted index sql "set enable_inverted_index_query=true" // query rows with array_contains - def sql_query_name1_inverted = sql "select id, name[1], description[1] from my_test_array where array_contains(name,'text7')" + def sql_query_name1_inverted = sql "select id, name[1], description[1] from my_test_array where array_contains(name,'text7') order by id" // query rows with !array_contains - def sql_query_name2_inverted = sql "select id, name[1], description[1] from my_test_array where !array_contains(name,'text7')" + def sql_query_name2_inverted = sql "select id, name[1], description[1] from my_test_array where !array_contains(name,'text7') order by id" // check result for query without inverted index and with inverted index def size1 = sql_query_name1.size(); @@ -147,9 +147,38 @@ suite("test_add_index_for_arr") { sql "drop index name_idx on my_test_array" wait_for_latest_op_on_table_finish("my_test_array", timeout) - def sql_query_name1_without_inverted = sql "select id, name[1], description[1] from my_test_array where array_contains(name,'text7')" - def sql_query_name2_without_inverted = sql "select id, name[1], description[1] from my_test_array where !array_contains(name,'text7')" + def sql_query_name1_without_inverted = sql "select id, name[1], description[1] from my_test_array where array_contains(name,'text7') order by id" + def sql_query_name2_without_inverted = sql "select id, name[1], description[1] from my_test_array where !array_contains(name,'text7') order by id" assertEquals(sql_query_name1.size(), sql_query_name1_without_inverted.size()) assertEquals(sql_query_name2.size(), sql_query_name2_without_inverted.size()) -} + + def table_name = "test_add_index_for_arr_all_null" + sql "DROP TABLE IF EXISTS ${table_name}" + sql """ + CREATE TABLE IF NOT EXISTS ${table_name} ( + `id` int(11) NULL, + `name` ARRAY<text> NULL, + ) + DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 1 + properties("replication_num" = "1"); + """ + + sql "insert into ${table_name} values (1, null), (2, null)" + sql "ALTER TABLE ${table_name} ADD INDEX name_idx (name) USING INVERTED;" + wait_for_latest_op_on_table_finish("${table_name}", timeout) + // build index for name that name data can using inverted index + if (!isCloudMode()) { + sql "BUILD INDEX name_idx ON ${table_name}" + wait_for_build_index_on_partition_finish("${table_name}", timeout) + } + + qt_sql "select /*+SET_VAR(enable_inverted_index_query=true)*/ * from ${table_name} where array_contains(name, 'text7') order by id" + qt_sql "select /*+SET_VAR(enable_inverted_index_query=true)*/ * from ${table_name} where !array_contains(name, 'text7') order by id" + qt_sql "select /*+SET_VAR(enable_inverted_index_query=true)*/ * from ${table_name} where name is null order by id" + + qt_sql "select /*+SET_VAR(enable_inverted_index_query=false)*/ * from ${table_name} where array_contains(name, 'text7') order by id" + qt_sql "select /*+SET_VAR(enable_inverted_index_query=false)*/ * from ${table_name} where !array_contains(name, 'text7') order by id" + qt_sql "select /*+SET_VAR(enable_inverted_index_query=false)*/ * from ${table_name} where name is null order by id" +} \ No newline at end of file --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org