This is an automated email from the ASF dual-hosted git repository. eldenmoon pushed a commit to branch variant-sparse in repository https://gitbox.apache.org/repos/asf/doris.git
commit f75940ae249fc3ee9d3bb1f93c633f44f8d07313 Author: Sun Chenyang <suncheny...@selectdb.com> AuthorDate: Fri Feb 28 11:14:00 2025 +0800 [enhance](variant) add bf (#48405) --- be/src/olap/rowset/segment_v2/segment_writer.cpp | 2 +- .../segment_v2/variant_column_writer_impl.cpp | 11 ++- .../rowset/segment_v2/variant_column_writer_impl.h | 1 - .../rowset/segment_v2/vertical_segment_writer.cpp | 2 +- be/src/vec/common/schema_util.cpp | 20 ++-- be/src/vec/common/schema_util.h | 2 +- .../apache/doris/alter/SchemaChangeHandler.java | 6 ++ .../data/variant_p0/update/inverted_index/load.out | Bin 375 -> 879 bytes .../variant_p0/update/inverted_index/query.out | Bin 655 -> 887 bytes .../test_variant_bloom_filter.groovy | 7 +- .../variant_p0/update/inverted_index/load.groovy | 59 +++++++++--- .../variant_p0/update/inverted_index/query.groovy | 105 ++++++++++++++++----- 12 files changed, 159 insertions(+), 56 deletions(-) diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index e572cf77279..b485623098b 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -412,7 +412,7 @@ Status SegmentWriter::append_block_with_variant_subcolumns(vectorized::Block& da int current_column_id = column_id++; TabletColumn tablet_column = generate_column_info(entry); vectorized::schema_util::inherit_column_attributes(*parent_column, tablet_column, - _flush_schema); + &_flush_schema); RETURN_IF_ERROR(_create_column_writer(current_column_id /*unused*/, tablet_column, _flush_schema)); RETURN_IF_ERROR(_olap_data_convertor->set_source_content_with_specifid_column( diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp index 344273fdf31..2f19309520a 100644 --- a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp +++ b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.cpp @@ -231,7 +231,8 @@ Status VariantColumnWriterImpl::_process_subcolumns(vectorized::ColumnObject* pt CHECK(entry->data.is_finalized()); int current_column_id = column_id++; TabletColumn tablet_column = generate_column_info(entry); - RETURN_IF_ERROR(_create_column_writer(current_column_id, tablet_column, *_tablet_column, + vectorized::schema_util::inherit_column_attributes(*_tablet_column, tablet_column); + RETURN_IF_ERROR(_create_column_writer(current_column_id, tablet_column, _opts.rowset_ctx->tablet_schema)); converter->add_column_data_convertor(tablet_column); RETURN_IF_ERROR(converter->set_source_content_with_specifid_column( @@ -510,7 +511,6 @@ void VariantColumnWriterImpl::_init_column_meta(ColumnMetaPB* meta, uint32_t col }; Status VariantColumnWriterImpl::_create_column_writer(uint32_t cid, const TabletColumn& column, - const TabletColumn& parent_column, const TabletSchemaSPtr& tablet_schema) { ColumnWriterOptions opts; opts.meta = _opts.footer->add_columns(); @@ -518,7 +518,8 @@ Status VariantColumnWriterImpl::_create_column_writer(uint32_t cid, const Tablet _init_column_meta(opts.meta, cid, column); opts.need_zone_map = tablet_schema->keys_type() != KeysType::AGG_KEYS; - opts.need_bloom_filter = parent_column.is_bf_column(); + opts.need_bloom_filter = column.is_bf_column(); + // const auto* tablet_index = tablet_schema->get_ngram_bf_index(parent_column.unique_id()); // if (tablet_index) { // opts.need_bloom_filter = true; @@ -538,8 +539,8 @@ Status VariantColumnWriterImpl::_create_column_writer(uint32_t cid, const Tablet // opts.gram_bf_size = gram_bf_size; // } - opts.need_bitmap_index = parent_column.has_bitmap_index(); - const auto& index = tablet_schema->inverted_index(parent_column.unique_id()); + opts.need_bitmap_index = column.has_bitmap_index(); + const auto& index = tablet_schema->inverted_index(column.parent_unique_id()); if (index != nullptr && segment_v2::InvertedIndexColumnWriter::check_support_inverted_index(column)) { auto subcolumn_index = std::make_unique<TabletIndex>(*index); diff --git a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.h b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.h index fe00999dc38..5835868c33f 100644 --- a/be/src/olap/rowset/segment_v2/variant_column_writer_impl.h +++ b/be/src/olap/rowset/segment_v2/variant_column_writer_impl.h @@ -72,7 +72,6 @@ private: Status _get_subcolumn_paths_from_stats(std::set<std::string>& paths); Status _create_column_writer(uint32_t cid, const TabletColumn& column, - const TabletColumn& parent_column, const TabletSchemaSPtr& tablet_schema); Status _process_root_column(vectorized::ColumnObject* ptr, vectorized::OlapBlockDataConvertor* converter, size_t num_rows, diff --git a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp index 19539116b94..ce1076ae4cc 100644 --- a/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/vertical_segment_writer.cpp @@ -1036,7 +1036,7 @@ Status VerticalSegmentWriter::_append_block_with_variant_subcolumns(RowsInBlock& int current_column_id = column_id++; TabletColumn tablet_column = generate_column_info(entry); vectorized::schema_util::inherit_column_attributes(*parent_column, tablet_column, - _flush_schema); + &_flush_schema); RETURN_IF_ERROR(_create_column_writer(current_column_id /*unused*/, tablet_column, _flush_schema)); RETURN_IF_ERROR(_olap_data_convertor->set_source_content_with_specifid_column( diff --git a/be/src/vec/common/schema_util.cpp b/be/src/vec/common/schema_util.cpp index 2b205bbdb2a..fe2f664b397 100644 --- a/be/src/vec/common/schema_util.cpp +++ b/be/src/vec/common/schema_util.cpp @@ -366,7 +366,7 @@ void update_least_sparse_column(const std::vector<TabletSchemaSPtr>& schemas, } void inherit_column_attributes(const TabletColumn& source, TabletColumn& target, - TabletSchemaSPtr& target_schema) { + TabletSchemaSPtr* target_schema) { DCHECK(target.is_extracted_column()); target.set_aggregation_method(source.aggregation()); @@ -379,19 +379,25 @@ void inherit_column_attributes(const TabletColumn& source, TabletColumn& target, target.set_is_bf_column(source.is_bf_column()); } + if (!target_schema) { + return; + } + // 2. inverted index - const auto* source_index_meta = target_schema->inverted_index(source.unique_id()); + const auto* source_index_meta = (*target_schema)->inverted_index(source.unique_id()); if (source_index_meta != nullptr) { // add index meta TabletIndex index_info = *source_index_meta; index_info.set_escaped_escaped_index_suffix_path(target.path_info_ptr()->get_path()); - const auto* target_index_meta = target_schema->inverted_index( - target.parent_unique_id(), target.path_info_ptr()->get_path()); + const auto* target_index_meta = + (*target_schema) + ->inverted_index(target.parent_unique_id(), + target.path_info_ptr()->get_path()); if (target_index_meta != nullptr) { // already exist - target_schema->update_index(target, IndexType::INVERTED, std::move(index_info)); + (*target_schema)->update_index(target, IndexType::INVERTED, std::move(index_info)); } else { - target_schema->append_index(std::move(index_info)); + (*target_schema)->append_index(std::move(index_info)); } } @@ -409,7 +415,7 @@ void inherit_column_attributes(TabletSchemaSPtr& schema) { // parent column is missing, maybe dropped continue; } - inherit_column_attributes(schema->column_by_uid(col.parent_unique_id()), col, schema); + inherit_column_attributes(schema->column_by_uid(col.parent_unique_id()), col, &schema); } } diff --git a/be/src/vec/common/schema_util.h b/be/src/vec/common/schema_util.h index 795c700e636..4a916618da6 100644 --- a/be/src/vec/common/schema_util.h +++ b/be/src/vec/common/schema_util.h @@ -113,7 +113,7 @@ void inherit_column_attributes(TabletSchemaSPtr& schema); // source: variant column // target: extracted column from variant column void inherit_column_attributes(const TabletColumn& source, TabletColumn& target, - TabletSchemaSPtr& target_schema); + TabletSchemaSPtr* target_schema = nullptr); // get sorted subcolumns of variant vectorized::ColumnObject::Subcolumns get_sorted_subcolumns( diff --git a/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java b/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java index bfe16022472..2d4b883c4e6 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java +++ b/fe/fe-core/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java @@ -677,6 +677,12 @@ public class SchemaChangeHandler extends AlterHandler { modColumn.setName(oriColumn.getName()); modColumn.setUniqueId(oriColumn.getUniqueId()); + Type type = modColumn.getType(); + if (type.isVariantType()) { + ScalarType scType = (ScalarType) type; + scType.setVariantMaxSubcolumnsCount(olapTable.getVariantMaxSubcolumnsCount()); + } + if (!modColumn.equals(oriColumn) && oriColumn.isAutoInc() != modColumn.isAutoInc()) { throw new DdlException("Can't modify the column[" + oriColumn.getName() + "]'s auto-increment attribute."); diff --git a/regression-test/data/variant_p0/update/inverted_index/load.out b/regression-test/data/variant_p0/update/inverted_index/load.out index 771bc652d02..a3c88e7b424 100644 Binary files a/regression-test/data/variant_p0/update/inverted_index/load.out and b/regression-test/data/variant_p0/update/inverted_index/load.out differ diff --git a/regression-test/data/variant_p0/update/inverted_index/query.out b/regression-test/data/variant_p0/update/inverted_index/query.out index d71b7a8eca8..7a5953cf127 100644 Binary files a/regression-test/data/variant_p0/update/inverted_index/query.out and b/regression-test/data/variant_p0/update/inverted_index/query.out differ diff --git a/regression-test/suites/fault_injection_p0/test_variant_bloom_filter.groovy b/regression-test/suites/fault_injection_p0/test_variant_bloom_filter.groovy index 88c529d685d..d7ecf4c5233 100644 --- a/regression-test/suites/fault_injection_p0/test_variant_bloom_filter.groovy +++ b/regression-test/suites/fault_injection_p0/test_variant_bloom_filter.groovy @@ -59,7 +59,7 @@ suite("test_variant_bloom_filter", "nonConcurrent") { ) DUPLICATE KEY(`k`) DISTRIBUTED BY HASH(k) BUCKETS 1 - properties("replication_num" = "1", "disable_auto_compaction" = "false", "bloom_filter_columns" = "v"); + properties("replication_num" = "1", "disable_auto_compaction" = "false", "bloom_filter_columns" = "v", "variant_max_subcolumns_count" = "9999"); """ load_json_data.call(index_table, """${getS3Url() + '/regression/gharchive.m/2015-01-01-0.json'}""") load_json_data.call(index_table, """${getS3Url() + '/regression/gharchive.m/2015-01-01-0.json'}""") @@ -72,7 +72,7 @@ suite("test_variant_bloom_filter", "nonConcurrent") { getBackendIpHttpPort(backendId_to_backendIP, backendId_to_backendHttpPort); def tablets = sql_return_maparray """ show tablets from ${index_table}; """ - + def code, out, err, backend_id; for (def tablet in tablets) { int beforeSegmentCount = 0 String tablet_id = tablet.TabletId @@ -100,7 +100,8 @@ suite("test_variant_bloom_filter", "nonConcurrent") { // wait for all compactions done for (def tablet in tablets) { - Awaitility.await().atMost(3, TimeUnit.MINUTES).untilAsserted(() -> { + Awaitility.await().atMost(10, TimeUnit.MINUTES).untilAsserted(() -> { + Thread.sleep(5000) String tablet_id = tablet.TabletId backend_id = tablet.BackendId (code, out, err) = be_get_compaction_status(backendId_to_backendIP.get(backend_id), backendId_to_backendHttpPort.get(backend_id), tablet_id) diff --git a/regression-test/suites/variant_p0/update/inverted_index/load.groovy b/regression-test/suites/variant_p0/update/inverted_index/load.groovy index bc7a93e8f9a..79f602d2a16 100644 --- a/regression-test/suites/variant_p0/update/inverted_index/load.groovy +++ b/regression-test/suites/variant_p0/update/inverted_index/load.groovy @@ -45,8 +45,9 @@ suite("update_test_index_load", "p0") { } } - - def create_table_load_data = {create_table_name-> + + def create_table_load_data = {create_table_name, format-> + sql """ set disable_inverted_index_v1_for_variant = false """ sql "DROP TABLE IF EXISTS ${create_table_name}" sql """ CREATE TABLE IF NOT EXISTS ${create_table_name} ( @@ -55,23 +56,51 @@ suite("update_test_index_load", "p0") { INDEX idx(v) USING INVERTED PROPERTIES("parser"="standard") ) DUPLICATE KEY(`k`) - DISTRIBUTED BY HASH(k) BUCKETS 6 - properties("replication_num" = "1", "disable_auto_compaction" = "true", "variant_max_subcolumns_count" = "0"); + DISTRIBUTED BY HASH(k) BUCKETS 10 + properties( + "replication_num" = "1", + "disable_auto_compaction" = "true", + "bloom_filter_columns" = "v", + "inverted_index_storage_format" = ${format} + ); """ - for (int i = 0; i < 10; i++) { + for (int i = 0; i < 3; i++) { load_json_data.call(create_table_name, """${getS3Url() + '/regression/load/ghdata_sample.json'}""") } - sql """set enable_match_without_inverted_index = false""" - sql """ set inverted_index_skip_threshold = 0 """ - qt_sql """ select count() from ${create_table_name} """ - qt_sql """ select count() from ${create_table_name} where cast (v['repo']['name'] as string) match 'github'""" - qt_sql """ select count() from ${create_table_name} where cast (v['actor']['id'] as int) > 1575592 """ - qt_sql """ select count() from ${create_table_name} where cast (v['actor']['id'] as int) > 1575592 and cast (v['repo']['name'] as string) match 'github'""" + try { + GetDebugPoint().enableDebugPointForAllBEs("segment_iterator.apply_inverted_index") + sql """set enable_match_without_inverted_index = false""" + sql """ set inverted_index_skip_threshold = 0 """ + sql """ set enable_inverted_index_query = true """ + qt_sql """ select count() from ${create_table_name} """ + qt_sql """ select count() from ${create_table_name} where cast (v['repo']['name'] as string) match 'github'""" + qt_sql """ select count() from ${create_table_name} where cast (v['actor']['id'] as int) > 1575592 """ + qt_sql """ select count() from ${create_table_name} where cast (v['actor']['id'] as int) > 1575592 and cast (v['repo']['name'] as string) match 'github'""" + } finally { + GetDebugPoint().disableDebugPointForAllBEs("segment_iterator.apply_inverted_index") + } + + + try { + GetDebugPoint().enableDebugPointForAllBEs("bloom_filter_must_filter_data") + sql """ set enable_inverted_index_query = false """ + // number + qt_sql1 """ select count() from ${create_table_name} where cast(v['repo']['id'] as int) in (0, 1, 2, 3, 4, 5); """ + + // string + qt_sql2 """ select count() from ${create_table_name} where cast(v['repo']['name'] as text) = "xxxx"; """ + } finally { + GetDebugPoint().disableDebugPointForAllBEs("bloom_filter_must_filter_data") + } } - create_table_load_data.call("test_update_index_sc") - create_table_load_data.call("test_update_index_sc2") - create_table_load_data.call("test_update_index_compact") - create_table_load_data.call("test_update_index_compact2") + create_table_load_data.call("test_update_index_sc_v1", "V1") + create_table_load_data.call("test_update_index_sc_v2", "V2") + create_table_load_data.call("test_update_index_sc2_v1", "V1") + create_table_load_data.call("test_update_index_sc2_v2", "V2") + create_table_load_data.call("test_update_index_compact_v1", "V1") + create_table_load_data.call("test_update_index_compact_v2", "V2") + create_table_load_data.call("test_update_index_compact2_v1", "V1") + create_table_load_data.call("test_update_index_compact2_v2", "V2") } diff --git a/regression-test/suites/variant_p0/update/inverted_index/query.groovy b/regression-test/suites/variant_p0/update/inverted_index/query.groovy index a156a560fcd..d5bdfcc5f72 100644 --- a/regression-test/suites/variant_p0/update/inverted_index/query.groovy +++ b/regression-test/suites/variant_p0/update/inverted_index/query.groovy @@ -101,21 +101,62 @@ suite("update_test_index_query", "p0") { } def normal_check = {check_table_name-> - sql """set enable_match_without_inverted_index = false """ - sql """ set inverted_index_skip_threshold = 0 """ - qt_sql """ select count() from ${check_table_name} """ - qt_sql """ select count() from ${check_table_name} where cast (v['repo']['name'] as string) match 'github'""" - qt_sql """ select count() from ${check_table_name} where cast (v['actor']['id'] as int) > 1575592 """ - qt_sql """ select count() from ${check_table_name} where cast (v['actor']['id'] as int) > 1575592 and cast (v['repo']['name'] as string) match 'github'""" + try { + GetDebugPoint().enableDebugPointForAllBEs("segment_iterator.apply_inverted_index") + sql """set enable_match_without_inverted_index = false""" + sql """ set inverted_index_skip_threshold = 0 """ + sql """ set enable_inverted_index_query = true """ + qt_sql """ select count() from ${check_table_name} """ + qt_sql """ select count() from ${check_table_name} where cast (v['repo']['name'] as string) match 'github'""" + qt_sql """ select count() from ${check_table_name} where cast (v['actor']['id'] as int) > 1575592 """ + qt_sql """ select count() from ${check_table_name} where cast (v['actor']['id'] as int) > 1575592 and cast (v['repo']['name'] as string) match 'github'""" + } finally { + GetDebugPoint().disableDebugPointForAllBEs("segment_iterator.apply_inverted_index") + } + + try { + GetDebugPoint().enableDebugPointForAllBEs("bloom_filter_must_filter_data") + sql """ set enable_inverted_index_query = false """ + // number + qt_sql1 """ select count() from ${check_table_name} where cast(v['repo']['id'] as int) in (0, 1, 2, 3, 4, 5); """ + + // string + qt_sql2 """ select count() from ${check_table_name} where cast(v['repo']['name'] as text) in ("aaaaaa"); """ + } finally { + GetDebugPoint().disableDebugPointForAllBEs("bloom_filter_must_filter_data") + } + } + + + + + + def table_name = "test_update_index_compact_v1" + + for (int i = 0; i < 3; i++) { + load_json_data.call(table_name, """${getS3Url() + '/regression/load/ghdata_sample.json'}""") } - def table_name = "test_update_index_compact" + normal_check.call("test_update_index_compact_v1") + + compaction.call("test_update_index_compact_v1") - for (int i = 0; i < 2; i++) { + normal_check.call("test_update_index_compact_v1") + + table_name = "test_update_index_compact_v2" + + for (int i = 0; i < 3; i++) { load_json_data.call(table_name, """${getS3Url() + '/regression/load/ghdata_sample.json'}""") } - GetDebugPoint().enableDebugPointForAllBEs("segment_iterator.apply_inverted_index") + normal_check.call("test_update_index_compact_v2") + + compaction.call("test_update_index_compact_v2") + + normal_check.call("test_update_index_compact_v2") + + + table_name = "test_update_index_compact2_v1" normal_check.call(table_name) @@ -123,7 +164,7 @@ suite("update_test_index_query", "p0") { normal_check.call(table_name) - table_name = "test_update_index_compact2" + table_name = "test_update_index_compact2_v2" normal_check.call(table_name) @@ -177,25 +218,45 @@ suite("update_test_index_query", "p0") { } - def table_name_sc = "test_update_index_sc" + // def table_name_sc = "test_update_index_sc_v1" - for (int i = 0; i < 2; i++) { - load_json_data.call(table_name_sc, """${getS3Url() + '/regression/load/ghdata_sample.json'}""") - } + // for (int i = 0; i < 3; i++) { + // load_json_data.call(table_name_sc, """${getS3Url() + '/regression/load/ghdata_sample.json'}""") + // } + + // normal_check.call(table_name_sc) + + // schema_change.call(table_name_sc) + + // normal_check.call(table_name_sc) + + // table_name_sc = "test_update_index_sc_v2" + + // for (int i = 0; i < 3; i++) { + // load_json_data.call(table_name_sc, """${getS3Url() + '/regression/load/ghdata_sample.json'}""") + // } + + // normal_check.call(table_name_sc) + + // schema_change.call(table_name_sc) + + // normal_check.call(table_name_sc) + + // table_name_sc = "test_update_index_sc2_v1" - normal_check.call(table_name_sc) + // normal_check.call(table_name_sc) - schema_change.call(table_name_sc) + // schema_change.call(table_name_sc) - normal_check.call(table_name_sc) + // normal_check.call(table_name_sc) - table_name_sc = "test_update_index_sc2" + // table_name_sc = "test_update_index_sc2_v2" - normal_check.call(table_name_sc) + // normal_check.call(table_name_sc) - schema_change.call(table_name_sc) + // schema_change.call(table_name_sc) - normal_check.call(table_name_sc) + // normal_check.call(table_name_sc) - GetDebugPoint().disableDebugPointForAllBEs("segment_iterator.apply_inverted_index") + } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org