github-actions[bot] commented on code in PR #45287: URL: https://github.com/apache/doris/pull/45287#discussion_r1889694522
########## be/test/olap/segments_key_bounds_truncation_test.cpp: ########## @@ -0,0 +1,777 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gen_cpp/olap_file.pb.h> +#include <gen_cpp/segment_v2.pb.h> +#include <gtest/gtest.h> + +#include <algorithm> +#include <memory> +#include <random> +#include <string> +#include <vector> + +#include "common/config.h" +#include "io/fs/local_file_system.h" +#include "olap/cumulative_compaction.h" +#include "olap/rowset/rowset_factory.h" +#include "olap/rowset/segment_v2/segment.h" +#include "olap/rowset/segment_v2/segment_writer.h" +#include "olap/storage_engine.h" +#include "olap/tablet_meta.h" +#include "olap/tablet_reader.h" +#include "olap/tablet_schema.h" +#include "runtime/exec_env.h" +#include "util/key_util.h" +#include "vec/olap/block_reader.h" + +namespace doris { +static std::string kSegmentDir = "./ut_dir/segments_key_bounds_truncation_test"; + +class SegmentsKeyBoundsTruncationTest : public testing::Test { +private: + StorageEngine* engine_ref = nullptr; + string absolute_dir; + std::unique_ptr<DataDir> data_dir; + int cur_version {2}; + +public: + void SetUp() override { + auto st = io::global_local_filesystem()->delete_directory(kSegmentDir); + 
ASSERT_TRUE(st.ok()) << st; + st = io::global_local_filesystem()->create_directory(kSegmentDir); + ASSERT_TRUE(st.ok()) << st; + doris::EngineOptions options; + auto engine = std::make_unique<StorageEngine>(options); + engine_ref = engine.get(); + data_dir = std::make_unique<DataDir>(*engine_ref, kSegmentDir); + ASSERT_TRUE(data_dir->update_capacity().ok()); + ExecEnv::GetInstance()->set_storage_engine(std::move(engine)); + } + + void TearDown() override { + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(kSegmentDir).ok()); + engine_ref = nullptr; + ExecEnv::GetInstance()->set_storage_engine(nullptr); + } + + TabletSchemaSPtr create_schema(int varchar_length) { + TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>(); + TabletSchemaPB tablet_schema_pb; + tablet_schema_pb.set_keys_type(DUP_KEYS); + tablet_schema_pb.set_num_short_key_columns(1); + tablet_schema_pb.set_num_rows_per_row_block(1024); + tablet_schema_pb.set_compress_kind(COMPRESS_NONE); + tablet_schema_pb.set_next_column_unique_id(4); + + ColumnPB* column_1 = tablet_schema_pb.add_column(); + column_1->set_unique_id(1); + column_1->set_name("k1"); + column_1->set_type("VARCHAR"); + column_1->set_is_key(true); + column_1->set_length(varchar_length); + column_1->set_index_length(36); + column_1->set_is_nullable(false); + column_1->set_is_bf_column(false); + + ColumnPB* column_2 = tablet_schema_pb.add_column(); + column_2->set_unique_id(2); + column_2->set_name("c1"); + column_2->set_type("INT"); + column_2->set_length(4); + column_2->set_index_length(4); + column_2->set_is_nullable(true); + column_2->set_is_key(false); + column_2->set_is_nullable(true); + column_2->set_is_bf_column(false); + + tablet_schema->init_from_pb(tablet_schema_pb); + return tablet_schema; + } + + TabletSharedPtr create_tablet(const TabletSchema& tablet_schema, + bool enable_unique_key_merge_on_write) { + std::vector<TColumn> cols; + std::unordered_map<uint32_t, uint32_t> col_ordinal_to_unique_id; + for (auto 
i = 0; i < tablet_schema.num_columns(); i++) { + const TabletColumn& column = tablet_schema.column(i); + TColumn col; + col.column_type.type = TPrimitiveType::INT; + col.__set_column_name(column.name()); + col.__set_is_key(column.is_key()); + cols.push_back(col); + col_ordinal_to_unique_id[i] = column.unique_id(); + } + + TTabletSchema t_tablet_schema; + t_tablet_schema.__set_short_key_column_count(tablet_schema.num_short_key_columns()); + t_tablet_schema.__set_schema_hash(3333); + if (tablet_schema.keys_type() == UNIQUE_KEYS) { + t_tablet_schema.__set_keys_type(TKeysType::UNIQUE_KEYS); + } else if (tablet_schema.keys_type() == DUP_KEYS) { + t_tablet_schema.__set_keys_type(TKeysType::DUP_KEYS); + } else if (tablet_schema.keys_type() == AGG_KEYS) { + t_tablet_schema.__set_keys_type(TKeysType::AGG_KEYS); + } + t_tablet_schema.__set_storage_type(TStorageType::COLUMN); + t_tablet_schema.__set_columns(cols); + TabletMetaSharedPtr tablet_meta {std::make_shared<TabletMeta>( + 2, 2, 2, 2, 2, 2, t_tablet_schema, 2, col_ordinal_to_unique_id, UniqueId(1, 2), + TTabletType::TABLET_TYPE_DISK, TCompressionType::LZ4F, 0, + enable_unique_key_merge_on_write)}; + + TabletSharedPtr tablet {std::make_shared<Tablet>(*engine_ref, tablet_meta, data_dir.get())}; + EXPECT_TRUE(tablet->init().ok()); + return tablet; + } + + RowsetWriterContext create_rowset_writer_context(TabletSchemaSPtr tablet_schema, + const SegmentsOverlapPB& overlap, + uint32_t max_rows_per_segment, + Version version) { + RowsetWriterContext rowset_writer_context; + rowset_writer_context.rowset_id = engine_ref->next_rowset_id(); + rowset_writer_context.rowset_type = BETA_ROWSET; + rowset_writer_context.rowset_state = VISIBLE; + rowset_writer_context.tablet_schema = tablet_schema; + rowset_writer_context.tablet_path = kSegmentDir; + rowset_writer_context.version = version; + rowset_writer_context.segments_overlap = overlap; + rowset_writer_context.max_rows_per_segment = max_rows_per_segment; + return 
rowset_writer_context; + } + + void create_and_init_rowset_reader(Rowset* rowset, RowsetReaderContext& context, + RowsetReaderSharedPtr* result) { + auto s = rowset->create_reader(result); + EXPECT_TRUE(s.ok()); + EXPECT_TRUE(*result != nullptr); + + s = (*result)->init(&context); + EXPECT_TRUE(s.ok()); + } + + std::vector<vectorized::Block> generate_blocks( + TabletSchemaSPtr tablet_schema, const std::vector<std::vector<std::string>>& data) { + std::vector<vectorized::Block> ret; + int const_value = 999; + for (const auto& segment_rows : data) { + vectorized::Block block = tablet_schema->create_block(); + auto columns = block.mutate_columns(); + for (const auto& row : segment_rows) { + columns[0]->insert_data(row.data(), row.size()); + columns[1]->insert_data(reinterpret_cast<const char*>(&const_value), + sizeof(const_value)); + } + ret.emplace_back(std::move(block)); + } + return ret; + } + + std::vector<std::vector<std::string>> get_expected_key_bounds( + const std::vector<std::vector<std::string>>& data) { + std::vector<std::vector<std::string>> ret; + for (const auto& rows : data) { + auto& cur = ret.emplace_back(); + auto min_key = rows.front(); + auto max_key = rows.front(); + for (const auto& row : rows) { + if (row < min_key) { + min_key = row; + } + if (row > max_key) { + max_key = row; + } + } + + // segments key bounds have marker + min_key = std::string {KEY_NORMAL_MARKER} + min_key; + max_key = std::string {KEY_NORMAL_MARKER} + max_key; + + cur.emplace_back(do_trunacte(min_key)); + cur.emplace_back(do_trunacte(max_key)); + } + return ret; + } + + RowsetSharedPtr create_rowset(TabletSchemaSPtr tablet_schema, SegmentsOverlapPB overlap, + const std::vector<vectorized::Block> blocks, int64_t version, + bool is_vertical) { + auto writer_context = create_rowset_writer_context(tablet_schema, overlap, UINT32_MAX, + {version, version}); + auto res = RowsetFactory::create_rowset_writer(*engine_ref, writer_context, is_vertical); + EXPECT_TRUE(res.has_value()) << 
res.error(); + auto rowset_writer = std::move(res).value(); + + uint32_t num_rows = 0; + for (const auto& block : blocks) { + num_rows += block.rows(); + EXPECT_TRUE(rowset_writer->add_block(&block).ok()); + EXPECT_TRUE(rowset_writer->flush().ok()); + } + + RowsetSharedPtr rowset; + EXPECT_EQ(Status::OK(), rowset_writer->build(rowset)); + EXPECT_EQ(blocks.size(), rowset->rowset_meta()->num_segments()); + EXPECT_EQ(num_rows, rowset->rowset_meta()->num_rows()); + return rowset; + } + + std::string do_trunacte(std::string key) { + if (segments_key_bounds_truncation_enabled()) { + auto threshold = config::segments_key_bounds_truncation_threshold; + if (key.size() > threshold) { + key.resize(threshold); + } + } + return key; + } + + bool segments_key_bounds_truncation_enabled() { + return (config::enable_segments_key_bounds_truncation && + config::segments_key_bounds_truncation_threshold > 0); + } + + void check_key_bounds(const std::vector<std::vector<std::string>>& data, + const std::vector<KeyBoundsPB>& segments_key_bounds) { + // 1. check size + for (const auto& segments_key_bound : segments_key_bounds) { + const auto& min_key = segments_key_bound.min_key(); + const auto& max_key = segments_key_bound.max_key(); + + if (segments_key_bounds_truncation_enabled()) { + EXPECT_LE(min_key.size(), config::segments_key_bounds_truncation_threshold); + EXPECT_LE(max_key.size(), config::segments_key_bounds_truncation_threshold); + } + } + + // 2. 
check content + auto expected_key_bounds = get_expected_key_bounds(data); + for (std::size_t i = 0; i < expected_key_bounds.size(); i++) { + const auto& min_key = segments_key_bounds[i].min_key(); + const auto& max_key = segments_key_bounds[i].max_key(); + + EXPECT_EQ(min_key, expected_key_bounds[i][0]); + EXPECT_EQ(max_key, expected_key_bounds[i][1]); + std::cout << fmt::format("min_key={}, size={}\nmax_key={}, size={}\n", + hexdump(min_key.data(), min_key.size()), min_key.size(), + hexdump(max_key.data(), max_key.size()), max_key.size()); + } + } + + std::vector<RowsetSharedPtr> create_rowsets(TabletSchemaSPtr tablet_schema, + const std::vector<std::vector<std::string>>& data, + const std::vector<int64_t>& truncate_lengths = {}) { + std::vector<RowsetSharedPtr> rowsets; + for (size_t i {0}; i < data.size(); i++) { + const auto rows = data[i]; + if (!truncate_lengths.empty()) { + config::enable_segments_key_bounds_truncation = true; + config::segments_key_bounds_truncation_threshold = truncate_lengths[i]; + } + std::vector<std::vector<std::string>> rowset_data {rows}; + auto blocks = generate_blocks(tablet_schema, rowset_data); + RowsetSharedPtr rowset = + create_rowset(tablet_schema, NONOVERLAPPING, blocks, cur_version++, false); + + std::vector<KeyBoundsPB> segments_key_bounds; + rowset->rowset_meta()->get_segments_key_bounds(&segments_key_bounds); + for (const auto& segments_key_bound : segments_key_bounds) { + const auto& min_key = segments_key_bound.min_key(); + const auto& max_key = segments_key_bound.max_key(); + + LOG(INFO) << fmt::format( + "\n==== rowset_id={}, segment_key_bounds_truncated={} ====\nmin_key={}, " + "size={}\nmax_key={}, size={}\n", + rowset->rowset_id().to_string(), rowset->is_segments_key_bounds_truncated(), + min_key, min_key.size(), max_key, max_key.size()); + } + + rowsets.push_back(rowset); + RowsetReaderSharedPtr rs_reader; + EXPECT_TRUE(rowset->create_reader(&rs_reader)); + } + for (std::size_t i {0}; i < truncate_lengths.size(); 
i++) { + EXPECT_EQ((truncate_lengths[i] > 0), rowsets[i]->is_segments_key_bounds_truncated()); + } + return rowsets; + } + + TabletReader::ReaderParams create_reader_params( + TabletSchemaSPtr tablet_schema, const std::vector<std::vector<std::string>>& data, + const std::vector<int64_t>& truncate_lengths = {}) { + TabletReader::ReaderParams reader_params; + std::vector<RowsetSharedPtr> rowsets = + create_rowsets(tablet_schema, data, truncate_lengths); + std::vector<RowSetSplits> rs_splits; + for (size_t i {0}; i < rowsets.size(); i++) { + RowsetReaderSharedPtr rs_reader; + EXPECT_TRUE(rowsets[i]->create_reader(&rs_reader)); + RowSetSplits rs_split; + rs_split.rs_reader = rs_reader; + rs_splits.emplace_back(rs_split); + } + reader_params.rs_splits = std::move(rs_splits); + return reader_params; + } +}; + +TEST_F(SegmentsKeyBoundsTruncationTest, CompareFuncTest) { + // test `Slice::origin_is_strictly_less_than` + // enumerating all possible combinations + // this test is reduntant, n = 3 is enough + constexpr int n = 8; + std::vector<std::string> datas; + for (int l = 1; l <= n; l++) { + for (int x = 0; x < (1 << l); x++) { + datas.emplace_back(fmt::format("{:0{width}b}", x, fmt::arg("width", l))); + } + } + std::cout << "datas.size()=" << datas.size() << "\n"; + + int count1 {0}, count2 {0}, total {0}; + for (size_t i = 0; i < datas.size(); i++) { + for (size_t j = 0; j < datas.size(); j++) { + Slice X {datas[i]}; + Slice Y {datas[j]}; + for (int l1 = 0; l1 <= n; l1++) { + bool X_is_truncated = (l1 != 0); + Slice a {X}; + if (X_is_truncated && X.get_size() >= l1) { + a.truncate(l1); + } + for (int l2 = 0; l2 <= n; l2++) { + bool Y_is_truncated = (l2 != 0); + Slice b {Y}; + if (Y_is_truncated && Y.get_size() >= l2) { + b.truncate(l2); + } + + bool res1 = Slice::origin_is_strictly_less_than(a, X_is_truncated, b, + Y_is_truncated); + bool res2 = (X.compare(Y) < 0); + ++total; + if (res1 && res2) { + ++count1; + } + if (res2) { + ++count2; + } + EXPECT_FALSE(res1 && 
!res2) << fmt::format( + "X={}, a={}, l1={}, Y={}, b={}, l2={}, res1={}, res2={}", X.to_string(), + a.to_string(), l1, Y.to_string(), b.to_string(), l2, res1, res2); + } + } + } + } + std::cout << fmt::format("count1={}, count2={}, count1/count2={}, total={}\n", count1, count2, + double(count1) / count2, total); +} + +TEST_F(SegmentsKeyBoundsTruncationTest, BasicTruncationTest) { + { + // 1. don't do segments key bounds truncation when the config is off + config::enable_segments_key_bounds_truncation = false; + config::segments_key_bounds_truncation_threshold = 36; + + auto tablet_schema = create_schema(100); + std::vector<std::vector<std::string>> data {{std::string(2, 'x'), std::string(3, 'y')}, + {std::string(4, 'a'), std::string(15, 'b')}, + {std::string(18, 'c'), std::string(5, 'z')}, + {std::string(20, '0'), std::string(22, '1')}}; + auto blocks = generate_blocks(tablet_schema, data); + RowsetSharedPtr rowset = create_rowset(tablet_schema, NONOVERLAPPING, blocks, 2, false); + + auto rowset_meta = rowset->rowset_meta(); + EXPECT_EQ(false, rowset_meta->is_segments_key_bounds_truncated()); + std::vector<KeyBoundsPB> segments_key_bounds; + rowset_meta->get_segments_key_bounds(&segments_key_bounds); + EXPECT_EQ(segments_key_bounds.size(), data.size()); + check_key_bounds(data, segments_key_bounds); + } + + { + // 2. 
do segments key bounds truncation when the config is on + config::enable_segments_key_bounds_truncation = true; + config::segments_key_bounds_truncation_threshold = 10; + + auto tablet_schema = create_schema(100); + std::vector<std::vector<std::string>> data {{std::string(2, 'x'), std::string(3, 'y')}, + {std::string(4, 'a'), std::string(15, 'b')}, + {std::string(18, 'c'), std::string(5, 'z')}, + {std::string(20, '0'), std::string(22, '1')}}; + auto blocks = generate_blocks(tablet_schema, data); + RowsetSharedPtr rowset = create_rowset(tablet_schema, NONOVERLAPPING, blocks, 2, false); + + auto rowset_meta = rowset->rowset_meta(); + EXPECT_EQ(true, rowset_meta->is_segments_key_bounds_truncated()); + std::vector<KeyBoundsPB> segments_key_bounds; + rowset_meta->get_segments_key_bounds(&segments_key_bounds); + EXPECT_EQ(segments_key_bounds.size(), data.size()); + check_key_bounds(data, segments_key_bounds); + } +} + +TEST_F(SegmentsKeyBoundsTruncationTest, BlockReaderJudgeFuncTest) { Review Comment: warning: function 'TEST_F' exceeds recommended size/complexity thresholds [readability-function-size] ```cpp TEST_F(SegmentsKeyBoundsTruncationTest, BlockReaderJudgeFuncTest) { ^ ``` <details> <summary>Additional context</summary> **be/test/olap/segments_key_bounds_truncation_test.cpp:428:** 209 lines including whitespace and comments (threshold 80) ```cpp TEST_F(SegmentsKeyBoundsTruncationTest, BlockReaderJudgeFuncTest) { ^ ``` </details> ########## be/test/olap/segments_key_bounds_truncation_test.cpp: ########## @@ -0,0 +1,777 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gen_cpp/olap_file.pb.h> +#include <gen_cpp/segment_v2.pb.h> +#include <gtest/gtest.h> + +#include <algorithm> +#include <memory> +#include <random> +#include <string> +#include <vector> + +#include "common/config.h" +#include "io/fs/local_file_system.h" +#include "olap/cumulative_compaction.h" +#include "olap/rowset/rowset_factory.h" +#include "olap/rowset/segment_v2/segment.h" +#include "olap/rowset/segment_v2/segment_writer.h" +#include "olap/storage_engine.h" +#include "olap/tablet_meta.h" +#include "olap/tablet_reader.h" +#include "olap/tablet_schema.h" +#include "runtime/exec_env.h" +#include "util/key_util.h" +#include "vec/olap/block_reader.h" + +namespace doris { +static std::string kSegmentDir = "./ut_dir/segments_key_bounds_truncation_test"; + +class SegmentsKeyBoundsTruncationTest : public testing::Test { +private: + StorageEngine* engine_ref = nullptr; + string absolute_dir; + std::unique_ptr<DataDir> data_dir; + int cur_version {2}; + +public: + void SetUp() override { + auto st = io::global_local_filesystem()->delete_directory(kSegmentDir); + ASSERT_TRUE(st.ok()) << st; + st = io::global_local_filesystem()->create_directory(kSegmentDir); + ASSERT_TRUE(st.ok()) << st; + doris::EngineOptions options; + auto engine = std::make_unique<StorageEngine>(options); + engine_ref = engine.get(); + data_dir = std::make_unique<DataDir>(*engine_ref, kSegmentDir); + ASSERT_TRUE(data_dir->update_capacity().ok()); + ExecEnv::GetInstance()->set_storage_engine(std::move(engine)); + } + + void TearDown() override { + 
EXPECT_TRUE(io::global_local_filesystem()->delete_directory(kSegmentDir).ok()); + engine_ref = nullptr; + ExecEnv::GetInstance()->set_storage_engine(nullptr); + } + + TabletSchemaSPtr create_schema(int varchar_length) { + TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>(); + TabletSchemaPB tablet_schema_pb; + tablet_schema_pb.set_keys_type(DUP_KEYS); + tablet_schema_pb.set_num_short_key_columns(1); + tablet_schema_pb.set_num_rows_per_row_block(1024); + tablet_schema_pb.set_compress_kind(COMPRESS_NONE); + tablet_schema_pb.set_next_column_unique_id(4); + + ColumnPB* column_1 = tablet_schema_pb.add_column(); + column_1->set_unique_id(1); + column_1->set_name("k1"); + column_1->set_type("VARCHAR"); + column_1->set_is_key(true); + column_1->set_length(varchar_length); + column_1->set_index_length(36); + column_1->set_is_nullable(false); + column_1->set_is_bf_column(false); + + ColumnPB* column_2 = tablet_schema_pb.add_column(); + column_2->set_unique_id(2); + column_2->set_name("c1"); + column_2->set_type("INT"); + column_2->set_length(4); + column_2->set_index_length(4); + column_2->set_is_nullable(true); + column_2->set_is_key(false); + column_2->set_is_nullable(true); + column_2->set_is_bf_column(false); + + tablet_schema->init_from_pb(tablet_schema_pb); + return tablet_schema; + } + + TabletSharedPtr create_tablet(const TabletSchema& tablet_schema, + bool enable_unique_key_merge_on_write) { + std::vector<TColumn> cols; + std::unordered_map<uint32_t, uint32_t> col_ordinal_to_unique_id; + for (auto i = 0; i < tablet_schema.num_columns(); i++) { + const TabletColumn& column = tablet_schema.column(i); + TColumn col; + col.column_type.type = TPrimitiveType::INT; + col.__set_column_name(column.name()); + col.__set_is_key(column.is_key()); + cols.push_back(col); + col_ordinal_to_unique_id[i] = column.unique_id(); + } + + TTabletSchema t_tablet_schema; + t_tablet_schema.__set_short_key_column_count(tablet_schema.num_short_key_columns()); + 
t_tablet_schema.__set_schema_hash(3333); + if (tablet_schema.keys_type() == UNIQUE_KEYS) { + t_tablet_schema.__set_keys_type(TKeysType::UNIQUE_KEYS); + } else if (tablet_schema.keys_type() == DUP_KEYS) { + t_tablet_schema.__set_keys_type(TKeysType::DUP_KEYS); + } else if (tablet_schema.keys_type() == AGG_KEYS) { + t_tablet_schema.__set_keys_type(TKeysType::AGG_KEYS); + } + t_tablet_schema.__set_storage_type(TStorageType::COLUMN); + t_tablet_schema.__set_columns(cols); + TabletMetaSharedPtr tablet_meta {std::make_shared<TabletMeta>( + 2, 2, 2, 2, 2, 2, t_tablet_schema, 2, col_ordinal_to_unique_id, UniqueId(1, 2), + TTabletType::TABLET_TYPE_DISK, TCompressionType::LZ4F, 0, + enable_unique_key_merge_on_write)}; + + TabletSharedPtr tablet {std::make_shared<Tablet>(*engine_ref, tablet_meta, data_dir.get())}; + EXPECT_TRUE(tablet->init().ok()); + return tablet; + } + + RowsetWriterContext create_rowset_writer_context(TabletSchemaSPtr tablet_schema, + const SegmentsOverlapPB& overlap, + uint32_t max_rows_per_segment, + Version version) { + RowsetWriterContext rowset_writer_context; + rowset_writer_context.rowset_id = engine_ref->next_rowset_id(); + rowset_writer_context.rowset_type = BETA_ROWSET; + rowset_writer_context.rowset_state = VISIBLE; + rowset_writer_context.tablet_schema = tablet_schema; + rowset_writer_context.tablet_path = kSegmentDir; + rowset_writer_context.version = version; + rowset_writer_context.segments_overlap = overlap; + rowset_writer_context.max_rows_per_segment = max_rows_per_segment; + return rowset_writer_context; + } + + void create_and_init_rowset_reader(Rowset* rowset, RowsetReaderContext& context, + RowsetReaderSharedPtr* result) { + auto s = rowset->create_reader(result); + EXPECT_TRUE(s.ok()); + EXPECT_TRUE(*result != nullptr); + + s = (*result)->init(&context); + EXPECT_TRUE(s.ok()); + } + + std::vector<vectorized::Block> generate_blocks( + TabletSchemaSPtr tablet_schema, const std::vector<std::vector<std::string>>& data) { + 
std::vector<vectorized::Block> ret; + int const_value = 999; + for (const auto& segment_rows : data) { + vectorized::Block block = tablet_schema->create_block(); + auto columns = block.mutate_columns(); + for (const auto& row : segment_rows) { + columns[0]->insert_data(row.data(), row.size()); + columns[1]->insert_data(reinterpret_cast<const char*>(&const_value), + sizeof(const_value)); + } + ret.emplace_back(std::move(block)); + } + return ret; + } + + std::vector<std::vector<std::string>> get_expected_key_bounds( + const std::vector<std::vector<std::string>>& data) { + std::vector<std::vector<std::string>> ret; + for (const auto& rows : data) { + auto& cur = ret.emplace_back(); + auto min_key = rows.front(); + auto max_key = rows.front(); + for (const auto& row : rows) { + if (row < min_key) { + min_key = row; + } + if (row > max_key) { + max_key = row; + } + } + + // segments key bounds have marker + min_key = std::string {KEY_NORMAL_MARKER} + min_key; + max_key = std::string {KEY_NORMAL_MARKER} + max_key; + + cur.emplace_back(do_trunacte(min_key)); + cur.emplace_back(do_trunacte(max_key)); + } + return ret; + } + + RowsetSharedPtr create_rowset(TabletSchemaSPtr tablet_schema, SegmentsOverlapPB overlap, + const std::vector<vectorized::Block> blocks, int64_t version, + bool is_vertical) { + auto writer_context = create_rowset_writer_context(tablet_schema, overlap, UINT32_MAX, + {version, version}); + auto res = RowsetFactory::create_rowset_writer(*engine_ref, writer_context, is_vertical); + EXPECT_TRUE(res.has_value()) << res.error(); + auto rowset_writer = std::move(res).value(); + + uint32_t num_rows = 0; + for (const auto& block : blocks) { + num_rows += block.rows(); + EXPECT_TRUE(rowset_writer->add_block(&block).ok()); + EXPECT_TRUE(rowset_writer->flush().ok()); + } + + RowsetSharedPtr rowset; + EXPECT_EQ(Status::OK(), rowset_writer->build(rowset)); + EXPECT_EQ(blocks.size(), rowset->rowset_meta()->num_segments()); + EXPECT_EQ(num_rows, 
rowset->rowset_meta()->num_rows()); + return rowset; + } + + std::string do_trunacte(std::string key) { + if (segments_key_bounds_truncation_enabled()) { + auto threshold = config::segments_key_bounds_truncation_threshold; + if (key.size() > threshold) { + key.resize(threshold); + } + } + return key; + } + + bool segments_key_bounds_truncation_enabled() { + return (config::enable_segments_key_bounds_truncation && + config::segments_key_bounds_truncation_threshold > 0); + } + + void check_key_bounds(const std::vector<std::vector<std::string>>& data, + const std::vector<KeyBoundsPB>& segments_key_bounds) { + // 1. check size + for (const auto& segments_key_bound : segments_key_bounds) { + const auto& min_key = segments_key_bound.min_key(); + const auto& max_key = segments_key_bound.max_key(); + + if (segments_key_bounds_truncation_enabled()) { + EXPECT_LE(min_key.size(), config::segments_key_bounds_truncation_threshold); + EXPECT_LE(max_key.size(), config::segments_key_bounds_truncation_threshold); + } + } + + // 2. 
check content + auto expected_key_bounds = get_expected_key_bounds(data); + for (std::size_t i = 0; i < expected_key_bounds.size(); i++) { + const auto& min_key = segments_key_bounds[i].min_key(); + const auto& max_key = segments_key_bounds[i].max_key(); + + EXPECT_EQ(min_key, expected_key_bounds[i][0]); + EXPECT_EQ(max_key, expected_key_bounds[i][1]); + std::cout << fmt::format("min_key={}, size={}\nmax_key={}, size={}\n", + hexdump(min_key.data(), min_key.size()), min_key.size(), + hexdump(max_key.data(), max_key.size()), max_key.size()); + } + } + + std::vector<RowsetSharedPtr> create_rowsets(TabletSchemaSPtr tablet_schema, + const std::vector<std::vector<std::string>>& data, + const std::vector<int64_t>& truncate_lengths = {}) { + std::vector<RowsetSharedPtr> rowsets; + for (size_t i {0}; i < data.size(); i++) { + const auto rows = data[i]; + if (!truncate_lengths.empty()) { + config::enable_segments_key_bounds_truncation = true; + config::segments_key_bounds_truncation_threshold = truncate_lengths[i]; + } + std::vector<std::vector<std::string>> rowset_data {rows}; + auto blocks = generate_blocks(tablet_schema, rowset_data); + RowsetSharedPtr rowset = + create_rowset(tablet_schema, NONOVERLAPPING, blocks, cur_version++, false); + + std::vector<KeyBoundsPB> segments_key_bounds; + rowset->rowset_meta()->get_segments_key_bounds(&segments_key_bounds); + for (const auto& segments_key_bound : segments_key_bounds) { + const auto& min_key = segments_key_bound.min_key(); + const auto& max_key = segments_key_bound.max_key(); + + LOG(INFO) << fmt::format( + "\n==== rowset_id={}, segment_key_bounds_truncated={} ====\nmin_key={}, " + "size={}\nmax_key={}, size={}\n", + rowset->rowset_id().to_string(), rowset->is_segments_key_bounds_truncated(), + min_key, min_key.size(), max_key, max_key.size()); + } + + rowsets.push_back(rowset); + RowsetReaderSharedPtr rs_reader; + EXPECT_TRUE(rowset->create_reader(&rs_reader)); + } + for (std::size_t i {0}; i < truncate_lengths.size(); 
i++) { + EXPECT_EQ((truncate_lengths[i] > 0), rowsets[i]->is_segments_key_bounds_truncated()); + } + return rowsets; + } + + TabletReader::ReaderParams create_reader_params( + TabletSchemaSPtr tablet_schema, const std::vector<std::vector<std::string>>& data, + const std::vector<int64_t>& truncate_lengths = {}) { + TabletReader::ReaderParams reader_params; + std::vector<RowsetSharedPtr> rowsets = + create_rowsets(tablet_schema, data, truncate_lengths); + std::vector<RowSetSplits> rs_splits; + for (size_t i {0}; i < rowsets.size(); i++) { + RowsetReaderSharedPtr rs_reader; + EXPECT_TRUE(rowsets[i]->create_reader(&rs_reader)); + RowSetSplits rs_split; + rs_split.rs_reader = rs_reader; + rs_splits.emplace_back(rs_split); + } + reader_params.rs_splits = std::move(rs_splits); + return reader_params; + } +}; + +TEST_F(SegmentsKeyBoundsTruncationTest, CompareFuncTest) { + // test `Slice::origin_is_strictly_less_than` + // enumerating all possible combinations + // this test is reduntant, n = 3 is enough + constexpr int n = 8; + std::vector<std::string> datas; + for (int l = 1; l <= n; l++) { + for (int x = 0; x < (1 << l); x++) { + datas.emplace_back(fmt::format("{:0{width}b}", x, fmt::arg("width", l))); + } + } + std::cout << "datas.size()=" << datas.size() << "\n"; + + int count1 {0}, count2 {0}, total {0}; + for (size_t i = 0; i < datas.size(); i++) { + for (size_t j = 0; j < datas.size(); j++) { + Slice X {datas[i]}; + Slice Y {datas[j]}; + for (int l1 = 0; l1 <= n; l1++) { + bool X_is_truncated = (l1 != 0); + Slice a {X}; + if (X_is_truncated && X.get_size() >= l1) { + a.truncate(l1); + } + for (int l2 = 0; l2 <= n; l2++) { + bool Y_is_truncated = (l2 != 0); + Slice b {Y}; + if (Y_is_truncated && Y.get_size() >= l2) { + b.truncate(l2); + } + + bool res1 = Slice::origin_is_strictly_less_than(a, X_is_truncated, b, + Y_is_truncated); + bool res2 = (X.compare(Y) < 0); + ++total; + if (res1 && res2) { + ++count1; + } + if (res2) { + ++count2; + } + EXPECT_FALSE(res1 && 
!res2) << fmt::format( + "X={}, a={}, l1={}, Y={}, b={}, l2={}, res1={}, res2={}", X.to_string(), + a.to_string(), l1, Y.to_string(), b.to_string(), l2, res1, res2); + } + } + } + } + std::cout << fmt::format("count1={}, count2={}, count1/count2={}, total={}\n", count1, count2, + double(count1) / count2, total); +} + +TEST_F(SegmentsKeyBoundsTruncationTest, BasicTruncationTest) { + { + // 1. don't do segments key bounds truncation when the config is off + config::enable_segments_key_bounds_truncation = false; + config::segments_key_bounds_truncation_threshold = 36; + + auto tablet_schema = create_schema(100); + std::vector<std::vector<std::string>> data {{std::string(2, 'x'), std::string(3, 'y')}, + {std::string(4, 'a'), std::string(15, 'b')}, + {std::string(18, 'c'), std::string(5, 'z')}, + {std::string(20, '0'), std::string(22, '1')}}; + auto blocks = generate_blocks(tablet_schema, data); + RowsetSharedPtr rowset = create_rowset(tablet_schema, NONOVERLAPPING, blocks, 2, false); + + auto rowset_meta = rowset->rowset_meta(); + EXPECT_EQ(false, rowset_meta->is_segments_key_bounds_truncated()); + std::vector<KeyBoundsPB> segments_key_bounds; + rowset_meta->get_segments_key_bounds(&segments_key_bounds); + EXPECT_EQ(segments_key_bounds.size(), data.size()); + check_key_bounds(data, segments_key_bounds); + } + + { + // 2. 
do segments key bounds truncation when the config is on + config::enable_segments_key_bounds_truncation = true; + config::segments_key_bounds_truncation_threshold = 10; + + auto tablet_schema = create_schema(100); + std::vector<std::vector<std::string>> data {{std::string(2, 'x'), std::string(3, 'y')}, + {std::string(4, 'a'), std::string(15, 'b')}, + {std::string(18, 'c'), std::string(5, 'z')}, + {std::string(20, '0'), std::string(22, '1')}}; + auto blocks = generate_blocks(tablet_schema, data); + RowsetSharedPtr rowset = create_rowset(tablet_schema, NONOVERLAPPING, blocks, 2, false); + + auto rowset_meta = rowset->rowset_meta(); + EXPECT_EQ(true, rowset_meta->is_segments_key_bounds_truncated()); + std::vector<KeyBoundsPB> segments_key_bounds; + rowset_meta->get_segments_key_bounds(&segments_key_bounds); + EXPECT_EQ(segments_key_bounds.size(), data.size()); + check_key_bounds(data, segments_key_bounds); + } +} + +TEST_F(SegmentsKeyBoundsTruncationTest, BlockReaderJudgeFuncTest) { + auto tablet_schema = create_schema(100); + + { + // all rowsets are truncated to same size + // keys are distinctable from any index + std::vector<std::vector<std::string>> data {{"aaaaaaaaa", "bbbbb"}, + {"cccccc", "dddddd"}, + {"eeeeeee", "fffffff"}, + {"xxxxxxx", "yyyyyyyy"}}; + { + config::enable_segments_key_bounds_truncation = false; + TabletReader::ReaderParams read_params = create_reader_params(tablet_schema, data); + vectorized::BlockReader block_reader; + EXPECT_FALSE(block_reader._rowsets_not_mono_asc_disjoint(read_params)); + } + + { + config::enable_segments_key_bounds_truncation = true; + config::segments_key_bounds_truncation_threshold = 3; + TabletReader::ReaderParams read_params = create_reader_params(tablet_schema, data); + vectorized::BlockReader block_reader; + // can still determine that segments are non ascending after truncation + EXPECT_FALSE(block_reader._rowsets_not_mono_asc_disjoint(read_params)); + } + } + + { + // all rowsets are truncated to same size + // 
keys are distinguishable from any index before truncation + // some keys are not comparable after truncation + std::vector<std::vector<std::string>> data {{"aaaaaaaaa", "bbbbb"}, + {"cccccccccccc", "ccdddddddd"}, + {"cceeeeeeee", "fffffff"}, + {"xxxxxxx", "yyyyyyyy"}}; + { + config::enable_segments_key_bounds_truncation = false; + TabletReader::ReaderParams read_params = create_reader_params(tablet_schema, data); + vectorized::BlockReader block_reader; + EXPECT_FALSE(block_reader._rowsets_not_mono_asc_disjoint(read_params)); + } + + { + config::enable_segments_key_bounds_truncation = true; + config::segments_key_bounds_truncation_threshold = 6; + TabletReader::ReaderParams read_params = create_reader_params(tablet_schema, data); + vectorized::BlockReader block_reader; + EXPECT_FALSE(block_reader._rowsets_not_mono_asc_disjoint(read_params)); + } + + { + config::enable_segments_key_bounds_truncation = true; + config::segments_key_bounds_truncation_threshold = 3; + TabletReader::ReaderParams read_params = create_reader_params(tablet_schema, data); + vectorized::BlockReader block_reader; + // cannot determine whether rowset 2 and rowset 3 are mono ascending + EXPECT_TRUE(block_reader._rowsets_not_mono_asc_disjoint(read_params)); + } + } + + { + // all rowsets are truncated to same size + // keys are not mono ascending before truncation + std::vector<std::vector<std::string>> data {{"aaaaaaaaa", "bbbbb"}, + {"bbbbb", "cccccccc"}, + {"cccccccc", "xxxxxxx"}, + {"xxxxxxx", "yyyyyyyy"}}; + { + config::enable_segments_key_bounds_truncation = false; + TabletReader::ReaderParams read_params = create_reader_params(tablet_schema, data); + vectorized::BlockReader block_reader; + EXPECT_TRUE(block_reader._rowsets_not_mono_asc_disjoint(read_params)); + } + + { + config::enable_segments_key_bounds_truncation = true; + config::segments_key_bounds_truncation_threshold = 3; + TabletReader::ReaderParams read_params = create_reader_params(tablet_schema, data); + vectorized::BlockReader 
block_reader; + EXPECT_TRUE(block_reader._rowsets_not_mono_asc_disjoint(read_params)); + } + } + + { + // some rowsets are truncated, some are not + std::vector<std::vector<std::string>> data {{"aaaaaaaaa", "bbbbbbccccccc"}, + {"bbbbbbddddddd", "dddddd"}}; + { + TabletReader::ReaderParams read_params = + create_reader_params(tablet_schema, data, {-1, 9}); + vectorized::BlockReader block_reader; + EXPECT_FALSE(block_reader._rowsets_not_mono_asc_disjoint(read_params)); + } + + { + TabletReader::ReaderParams read_params = + create_reader_params(tablet_schema, data, {-1, 4}); + vectorized::BlockReader block_reader; + EXPECT_TRUE(block_reader._rowsets_not_mono_asc_disjoint(read_params)); + } + + { + TabletReader::ReaderParams read_params = + create_reader_params(tablet_schema, data, {9, -1}); + vectorized::BlockReader block_reader; + EXPECT_FALSE(block_reader._rowsets_not_mono_asc_disjoint(read_params)); + } + + { + TabletReader::ReaderParams read_params = + create_reader_params(tablet_schema, data, {4, -1}); + vectorized::BlockReader block_reader; + EXPECT_TRUE(block_reader._rowsets_not_mono_asc_disjoint(read_params)); + } + } + + { + // some rowsets are truncated, some are not, truncated lengths may be different + { + std::vector<std::vector<std::string>> data {{"aaaaaaaaa", "bbbbbbbb"}, + {"ccccccccc", "dddddd"}, + {"eeeeeee", "ffffffggggg"}, + {"ffffffhhhhhh", "hhhhhhh"}, + {"iiiiiiii", "jjjjjjjjj"}}; + TabletReader::ReaderParams read_params = + create_reader_params(tablet_schema, data, {4, 5, 4, -1, 6}); + vectorized::BlockReader block_reader; + EXPECT_TRUE(block_reader._rowsets_not_mono_asc_disjoint(read_params)); + } + { + std::vector<std::vector<std::string>> data {{"aaaaaaaaa", "bbbbbbbb"}, + {"ccccccccc", "dddddd"}, + {"eeeeeee", "ffffffggggg"}, + {"ffffffhhhhhh", "hhhhhhh"}, + {"iiiiiiii", "jjjjjjjjj"}}; + TabletReader::ReaderParams read_params = + create_reader_params(tablet_schema, data, {4, 5, 8, -1, 6}); + vectorized::BlockReader block_reader; + 
EXPECT_FALSE(block_reader._rowsets_not_mono_asc_disjoint(read_params)); + } + + { + std::vector<std::vector<std::string>> data {{"aaaaaaaaa", "bbbbbbbb"}, + {"ccccccccc", "dddddd"}, + {"eeeeeee", "ffffffggggg"}, + {"ffffffhhhhhh", "hhhhhhh"}, + {"iiiiiiii", "jjjjjjjjj"}}; + TabletReader::ReaderParams read_params = + create_reader_params(tablet_schema, data, {4, 5, -1, 4, 6}); + vectorized::BlockReader block_reader; + EXPECT_TRUE(block_reader._rowsets_not_mono_asc_disjoint(read_params)); + } + { + std::vector<std::vector<std::string>> data {{"aaaaaaaaa", "bbbbbbbb"}, + {"ccccccccc", "dddddd"}, + {"eeeeeee", "ffffffggggg"}, + {"ffffffhhhhhh", "hhhhhhh"}, + {"iiiiiiii", "jjjjjjjjj"}}; + TabletReader::ReaderParams read_params = + create_reader_params(tablet_schema, data, {4, 5, -1, 8, 6}); + vectorized::BlockReader block_reader; + EXPECT_FALSE(block_reader._rowsets_not_mono_asc_disjoint(read_params)); + } + + { + std::vector<std::vector<std::string>> data {{"aaaaaaaaa", "bbbbbbbb"}, + {"ccccccccc", "dddddd"}, + {"eeeeeee", "ffffffggggg"}, + {"ffffffhhhhhh", "hhhhhhh"}, + {"iiiiiiii", "jjjjjjjjj"}}; + TabletReader::ReaderParams read_params = + create_reader_params(tablet_schema, data, {4, 5, 8, 4, 6}); + vectorized::BlockReader block_reader; + EXPECT_TRUE(block_reader._rowsets_not_mono_asc_disjoint(read_params)); + } + { + std::vector<std::vector<std::string>> data {{"aaaaaaaaa", "bbbbbbbb"}, + {"ccccccccc", "dddddd"}, + {"eeeeeee", "ffffffggggg"}, + {"ffffffhhhhhh", "hhhhhhh"}, + {"iiiiiiii", "jjjjjjjjj"}}; + TabletReader::ReaderParams read_params = + create_reader_params(tablet_schema, data, {4, 5, 4, 8, 6}); + vectorized::BlockReader block_reader; + EXPECT_TRUE(block_reader._rowsets_not_mono_asc_disjoint(read_params)); + } + { + std::vector<std::vector<std::string>> data {{"aaaaaaaaa", "bbbbbbbb"}, + {"ccccccccc", "dddddd"}, + {"eeeeeee", "ffffffggggg"}, + {"ffffffhhhhhh", "hhhhhhh"}, + {"iiiiiiii", "jjjjjjjjj"}}; + TabletReader::ReaderParams read_params = + 
create_reader_params(tablet_schema, data, {4, 5, 8, 9, 6}); + vectorized::BlockReader block_reader; + EXPECT_FALSE(block_reader._rowsets_not_mono_asc_disjoint(read_params)); + } + { + std::vector<std::vector<std::string>> data {{"aaaaaaaaa", "bbbbbbbb"}, + {"ccccccccc", "dddddd"}, + {"eeeeeee", "ffffffggggg"}, + {"ffffffhhhhhh", "hhhhhhh"}, + {"iiiiiiii", "jjjjjjjjj"}}; + TabletReader::ReaderParams read_params = + create_reader_params(tablet_schema, data, {4, 5, 3, 4, 6}); + vectorized::BlockReader block_reader; + EXPECT_TRUE(block_reader._rowsets_not_mono_asc_disjoint(read_params)); + } + } +} + +TEST_F(SegmentsKeyBoundsTruncationTest, OrderedCompactionTest) { Review Comment: warning: function 'TEST_F' exceeds recommended size/complexity thresholds [readability-function-size] ```cpp TEST_F(SegmentsKeyBoundsTruncationTest, OrderedCompactionTest) { ^ ``` <details> <summary>Additional context</summary> **be/test/olap/segments_key_bounds_truncation_test.cpp:639:** 136 lines including whitespace and comments (threshold 80) ```cpp TEST_F(SegmentsKeyBoundsTruncationTest, OrderedCompactionTest) { ^ ``` </details> -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org