github-actions[bot] commented on code in PR #45287: URL: https://github.com/apache/doris/pull/45287#discussion_r1889560818
########## be/test/olap/segments_key_bounds_truncation_test.cpp: ########## @@ -0,0 +1,588 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gen_cpp/olap_file.pb.h> +#include <gen_cpp/segment_v2.pb.h> +#include <gtest/gtest.h> + +#include <algorithm> +#include <memory> +#include <random> +#include <string> +#include <vector> + +#include "common/config.h" +#include "io/fs/local_file_system.h" +#include "olap/rowset/rowset_factory.h" +#include "olap/rowset/segment_v2/segment.h" +#include "olap/rowset/segment_v2/segment_writer.h" +#include "olap/storage_engine.h" +#include "olap/tablet_meta.h" +#include "olap/tablet_reader.h" +#include "olap/tablet_schema.h" +#include "runtime/exec_env.h" +#include "util/key_util.h" +#include "vec/olap/block_reader.h" + +namespace doris { +static std::string kSegmentDir = "./ut_dir/segments_key_bounds_truncation_test"; + +class SegmentsKeyBoundsTruncationTest : public testing::Test { +private: + StorageEngine* engine_ref = nullptr; + +public: + void SetUp() override { + auto st = io::global_local_filesystem()->delete_directory(kSegmentDir); + ASSERT_TRUE(st.ok()) << st; + st = io::global_local_filesystem()->create_directory(kSegmentDir); + ASSERT_TRUE(st.ok()) << st; + 
doris::EngineOptions options; + auto engine = std::make_unique<StorageEngine>(options); + engine_ref = engine.get(); + ExecEnv::GetInstance()->set_storage_engine(std::move(engine)); + } + + void TearDown() override { + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(kSegmentDir).ok()); + engine_ref = nullptr; + ExecEnv::GetInstance()->set_storage_engine(nullptr); + } + + TabletSchemaSPtr create_schema(int varchar_length) { + TabletSchemaSPtr tablet_schema = std::make_shared<TabletSchema>(); + TabletSchemaPB tablet_schema_pb; + tablet_schema_pb.set_keys_type(DUP_KEYS); + tablet_schema_pb.set_num_short_key_columns(1); + tablet_schema_pb.set_num_rows_per_row_block(1024); + tablet_schema_pb.set_compress_kind(COMPRESS_NONE); + tablet_schema_pb.set_next_column_unique_id(4); + + ColumnPB* column_1 = tablet_schema_pb.add_column(); + column_1->set_unique_id(1); + column_1->set_name("k1"); + column_1->set_type("VARCHAR"); + column_1->set_is_key(true); + column_1->set_length(varchar_length); + column_1->set_index_length(36); + column_1->set_is_nullable(false); + column_1->set_is_bf_column(false); + + ColumnPB* column_2 = tablet_schema_pb.add_column(); + column_2->set_unique_id(2); + column_2->set_name("c1"); + column_2->set_type("INT"); + column_2->set_length(4); + column_2->set_index_length(4); + column_2->set_is_nullable(true); + column_2->set_is_key(false); + column_2->set_is_nullable(true); + column_2->set_is_bf_column(false); + + tablet_schema->init_from_pb(tablet_schema_pb); + return tablet_schema; + } + + RowsetWriterContext create_rowset_writer_context(TabletSchemaSPtr tablet_schema, + const SegmentsOverlapPB& overlap, + uint32_t max_rows_per_segment, + Version version) { + static int64_t inc_id = 123000000; + RowsetWriterContext rowset_writer_context; + RowsetId rowset_id; + rowset_id.init(inc_id++); + rowset_writer_context.rowset_id = rowset_id; + rowset_writer_context.rowset_type = BETA_ROWSET; + rowset_writer_context.rowset_state = VISIBLE; + 
rowset_writer_context.tablet_schema = tablet_schema; + rowset_writer_context.tablet_path = kSegmentDir; + rowset_writer_context.version = version; + rowset_writer_context.segments_overlap = overlap; + rowset_writer_context.max_rows_per_segment = max_rows_per_segment; + return rowset_writer_context; + } + + void create_and_init_rowset_reader(Rowset* rowset, RowsetReaderContext& context, + RowsetReaderSharedPtr* result) { + auto s = rowset->create_reader(result); + EXPECT_TRUE(s.ok()); + EXPECT_TRUE(*result != nullptr); + + s = (*result)->init(&context); + EXPECT_TRUE(s.ok()); + } + + std::vector<vectorized::Block> generate_blocks( + TabletSchemaSPtr tablet_schema, const std::vector<std::vector<std::string>>& data) { + std::vector<vectorized::Block> ret; + int const_value = 999; + for (const auto& segment_rows : data) { + vectorized::Block block = tablet_schema->create_block(); + auto columns = block.mutate_columns(); + for (const auto& row : segment_rows) { + columns[0]->insert_data(row.data(), row.size()); + columns[1]->insert_data(reinterpret_cast<const char*>(&const_value), + sizeof(const_value)); + } + ret.emplace_back(std::move(block)); + } + return ret; + } + + std::vector<std::vector<std::string>> get_expected_key_bounds( + const std::vector<std::vector<std::string>>& data) { + std::vector<std::vector<std::string>> ret; + for (const auto& rows : data) { + auto& cur = ret.emplace_back(); + auto min_key = rows.front(); + auto max_key = rows.front(); + for (const auto& row : rows) { + if (row < min_key) { + min_key = row; + } + if (row > max_key) { + max_key = row; + } + } + + // segments key bounds have marker + min_key = std::string {KEY_NORMAL_MARKER} + min_key; + max_key = std::string {KEY_NORMAL_MARKER} + max_key; + + cur.emplace_back(do_trunacte(min_key)); + cur.emplace_back(do_trunacte(max_key)); + } + return ret; + } + + RowsetSharedPtr create_rowset(TabletSchemaSPtr tablet_schema, SegmentsOverlapPB overlap, + const std::vector<vectorized::Block> blocks, 
int64_t version, + bool is_vertical) { + auto writer_context = create_rowset_writer_context(tablet_schema, overlap, UINT32_MAX, + {version, version}); + auto res = RowsetFactory::create_rowset_writer(*engine_ref, writer_context, is_vertical); + EXPECT_TRUE(res.has_value()) << res.error(); + auto rowset_writer = std::move(res).value(); + + uint32_t num_rows = 0; + for (const auto& block : blocks) { + num_rows += block.rows(); + EXPECT_TRUE(rowset_writer->add_block(&block).ok()); + EXPECT_TRUE(rowset_writer->flush().ok()); + } + + RowsetSharedPtr rowset; + EXPECT_EQ(Status::OK(), rowset_writer->build(rowset)); + EXPECT_EQ(blocks.size(), rowset->rowset_meta()->num_segments()); + EXPECT_EQ(num_rows, rowset->rowset_meta()->num_rows()); + return rowset; + } + + std::string do_trunacte(std::string key) { + if (segments_key_bounds_truncation_enabled()) { + auto threshold = config::segments_key_bounds_truncation_threshold; + if (key.size() > threshold) { + key.resize(threshold); + } + } + return key; + } + + bool segments_key_bounds_truncation_enabled() { + return (config::enable_segments_key_bounds_truncation && + config::segments_key_bounds_truncation_threshold > 0); + } + + void check_key_bounds(const std::vector<std::vector<std::string>>& data, + const std::vector<KeyBoundsPB>& segments_key_bounds) { + // 1. check size + for (const auto& segments_key_bound : segments_key_bounds) { + const auto& min_key = segments_key_bound.min_key(); + const auto& max_key = segments_key_bound.max_key(); + + if (segments_key_bounds_truncation_enabled()) { + EXPECT_LE(min_key.size(), config::segments_key_bounds_truncation_threshold); + EXPECT_LE(max_key.size(), config::segments_key_bounds_truncation_threshold); + } + } + + // 2. 
check content + auto expected_key_bounds = get_expected_key_bounds(data); + for (std::size_t i = 0; i < expected_key_bounds.size(); i++) { + const auto& min_key = segments_key_bounds[i].min_key(); + const auto& max_key = segments_key_bounds[i].max_key(); + + EXPECT_EQ(min_key, expected_key_bounds[i][0]); + EXPECT_EQ(max_key, expected_key_bounds[i][1]); + std::cout << fmt::format("min_key={}, size={}\nmax_key={}, size={}\n", + hexdump(min_key.data(), min_key.size()), min_key.size(), + hexdump(max_key.data(), max_key.size()), max_key.size()); + } + } + + TabletReader::ReaderParams create_reader_params( + TabletSchemaSPtr tablet_schema, const std::vector<std::vector<std::string>>& data, + const std::vector<int64_t>& truncate_lengths = {}) { + TabletReader::ReaderParams reader_params; + std::vector<RowsetSharedPtr> rowsets; + std::vector<RowSetSplits> rs_splits; + for (size_t i {0}; i < data.size(); i++) { + const auto rows = data[i]; + if (!truncate_lengths.empty()) { + config::enable_segments_key_bounds_truncation = true; + config::segments_key_bounds_truncation_threshold = truncate_lengths[i]; + } + std::vector<std::vector<std::string>> rowset_data {rows}; + auto blocks = generate_blocks(tablet_schema, rowset_data); + RowsetSharedPtr rowset = create_rowset(tablet_schema, NONOVERLAPPING, blocks, 3, false); + + std::vector<KeyBoundsPB> segments_key_bounds; + rowset->rowset_meta()->get_segments_key_bounds(&segments_key_bounds); + for (const auto& segments_key_bound : segments_key_bounds) { + const auto& min_key = segments_key_bound.min_key(); + const auto& max_key = segments_key_bound.max_key(); + + std::cout << fmt::format( + "\n==== rowset_id={}, segment_key_bounds_truncated={} ====\nmin_key={}, " + "size={}\nmax_key={}, size={}\n", + rowset->rowset_id().to_string(), rowset->is_segments_key_bounds_truncated(), + hexdump(min_key.data(), min_key.size()), min_key.size(), + hexdump(max_key.data(), max_key.size()), max_key.size()); + } + + rowsets.push_back(rowset); + 
RowsetReaderSharedPtr rs_reader; + EXPECT_TRUE(rowset->create_reader(&rs_reader)); + RowSetSplits rs_split; + rs_split.rs_reader = rs_reader; + rs_splits.emplace_back(rs_split); + } + for (std::size_t i {0}; i < truncate_lengths.size(); i++) { + EXPECT_EQ((truncate_lengths[i] > 0), rowsets[i]->is_segments_key_bounds_truncated()); + } + reader_params.rs_splits = std::move(rs_splits); + return reader_params; + } +}; + +TEST_F(SegmentsKeyBoundsTruncationTest, CompareFuncTest) { + // test `Slice::origin_is_strictly_less_than` + // enumerating all possible combinations + // this test is reduntant, n = 3 is enough + constexpr int n = 8; + std::vector<std::string> datas; + for (int l = 1; l <= n; l++) { + for (int x = 0; x < (1 << l); x++) { + datas.emplace_back(fmt::format("{:0{width}b}", x, fmt::arg("width", l))); + } + } + std::cout << "datas.size()=" << datas.size() << "\n"; + + int count1 {0}, count2 {0}, total {0}; + for (size_t i = 0; i < datas.size(); i++) { + for (size_t j = 0; j < datas.size(); j++) { + Slice X {datas[i]}; + Slice Y {datas[j]}; + for (int l1 = 0; l1 <= n; l1++) { + bool X_is_truncated = (l1 != 0); + Slice a {X}; + if (X_is_truncated && X.get_size() >= l1) { + a.truncate(l1); + } + for (int l2 = 0; l2 <= n; l2++) { + bool Y_is_truncated = (l2 != 0); + Slice b {Y}; + if (Y_is_truncated && Y.get_size() >= l2) { + b.truncate(l2); + } + + bool res1 = Slice::origin_is_strictly_less_than(a, X_is_truncated, b, + Y_is_truncated); + bool res2 = (X.compare(Y) < 0); + ++total; + if (res1 && res2) { + ++count1; + } + if (res2) { + ++count2; + } + EXPECT_FALSE(res1 && !res2) << fmt::format( + "X={}, a={}, l1={}, Y={}, b={}, l2={}, res1={}, res2={}", X.to_string(), + a.to_string(), l1, Y.to_string(), b.to_string(), l2, res1, res2); + } + } + } + } + std::cout << fmt::format("count1={}, count2={}, count1/count2={}, total={}\n", count1, count2, + double(count1) / count2, total); +} + +TEST_F(SegmentsKeyBoundsTruncationTest, BasicTruncationTest) { + { + // 1. 
don't do segments key bounds truncation when the config is off + config::enable_segments_key_bounds_truncation = false; + config::segments_key_bounds_truncation_threshold = 36; + + auto tablet_schema = create_schema(100); + std::vector<std::vector<std::string>> data {{std::string(2, 'x'), std::string(3, 'y')}, + {std::string(4, 'a'), std::string(15, 'b')}, + {std::string(18, 'c'), std::string(5, 'z')}, + {std::string(20, '0'), std::string(22, '1')}}; + auto blocks = generate_blocks(tablet_schema, data); + RowsetSharedPtr rowset = create_rowset(tablet_schema, NONOVERLAPPING, blocks, 2, false); + + auto rowset_meta = rowset->rowset_meta(); + EXPECT_EQ(false, rowset_meta->is_segments_key_bounds_truncated()); + std::vector<KeyBoundsPB> segments_key_bounds; + rowset_meta->get_segments_key_bounds(&segments_key_bounds); + EXPECT_EQ(segments_key_bounds.size(), data.size()); + check_key_bounds(data, segments_key_bounds); + } + + { + // 2. do segments key bounds truncation when the config is on + config::enable_segments_key_bounds_truncation = true; + config::segments_key_bounds_truncation_threshold = 10; + + auto tablet_schema = create_schema(100); + std::vector<std::vector<std::string>> data {{std::string(2, 'x'), std::string(3, 'y')}, + {std::string(4, 'a'), std::string(15, 'b')}, + {std::string(18, 'c'), std::string(5, 'z')}, + {std::string(20, '0'), std::string(22, '1')}}; + auto blocks = generate_blocks(tablet_schema, data); + RowsetSharedPtr rowset = create_rowset(tablet_schema, NONOVERLAPPING, blocks, 2, false); + + auto rowset_meta = rowset->rowset_meta(); + EXPECT_EQ(true, rowset_meta->is_segments_key_bounds_truncated()); + std::vector<KeyBoundsPB> segments_key_bounds; + rowset_meta->get_segments_key_bounds(&segments_key_bounds); + EXPECT_EQ(segments_key_bounds.size(), data.size()); + check_key_bounds(data, segments_key_bounds); + } +} + +TEST_F(SegmentsKeyBoundsTruncationTest, BlockReaderJudgeFuncTest) { Review Comment: warning: function 'TEST_F' exceeds 
recommended size/complexity thresholds [readability-function-size]

```cpp
TEST_F(SegmentsKeyBoundsTruncationTest, BlockReaderJudgeFuncTest) {
^
```

<details>
<summary>Additional context</summary>

**be/test/olap/segments_key_bounds_truncation_test.cpp:377:** 209 lines including whitespace and comments (threshold 80)

```cpp
TEST_F(SegmentsKeyBoundsTruncationTest, BlockReaderJudgeFuncTest) {
^
```

</details>

--
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the URL above to go to the specific comment.

To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org

For queries about this service, please contact Infrastructure at: us...@infra.apache.org

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org