This is an automated email from the ASF dual-hosted git repository. airborne pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new 15f85e2cfb2 [fix] (bloom filter) Fix the bloom filter calculation for date and datetime (#43351) (#43622) 15f85e2cfb2 is described below commit 15f85e2cfb2212cfbd6df32eba880e7bcb12bf61 Author: Sun Chenyang <csun5...@gmail.com> AuthorDate: Tue Nov 12 10:56:55 2024 +0800 [fix] (bloom filter) Fix the bloom filter calculation for date and datetime (#43351) (#43622) pick from master #43351 --------- Co-authored-by: csun5285 <suncheny...@selectdb.com> --- be/src/olap/comparison_predicate.h | 13 ++- be/src/olap/tablet_meta.h | 5 + be/test/olap/date_bloom_filter_test.cpp | 195 ++++++++++++++++++++++++++++++++ 3 files changed, 212 insertions(+), 1 deletion(-) diff --git a/be/src/olap/comparison_predicate.h b/be/src/olap/comparison_predicate.h index 2e0c4db4ba0..7523e991291 100644 --- a/be/src/olap/comparison_predicate.h +++ b/be/src/olap/comparison_predicate.h @@ -202,12 +202,23 @@ public: return bf->test_bytes(_value.data, _value.size); } else { // DecimalV2 using decimal12_t in bloom filter, should convert value to decimal12_t - // Datev1/DatetimeV1 using VecDatetimeValue in bloom filter, NO need to convert. if constexpr (Type == PrimitiveType::TYPE_DECIMALV2) { decimal12_t decimal12_t_val(_value.int_value(), _value.frac_value()); return bf->test_bytes( const_cast<char*>(reinterpret_cast<const char*>(&decimal12_t_val)), sizeof(decimal12_t)); + // Datev1 using uint24_t in bloom filter + } else if constexpr (Type == PrimitiveType::TYPE_DATE) { + uint24_t date_value(_value.to_olap_date()); + return bf->test_bytes( + const_cast<char*>(reinterpret_cast<const char*>(&date_value)), + sizeof(uint24_t)); + // DatetimeV1 using int64_t in bloom filter + } else if constexpr (Type == PrimitiveType::TYPE_DATETIME) { + int64_t datetime_value(_value.to_olap_datetime()); + return bf->test_bytes( + const_cast<char*>(reinterpret_cast<const char*>(&datetime_value)), + sizeof(int64_t)); } else { return bf->test_bytes(const_cast<char*>(reinterpret_cast<const char*>(&_value)), sizeof(T)); diff --git a/be/src/olap/tablet_meta.h b/be/src/olap/tablet_meta.h index 77d5554aae2..3c36cad53ab 100644 --- a/be/src/olap/tablet_meta.h +++ b/be/src/olap/tablet_meta.h @@ -118,6 +118,11 @@ public: TabletMeta(const TabletMeta& tablet_meta); TabletMeta(TabletMeta&& tablet_meta) = delete; +// UT +#ifdef BE_TEST + TabletMeta(TabletSchemaSPtr tablet_schema) : _schema(tablet_schema) {} +#endif + // Function create_from_file is used to be compatible with previous tablet_meta. // Previous tablet_meta is a physical file in tablet dir, which is not stored in rocksdb. Status create_from_file(const std::string& file_path); diff --git a/be/test/olap/date_bloom_filter_test.cpp b/be/test/olap/date_bloom_filter_test.cpp new file mode 100644 index 00000000000..d3839f8c268 --- /dev/null +++ b/be/test/olap/date_bloom_filter_test.cpp @@ -0,0 +1,195 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include <gtest/gtest.h> + +#include "olap/comparison_predicate.h" +#include "olap/rowset/beta_rowset.h" +#include "olap/rowset/beta_rowset_writer.h" +#include "olap/rowset/rowset_factory.h" +#include "olap/rowset/segment_v2/bloom_filter_index_reader.h" +#include "olap/storage_engine.h" +#include "util/date_func.h" +#include "vec/runtime/vdatetime_value.h" + +namespace doris { + +using namespace doris::vectorized; + +constexpr static uint32_t MAX_PATH_LEN = 1024; +constexpr static std::string_view dest_dir = "./ut_dir/date_bloom_filter"; +static int64_t inc_id = 1000; + +class DateBloomFilterTest : public ::testing::Test { +protected: + void SetUp() override { + // absolute dir + char buffer[MAX_PATH_LEN]; + EXPECT_NE(getcwd(buffer, MAX_PATH_LEN), nullptr); + _curreent_dir = std::string(buffer); + _absolute_dir = _curreent_dir + std::string(dest_dir); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); + EXPECT_TRUE(io::global_local_filesystem()->create_directory(_absolute_dir).ok()); + + // storage engine + doris::EngineOptions options; + auto* engine = new StorageEngine(options); + _engine_ref = engine; + _data_dir = std::make_unique<DataDir>(_absolute_dir); + static_cast<void>(_data_dir->update_capacity()); + ExecEnv::GetInstance()->set_storage_engine(engine); + + // tablet_schema + TabletSchemaPB schema_pb; + schema_pb.set_keys_type(KeysType::DUP_KEYS); + + construct_column(schema_pb.add_column(), 0, "DATE", "date_column"); + construct_column(schema_pb.add_column(), 1, "DATETIME", "datetime_column"); + + _tablet_schema.reset(new TabletSchema); + _tablet_schema->init_from_pb(schema_pb); + + // tablet + TabletMetaSharedPtr tablet_meta(new TabletMeta(_tablet_schema)); + + _tablet.reset(new Tablet(*_engine_ref, tablet_meta, _data_dir.get())); + EXPECT_TRUE(_tablet->init().ok()); + } + void TearDown() override { + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_absolute_dir).ok()); + if (_engine_ref != nullptr) { + _engine_ref->stop(); + delete _engine_ref; + _engine_ref = nullptr; + ExecEnv::GetInstance()->set_storage_engine(nullptr); + } + } + + void construct_column(ColumnPB* column_pb, int32_t col_unique_id, + const std::string& column_type, const std::string& column_name) { + column_pb->set_unique_id(col_unique_id); + column_pb->set_name(column_name); + column_pb->set_type(column_type); + column_pb->set_is_key(true); + column_pb->set_is_nullable(true); + column_pb->set_is_bf_column(true); + } + + RowsetWriterContext rowset_writer_context() { + RowsetWriterContext context; + RowsetId rowset_id; + rowset_id.init(inc_id); + context.rowset_id = rowset_id; + context.rowset_type = BETA_ROWSET; + context.data_dir = _data_dir.get(); + context.rowset_state = VISIBLE; + context.tablet_schema = _tablet_schema; + context.rowset_dir = _tablet->tablet_path(); + context.version = Version(inc_id, inc_id); + context.max_rows_per_segment = 200; + inc_id++; + return context; + } + + DateBloomFilterTest() = default; + ~DateBloomFilterTest() override = default; + +private: + TabletSchemaSPtr _tablet_schema = nullptr; + StorageEngine* _engine_ref = nullptr; + std::unique_ptr<DataDir> _data_dir = nullptr; + TabletSharedPtr _tablet = nullptr; + std::string _absolute_dir; + std::string _curreent_dir; +}; + +TEST_F(DateBloomFilterTest, query_index_test) { + EXPECT_TRUE(io::global_local_filesystem()->delete_directory(_tablet->tablet_path()).ok()); + EXPECT_TRUE(io::global_local_filesystem()->create_directory(_tablet->tablet_path()).ok()); + + RowsetSharedPtr rowset; + std::unique_ptr<RowsetWriter> rowset_writer; + const auto& res = + RowsetFactory::create_rowset_writer(rowset_writer_context(), false, &rowset_writer); + + Block block = _tablet_schema->create_block(); + auto columns = block.mutate_columns(); + + auto date = timestamp_from_date("2024-11-08"); + auto datetime = timestamp_from_datetime("2024-11-08 09:00:00"); + uint24_t olap_date_value(date.to_olap_date()); + uint64_t olap_datetime_value(datetime.to_olap_datetime()); + columns[0]->insert_many_fix_len_data(reinterpret_cast<const char*>(&olap_date_value), 1); + columns[1]->insert_many_fix_len_data(reinterpret_cast<const char*>(&olap_datetime_value), 1); + + date = timestamp_from_date("2024-11-09"); + datetime = timestamp_from_datetime("2024-11-09 09:00:00"); + olap_date_value = date.to_olap_date(); + olap_datetime_value = datetime.to_olap_datetime(); + columns[0]->insert_many_fix_len_data(reinterpret_cast<const char*>(&olap_date_value), 1); + columns[1]->insert_many_fix_len_data(reinterpret_cast<const char*>(&olap_datetime_value), 1); + + EXPECT_TRUE(rowset_writer->add_block(&block).ok()); + EXPECT_TRUE(rowset_writer->flush().ok()); + EXPECT_TRUE(rowset_writer->build(rowset).ok()); + EXPECT_TRUE(_tablet->add_rowset(rowset).ok()); + + segment_v2::SegmentSharedPtr segment; + EXPECT_TRUE(((BetaRowset*)rowset.get())->load_segment(0, &segment).ok()); + auto st = segment->_create_column_readers(*(segment->_footer_pb)); + EXPECT_TRUE(st.ok()); + + // date + { + const auto& reader = segment->_column_readers[0]; + std::unique_ptr<BloomFilterIndexIterator> bf_iter; + EXPECT_TRUE(reader->_bloom_filter_index->load(true, true).ok()); + EXPECT_TRUE(reader->_bloom_filter_index->new_iterator(&bf_iter).ok()); + std::unique_ptr<BloomFilter> bf; + EXPECT_TRUE(bf_iter->read_bloom_filter(0, &bf).ok()); + auto test = [&](const std::string& query_string, bool result) { + auto date = timestamp_from_date(query_string); + std::unique_ptr<ComparisonPredicateBase<TYPE_DATE, PredicateType::EQ>> date_pred( + new ComparisonPredicateBase<TYPE_DATE, PredicateType::EQ>(0, date)); + EXPECT_EQ(date_pred->evaluate_and(bf.get()), result); + }; + test("2024-11-08", true); + test("2024-11-09", true); + test("2024-11-20", false); + } + + // datetime + { + const auto& reader = segment->_column_readers[1]; + std::unique_ptr<BloomFilterIndexIterator> bf_iter; + EXPECT_TRUE(reader->_bloom_filter_index->load(true, true).ok()); + EXPECT_TRUE(reader->_bloom_filter_index->new_iterator(&bf_iter).ok()); + std::unique_ptr<BloomFilter> bf; + EXPECT_TRUE(bf_iter->read_bloom_filter(0, &bf).ok()); + auto test = [&](const std::string& query_string, bool result) { + auto datetime = timestamp_from_datetime(query_string); + std::unique_ptr<ComparisonPredicateBase<TYPE_DATETIME, PredicateType::EQ>> date_pred( + new ComparisonPredicateBase<TYPE_DATETIME, PredicateType::EQ>(0, datetime)); + EXPECT_EQ(date_pred->evaluate_and(bf.get()), result); + }; + test("2024-11-08 09:00:00", true); + test("2024-11-09 09:00:00", true); + test("2024-11-20 09:00:00", false); + } +} +} // namespace doris --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org