This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
commit 9787b7fa816829e88a153acc6a5aaad1a28c9da8 Author: HappenLee <happen...@hotmail.com> AuthorDate: Tue Aug 1 12:24:00 2023 +0800 [Opt](exec) opt the performance of date parquet convert by date dict (#22384) before: mysql> select count(l_commitdate) from lineitem; +---------------------+ | count(l_commitdate) | +---------------------+ | 600037902 | +---------------------+ 1 row in set (0.86 sec) after: mysql> select count(l_commitdate) from lineitem; +---------------------+ | count(l_commitdate) | +---------------------+ | 600037902 | +---------------------+ 1 row in set (0.36 sec) --- be/src/service/doris_main.cpp | 1 + be/src/vec/exec/format/parquet/decoder.cpp | 2 +- .../vec/exec/format/parquet/fix_length_dict_decoder.hpp | 10 +++++++--- .../vec/exec/format/parquet/fix_length_plain_decoder.cpp | 10 ++++++++-- be/src/vec/runtime/vdatetime_value.cpp | 15 +++++++++++++++ be/src/vec/runtime/vdatetime_value.h | 3 +++ be/test/vec/exec/parquet/parquet_thrift_test.cpp | 1 + 7 files changed, 36 insertions(+), 6 deletions(-) diff --git a/be/src/service/doris_main.cpp b/be/src/service/doris_main.cpp index 57dec22820..6271b34564 100644 --- a/be/src/service/doris_main.cpp +++ b/be/src/service/doris_main.cpp @@ -433,6 +433,7 @@ int main(int argc, char** argv) { auto exec_env = doris::ExecEnv::GetInstance(); doris::ExecEnv::init(exec_env, paths); doris::TabletSchemaCache::create_global_schema_cache(); + doris::vectorized::init_date_day_offset_dict(); // init s3 write buffer pool doris::io::S3FileBufferPool* s3_buffer_pool = doris::io::S3FileBufferPool::GetInstance(); diff --git a/be/src/vec/exec/format/parquet/decoder.cpp b/be/src/vec/exec/format/parquet/decoder.cpp index 539fc04a10..bf8ef0b233 100644 --- a/be/src/vec/exec/format/parquet/decoder.cpp +++ b/be/src/vec/exec/format/parquet/decoder.cpp @@ -181,7 +181,7 @@ void Decoder::init(FieldSchema* field_schema, cctz::time_zone* ctz) { if (_decode_params->ctz) { VecDateTimeValue t; t.from_unixtime(0, *_decode_params->ctz); 
- _decode_params->offset_days = doris::calc_daynr(t.year(), t.month(), t.day()); + _decode_params->offset_days = t.day() == 31 ? 0 : 1; } } } // namespace doris::vectorized diff --git a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp index 817b5e7f96..bb95fb426f 100644 --- a/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp +++ b/be/src/vec/exec/format/parquet/fix_length_dict_decoder.hpp @@ -216,7 +216,7 @@ protected: size_t data_index = column_data.size(); column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); size_t dict_index = 0; - + auto* __restrict date_day_offset_dict = get_date_day_offset_dict(); ColumnSelectVector::DataReadType read_type; while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) { switch (read_type) { @@ -224,11 +224,15 @@ protected: for (size_t i = 0; i < run_length; ++i) { int64_t date_value = _dict_items[_indexes[dict_index++]] + _decode_params->offset_days; - auto& v = reinterpret_cast<CppType&>(column_data[data_index++]); - v.get_date_from_daynr(date_value); + DCHECK_LT(date_value, 25500); if constexpr (std::is_same_v<CppType, VecDateTimeValue>) { + auto& v = reinterpret_cast<CppType&>(column_data[data_index++]); + v.create_from_date_v2(date_day_offset_dict[date_value], TIME_DATE); // we should cast to date if using date v1. 
v.cast_to_date(); + } else { + reinterpret_cast<CppType&>(column_data[data_index++]) = + date_day_offset_dict[date_value]; } } break; diff --git a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp index 940e70db79..3fe58e6a5d 100644 --- a/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp +++ b/be/src/vec/exec/format/parquet/fix_length_plain_decoder.cpp @@ -248,6 +248,7 @@ Status FixLengthPlainDecoder::_decode_date(MutableColumnPtr& doris_column, size_t data_index = column_data.size(); column_data.resize(data_index + select_vector.num_values() - select_vector.num_filtered()); ColumnSelectVector::DataReadType read_type; + auto* __restrict date_day_offset_dict = get_date_day_offset_dict(); while (size_t run_length = select_vector.get_next_run<has_filter>(&read_type)) { switch (read_type) { @@ -256,11 +257,16 @@ Status FixLengthPlainDecoder::_decode_date(MutableColumnPtr& doris_column, char* buf_start = _data->data + _offset; int64_t date_value = static_cast<int64_t>(*reinterpret_cast<int32_t*>(buf_start)) + _decode_params->offset_days; - auto& v = reinterpret_cast<CppType&>(column_data[data_index++]); - v.get_date_from_daynr(date_value); + DCHECK_LT(date_value, 25500); + DCHECK_GE(date_value, 0); if constexpr (std::is_same_v<CppType, VecDateTimeValue>) { + auto& v = reinterpret_cast<CppType&>(column_data[data_index++]); + v.create_from_date_v2(date_day_offset_dict[date_value], TIME_DATE); // we should cast to date if using date v1. 
v.cast_to_date(); + } else { + reinterpret_cast<CppType&>(column_data[data_index++]) = + date_day_offset_dict[date_value]; } _offset += _type_length; } diff --git a/be/src/vec/runtime/vdatetime_value.cpp b/be/src/vec/runtime/vdatetime_value.cpp index 27926d1ee6..9675ab63fa 100644 --- a/be/src/vec/runtime/vdatetime_value.cpp +++ b/be/src/vec/runtime/vdatetime_value.cpp @@ -2613,6 +2613,21 @@ typename DateV2Value<T>::underlying_value DateV2Value<T>::to_date_int_val() cons return int_val_; } +static std::array<DateV2Value<DateV2ValueType>, 25500> DATE_DAY_OFFSET_DICT; + +void init_date_day_offset_dict() { + DateV2Value<DateV2ValueType> d; + d.set_time(1969, 12, 31, 0, 0, 0, 0); + for (int i = 0; i < DATE_DAY_OFFSET_DICT.size(); ++i) { + DATE_DAY_OFFSET_DICT[i] = d; + d += 1; + } +} + +DateV2Value<DateV2ValueType>* get_date_day_offset_dict() { + return DATE_DAY_OFFSET_DICT.data(); +} + template <typename T> uint32_t DateV2Value<T>::set_date_uint32(uint32_t int_val) { union DateV2UInt32Union { diff --git a/be/src/vec/runtime/vdatetime_value.h b/be/src/vec/runtime/vdatetime_value.h index b33fe1e30e..e23aee6a81 100644 --- a/be/src/vec/runtime/vdatetime_value.h +++ b/be/src/vec/runtime/vdatetime_value.h @@ -1462,6 +1462,9 @@ class DataTypeDateTime; class DataTypeDateV2; class DataTypeDateTimeV2; +[[maybe_unused]] void init_date_day_offset_dict(); +[[maybe_unused]] DateV2Value<DateV2ValueType>* get_date_day_offset_dict(); + template <typename T> struct DateTraits {}; diff --git a/be/test/vec/exec/parquet/parquet_thrift_test.cpp b/be/test/vec/exec/parquet/parquet_thrift_test.cpp index 9b36b55464..bb77b32e58 100644 --- a/be/test/vec/exec/parquet/parquet_thrift_test.cpp +++ b/be/test/vec/exec/parquet/parquet_thrift_test.cpp @@ -437,6 +437,7 @@ static void read_parquet_data_and_check(const std::string& parquet_file, } TEST_F(ParquetThriftReaderTest, type_decoder) { + init_date_day_offset_dict(); 
read_parquet_data_and_check("./be/test/exec/test_data/parquet_scanner/type-decoder.parquet", "./be/test/exec/test_data/parquet_scanner/type-decoder.txt", 10); } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org