This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new e25717458e9  [opt](catalog) add some profile for parquet reader and change meta cache config (#37040) (#37146)

e25717458e9 is described below

commit e25717458e97074b27d1a43c6d09b97a2d4e56df
Author: Mingyu Chen <morning...@163.com>
AuthorDate: Tue Jul 2 20:58:43 2024 +0800

    [opt](catalog) add some profile for parquet reader and change meta cache config (#37040) (#37146)

    bp #37040
---
 be/src/common/config.cpp                           |  5 +++++
 be/src/common/config.h                             |  2 ++
 be/src/vec/exec/format/parquet/vparquet_reader.cpp | 23 +++++++++++++++++-----
 be/src/vec/exec/format/parquet/vparquet_reader.h   |  4 ++++
 .../main/java/org/apache/doris/common/Config.java  | 13 ++++++++----
 .../doris/datasource/hive/HiveMetaStoreCache.java  |  6 +++---
 6 files changed, 41 insertions(+), 12 deletions(-)

diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index 563e4750165..fe811165c17 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1250,6 +1250,11 @@ DEFINE_Int64(min_row_group_size, "134217728");
 // The time out milliseconds for remote fetch schema RPC, default 60s
 DEFINE_mInt64(fetch_remote_schema_rpc_timeout_ms, "60000");
 
+// If set to false, the parquet reader will not use page index to filter data.
+// This is only for debug purpose, in case sometimes the page index
+// filter wrong data.
+DEFINE_mBool(enable_parquet_page_index, "true");
+
 // clang-format off
 #ifdef BE_TEST
 // test s3
diff --git a/be/src/common/config.h b/be/src/common/config.h
index 21325a0f011..891a8333148 100644
--- a/be/src/common/config.h
+++ b/be/src/common/config.h
@@ -1334,6 +1334,8 @@ DECLARE_mInt64(fetch_remote_schema_rpc_timeout_ms);
 // The minimum row group size when exporting Parquet files.
 DECLARE_Int64(min_row_group_size);
 
+DECLARE_mBool(enable_parquet_page_index);
+
 #ifdef BE_TEST
 // test s3
 DECLARE_String(test_s3_resource);
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
index f99786dc6e2..f3b9f2ad55c 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
@@ -148,6 +148,10 @@ void ParquetReader::_init_profile() {
             ADD_CHILD_COUNTER_WITH_LEVEL(_profile, "FileNum", TUnit::UNIT, parquet_profile, 1);
     _parquet_profile.page_index_filter_time =
             ADD_CHILD_TIMER_WITH_LEVEL(_profile, "PageIndexFilterTime", parquet_profile, 1);
+    _parquet_profile.read_page_index_time =
+            ADD_CHILD_TIMER_WITH_LEVEL(_profile, "PageIndexReadTime", parquet_profile, 1);
+    _parquet_profile.parse_page_index_time =
+            ADD_CHILD_TIMER_WITH_LEVEL(_profile, "PageIndexParseTime", parquet_profile, 1);
     _parquet_profile.row_group_filter_time =
             ADD_CHILD_TIMER_WITH_LEVEL(_profile, "RowGroupFilterTime", parquet_profile, 1);
 
@@ -747,25 +751,32 @@ Status ParquetReader::_process_page_index(const tparquet::RowGroup& row_group,
         return Status::OK();
     }
     PageIndex page_index;
-    if (!_has_page_index(row_group.columns, page_index)) {
+    if (!config::enable_parquet_page_index || !_has_page_index(row_group.columns, page_index)) {
         read_whole_row_group();
         return Status::OK();
     }
     uint8_t col_index_buff[page_index._column_index_size];
     size_t bytes_read = 0;
     Slice result(col_index_buff, page_index._column_index_size);
-    RETURN_IF_ERROR(
-            _file_reader->read_at(page_index._column_index_start, result, &bytes_read, _io_ctx));
+    {
+        SCOPED_RAW_TIMER(&_statistics.read_page_index_time);
+        RETURN_IF_ERROR(_file_reader->read_at(page_index._column_index_start, result, &bytes_read,
+                                              _io_ctx));
+    }
     _column_statistics.read_bytes += bytes_read;
     auto& schema_desc = _file_metadata->schema();
     std::vector<RowRange> skipped_row_ranges;
     uint8_t off_index_buff[page_index._offset_index_size];
     Slice res(off_index_buff, page_index._offset_index_size);
-    RETURN_IF_ERROR(
-            _file_reader->read_at(page_index._offset_index_start, res, &bytes_read, _io_ctx));
+    {
+        SCOPED_RAW_TIMER(&_statistics.read_page_index_time);
+        RETURN_IF_ERROR(
+                _file_reader->read_at(page_index._offset_index_start, res, &bytes_read, _io_ctx));
+    }
     _column_statistics.read_bytes += bytes_read;
     // read twice: parse column index & parse offset index
     _column_statistics.meta_read_calls += 2;
+    SCOPED_RAW_TIMER(&_statistics.parse_page_index_time);
     for (auto& read_col : _read_columns) {
         auto conjunct_iter = _colname_to_value_range->find(read_col);
         if (_colname_to_value_range->end() == conjunct_iter) {
@@ -935,6 +946,8 @@ void ParquetReader::_collect_profile() {
     COUNTER_UPDATE(_parquet_profile.open_file_time, _statistics.open_file_time);
     COUNTER_UPDATE(_parquet_profile.open_file_num, _statistics.open_file_num);
     COUNTER_UPDATE(_parquet_profile.page_index_filter_time, _statistics.page_index_filter_time);
+    COUNTER_UPDATE(_parquet_profile.read_page_index_time, _statistics.read_page_index_time);
+    COUNTER_UPDATE(_parquet_profile.parse_page_index_time, _statistics.parse_page_index_time);
     COUNTER_UPDATE(_parquet_profile.row_group_filter_time, _statistics.row_group_filter_time);
 
     COUNTER_UPDATE(_parquet_profile.skip_page_header_num, _column_statistics.skip_page_header_num);
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.h b/be/src/vec/exec/format/parquet/vparquet_reader.h
index 033d519c5a1..52700aafb7f 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.h
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.h
@@ -89,6 +89,8 @@ public:
         int64_t open_file_num = 0;
         int64_t row_group_filter_time = 0;
         int64_t page_index_filter_time = 0;
+        int64_t read_page_index_time = 0;
+        int64_t parse_page_index_time = 0;
     };
 
     ParquetReader(RuntimeProfile* profile, const TFileScanRangeParams& params,
@@ -170,6 +172,8 @@ private:
         RuntimeProfile::Counter* open_file_num = nullptr;
         RuntimeProfile::Counter* row_group_filter_time = nullptr;
         RuntimeProfile::Counter* page_index_filter_time = nullptr;
+        RuntimeProfile::Counter* read_page_index_time = nullptr;
+        RuntimeProfile::Counter* parse_page_index_time = nullptr;
 
         RuntimeProfile::Counter* file_read_time = nullptr;
         RuntimeProfile::Counter* file_read_calls = nullptr;
diff --git a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
index b51ab170f02..26ef8852b45 100644
--- a/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
+++ b/fe/fe-common/src/main/java/org/apache/doris/common/Config.java
@@ -1944,16 +1944,21 @@ public class Config extends ConfigBase {
     @ConfField(mutable = false, masterOnly = false)
     public static long max_hive_partition_cache_num = 100000;
 
-    @ConfField(mutable = false, masterOnly = false, description = {"Hive表到分区名列表缓存的最大数量。",
-            "Max cache number of hive table to partition names list."})
+    @ConfField(mutable = false, masterOnly = false, description = {"Hive表名缓存的最大数量。",
+            "Max cache number of hive table name list."})
     public static long max_hive_table_cache_num = 1000;
 
+    @ConfField(mutable = false, masterOnly = false, description = {
+            "Hive分区表缓存的最大数量", "Max cache number of hive partition table"
+    })
+    public static long max_hive_partition_table_cache_num = 1000;
+
     @ConfField(mutable = false, masterOnly = false, description = {"获取Hive分区值时候的最大返回数量,-1代表没有限制。",
-            "Max number of hive partition values to return while list partitions, -1 means no limitation."})
+            "Max number of hive partition values to return while list partitions, -1 means no limitation."})
     public static short max_hive_list_partition_num = -1;
 
     @ConfField(mutable = false, masterOnly = false, description = {"远程文件系统缓存的最大数量",
-            "Max cache number of remote file system."})
+            "Max cache number of remote file system."})
     public static long max_remote_file_system_cache_num = 100;
 
     @ConfField(mutable = false, masterOnly = false, description = {"外表行数缓存最大数量",
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java
index f402d27cf6d..da90fadf835 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreCache.java
@@ -136,16 +136,16 @@ public class HiveMetaStoreCache {
      **/
     private void init() {
         CacheFactory partitionValuesCacheFactory = new CacheFactory(
-                OptionalLong.of(86400L),
+                OptionalLong.of(28800L),
                 OptionalLong.of(Config.external_cache_expire_time_minutes_after_access * 60L),
-                Config.max_hive_table_cache_num,
+                Config.max_hive_partition_table_cache_num,
                 false,
                 null);
         partitionValuesCache = partitionValuesCacheFactory.buildCache(key -> loadPartitionValues(key),
                 null, refreshExecutor);
 
         CacheFactory partitionCacheFactory = new CacheFactory(
-                OptionalLong.of(86400L),
+                OptionalLong.of(28800L),
                 OptionalLong.of(Config.external_cache_expire_time_minutes_after_access * 60L),
                 Config.max_hive_partition_cache_num,
                 false,

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org
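
A note on the new BE config: enable_parquet_page_index is declared with
DEFINE_mBool, so it is mutable at runtime. A minimal sketch of how one might
disable the page-index filter while debugging suspected mis-filtering; be.conf
is the standard BE config file, the host and port are placeholders, and the
update_config endpoint is assumed to be available on the BE webserver:

    # be.conf, takes effect on the next BE restart
    enable_parquet_page_index = false

    # or on a running BE, without a restart (assumed endpoint)
    curl -X POST "http://<be_host>:<be_webserver_port>/api/update_config?enable_parquet_page_index=false"

With the flag off, _process_page_index() takes the read_whole_row_group()
path and page-level filtering is skipped entirely.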
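The two new timers, PageIndexReadTime and PageIndexParseTime, are collected
with SCOPED_RAW_TIMER, which accumulates the elapsed nanoseconds of its
enclosing scope into a plain int64_t field of Statistics; _collect_profile()
then flushes those fields into the RuntimeProfile counters. A minimal sketch
of the underlying RAII pattern (illustrative only, not Doris's actual macro):

    #include <chrono>
    #include <cstdint>

    // On destruction, adds the elapsed nanoseconds of its scope to *target.
    class ScopedRawTimer {
    public:
        explicit ScopedRawTimer(int64_t* target)
                : _target(target), _start(std::chrono::steady_clock::now()) {}
        ~ScopedRawTimer() {
            *_target += std::chrono::duration_cast<std::chrono::nanoseconds>(
                                std::chrono::steady_clock::now() - _start)
                                .count();
        }

    private:
        int64_t* _target;
        std::chrono::steady_clock::time_point _start;
    };

This is also why the patch wraps each read_at() call in its own braces: the
timer's destructor fires when the block closes, so only the index read is
measured, not the parsing that follows it.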
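On the FE side, the first OptionalLong passed to CacheFactory is the
expire-after-write TTL in seconds (judging by the second argument, which
converts external_cache_expire_time_minutes_after_access from minutes to
seconds). The change lowers that TTL from 86400s (24 hours) to 28800s
(8 hours) for both the partition-values cache and the partition cache, so
cached Hive partition metadata goes stale for at most 8 hours instead of 24.
The partition-values cache is also now sized by its own knob,
max_hive_partition_table_cache_num, rather than sharing
max_hive_table_cache_num with the table-name cache.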