This is an automated email from the ASF dual-hosted git repository.
morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new 706c716a51d branch-3.1: [fix](be) fix parquet file reader not updating
page index when processing it #52228 (#52783)
706c716a51d is described below
commit 706c716a51df16a155c7d2f95afd952ce6683657
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Sat Jul 5 20:23:44 2025 +0800
branch-3.1: [fix](be) fix parquet file reader not updating page index when
processing it #52228 (#52783)
Cherry-picked from #52228
Co-authored-by: SWEI <[email protected]>
Co-authored-by: zengsiwei <[email protected]>
Co-authored-by: suxiaogang223 <[email protected]>
---
be/src/common/config.cpp | 2 +-
be/src/vec/exec/format/parquet/vparquet_reader.cpp | 2 +-
.../parquet/small_2rowgroup.parquet | Bin 0 -> 13059 bytes
.../hive/test_hive_parquet_skip_page.out | Bin 31833 -> 31867 bytes
.../hive/test_hive_parquet_skip_page.groovy | 12 +++++++++++-
5 files changed, 13 insertions(+), 3 deletions(-)
diff --git a/be/src/common/config.cpp b/be/src/common/config.cpp
index 1c9ed17700a..c603d7d8e7c 100644
--- a/be/src/common/config.cpp
+++ b/be/src/common/config.cpp
@@ -1472,7 +1472,7 @@ DEFINE_mInt64(compaction_batch_size, "-1");
// If set to false, the parquet reader will not use the page index to filter data.
// This is only for debugging purposes, in case the page index sometimes
// filters data incorrectly.
-DEFINE_mBool(enable_parquet_page_index, "false");
+DEFINE_mBool(enable_parquet_page_index, "true");
DEFINE_mBool(ignore_not_found_file_in_external_table, "true");
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp
b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
index be149991759..a38031a668f 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
@@ -863,7 +863,7 @@ Status ParquetReader::_process_page_index(const
tparquet::RowGroup& row_group,
// use the union row range
skipped_row_ranges.emplace_back(skipped_row_range);
}
- _col_offsets.emplace(parquet_col_id, offset_index);
+ _col_offsets[parquet_col_id] = offset_index;
}
if (skipped_row_ranges.empty()) {
read_whole_row_group();
diff --git
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet/small_2rowgroup.parquet
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet/small_2rowgroup.parquet
new file mode 100644
index 00000000000..dcd05f5e28e
Binary files /dev/null and
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/parquet/small_2rowgroup.parquet
differ
diff --git
a/regression-test/data/external_table_p0/hive/test_hive_parquet_skip_page.out
b/regression-test/data/external_table_p0/hive/test_hive_parquet_skip_page.out
index 6c869dbc789..a8973479e42 100644
Binary files
a/regression-test/data/external_table_p0/hive/test_hive_parquet_skip_page.out
and
b/regression-test/data/external_table_p0/hive/test_hive_parquet_skip_page.out
differ
diff --git
a/regression-test/suites/external_table_p0/hive/test_hive_parquet_skip_page.groovy
b/regression-test/suites/external_table_p0/hive/test_hive_parquet_skip_page.groovy
index ebdbedf139d..0bded7d820a 100644
---
a/regression-test/suites/external_table_p0/hive/test_hive_parquet_skip_page.groovy
+++
b/regression-test/suites/external_table_p0/hive/test_hive_parquet_skip_page.groovy
@@ -94,11 +94,21 @@ suite("test_hive_parquet_skip_page",
"p0,external,hive,external_docker,external_
return;
}
+ String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+ def hdfs_port = context.config.otherConfigs.get("hive2HdfsPort")
+ def defaultFS = "hdfs://${externalEnvIp}:${hdfs_port}"
+ def hdfsUserName = "doris"
+ def uri = "${defaultFS}" +
"/user/doris/preinstalled_data/parquet/small_2rowgroup.parquet"
+ qt_small_2rowgroup """ select * from HDFS(
+ "uri" = "${uri}",
+ "hadoop.username" = "${hdfsUserName}",
+ "format" = "parquet") where a = 1024 or a = 4049
+ order by a;"""
+
for (String hivePrefix : ["hive2", "hive3"]) {
try {
String hms_port = context.config.otherConfigs.get(hivePrefix +
"HmsPort")
String catalog_name = "${hivePrefix}_test_parquet_skip_page"
- String externalEnvIp =
context.config.otherConfigs.get("externalEnvIp")
sql """drop catalog if exists ${catalog_name}"""
sql """create catalog if not exists ${catalog_name} properties (
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]