This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
commit c0ce6ac175bd42c8e6516c2faa89521e2d6fb2c8 Author: Mingyu Chen <morning...@163.com> AuthorDate: Sat Sep 16 09:57:39 2023 +0800 [fix](orc) fix the count(*) pushdown issue in orc format (#24446) Previously, when querying a Hive table in ORC format where the file is split, the result of select count(*) could be a multiple of the real row number. This is because the number of rows must be obtained after ORC stripe pruning; otherwise, it may return a wrong result. --- be/src/apache-orc | 2 +- be/src/vec/exec/format/orc/vorc_reader.cpp | 5 +- be/src/vec/exec/format/orc/vorc_reader.h | 2 + .../tablefunction/HdfsTableValuedFunction.java | 2 +- .../tvf/test_hdfs_tvf_compression.out | 18 +++++++ .../tvf/test_hdfs_tvf_compression.groovy | 63 ++++++++++++++++++++++ 6 files changed, 88 insertions(+), 4 deletions(-) diff --git a/be/src/apache-orc b/be/src/apache-orc index 78bbe2e41f..a7c0af50f8 160000 --- a/be/src/apache-orc +++ b/be/src/apache-orc @@ -1 +1 @@ -Subproject commit 78bbe2e41f2140b803855d683fae5e1a4b734a37 +Subproject commit a7c0af50f8ca8ff7cddaf8675473a037f8b13143 diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp b/be/src/vec/exec/format/orc/vorc_reader.cpp index 37d3e6ac7e..c88ea70dfa 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.cpp +++ b/be/src/vec/exec/format/orc/vorc_reader.cpp @@ -245,8 +245,6 @@ Status OrcReader::_create_file_reader() { } return Status::InternalError("Init OrcReader failed. reason = {}", _err_msg); } - _remaining_rows = _reader->getNumberOfRows(); - return Status::OK(); } @@ -789,6 +787,9 @@ Status OrcReader::set_fill_columns( auto& selected_type = _row_reader->getSelectedType(); int idx = 0; _init_select_types(selected_type, idx); + + _remaining_rows = _row_reader->getNumberOfRows(); + } catch (std::exception& e) { return Status::InternalError("Failed to create orc row reader. 
reason = {}", e.what()); } diff --git a/be/src/vec/exec/format/orc/vorc_reader.h b/be/src/vec/exec/format/orc/vorc_reader.h index 05fe7125cd..e14c93244e 100644 --- a/be/src/vec/exec/format/orc/vorc_reader.h +++ b/be/src/vec/exec/format/orc/vorc_reader.h @@ -489,6 +489,8 @@ private: void set_remaining_rows(int64_t rows) { _remaining_rows = rows; } private: + // This is only for count(*) short circuit read. + // save the total number of rows in range int64_t _remaining_rows = 0; RuntimeProfile* _profile = nullptr; RuntimeState* _state = nullptr; diff --git a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HdfsTableValuedFunction.java b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HdfsTableValuedFunction.java index 718b8ae381..eb8e8f70f7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HdfsTableValuedFunction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HdfsTableValuedFunction.java @@ -70,7 +70,7 @@ public class HdfsTableValuedFunction extends ExternalFileTableValuedFunction { // because HADOOP_FS_NAME contains upper and lower case locationProperties.put(HdfsResource.HADOOP_FS_NAME, params.get(key)); } else { - throw new AnalysisException(key + " is invalid property"); + locationProperties.put(key, params.get(key)); } } diff --git a/regression-test/data/external_table_p2/tvf/test_hdfs_tvf_compression.out b/regression-test/data/external_table_p2/tvf/test_hdfs_tvf_compression.out index a92e6f28cb..6d92ffffc2 100644 --- a/regression-test/data/external_table_p2/tvf/test_hdfs_tvf_compression.out +++ b/regression-test/data/external_table_p2/tvf/test_hdfs_tvf_compression.out @@ -248,3 +248,21 @@ c133 TEXT Yes false \N NONE -- !plain_2 -- +-- !count_parquet_0 -- +1062734 + +-- !count_parquet_1 -- +1062734 + +-- !count_orc_0 -- +2777636 + +-- !count_orc_1 -- +2777636 + +-- !count_text_0 -- +144730 + +-- !count_text_1 -- +144730 + diff --git 
a/regression-test/suites/external_table_p2/tvf/test_hdfs_tvf_compression.groovy b/regression-test/suites/external_table_p2/tvf/test_hdfs_tvf_compression.groovy index 2f07106957..40dc3c2440 100644 --- a/regression-test/suites/external_table_p2/tvf/test_hdfs_tvf_compression.groovy +++ b/regression-test/suites/external_table_p2/tvf/test_hdfs_tvf_compression.groovy @@ -105,7 +105,70 @@ suite("test_hdfs_tvf_compression", "p2,external,tvf,external_remote,external_rem "column_separator" = '\001', "compress_type" = "plain") where c2="abc" order by c3,c4,c10 limit 5; """ + + // test count(*) push down + def test_data_dir = "hdfs://${nameNodeHost}:${hdfsPort}" + // parquet + sql """set file_split_size=0;""" + qt_count_parquet_0 """ + select count(*) from + HDFS( + "uri" = "${test_data_dir}/test_data/ckbench_hits.part-00000.snappy.parquet", + "fs.defaultFS" = "${baseFs}", + "format" = "parquet" + ); + """ + + sql """set file_split_size=388608;""" + qt_count_parquet_1 """ + select count(*) from + HDFS( + "uri" = "${test_data_dir}/test_data/ckbench_hits.part-00000.snappy.parquet", + "fs.defaultFS" = "${baseFs}", + "format" = "parquet" + ); + """ + + // orc + sql """set file_split_size=0;""" + qt_count_orc_0 """ + select count(*) from + HDFS( + "uri" = "${test_data_dir}/test_data/ckbench_hits.000000_0.orc", + "fs.defaultFS" = "${baseFs}", + "format" = "orc" + ); + """ + + sql """set file_split_size=388608;""" + qt_count_orc_1 """ + select count(*) from + HDFS( + "uri" = "${test_data_dir}/test_data/ckbench_hits.000000_0.orc", + "fs.defaultFS" = "${baseFs}", + "format" = "orc" + ); + """ + // text + sql """set file_split_size=0;""" + qt_count_text_0 """ + select count(*) from + HDFS( + "uri" = "${test_data_dir}/test_data/tpcds_catalog_returns_data-m-00000.txt", + "fs.defaultFS" = "${baseFs}", + "format" = "csv" + ); + """ + sql """set file_split_size=388608;""" + qt_count_text_1 """ + select count(*) from + HDFS( + "uri" = 
"${test_data_dir}/test_data/tpcds_catalog_returns_data-m-00000.txt", + "fs.defaultFS" = "${baseFs}", + "format" = "csv" + ); + """ } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org