This is an automated email from the ASF dual-hosted git repository.

kxiao pushed a commit to branch branch-2.0
in repository https://gitbox.apache.org/repos/asf/doris.git

commit c0ce6ac175bd42c8e6516c2faa89521e2d6fb2c8
Author: Mingyu Chen <morning...@163.com>
AuthorDate: Sat Sep 16 09:57:39 2023 +0800

    [fix](orc) fix the count(*) pushdown issue in orc format (#24446)
    
    Previously, when querying a Hive table in ORC format where the file is 
split,
    the result of select count(*) could be a multiple of the real row number.
    
    This is because the number of rows should be obtained after ORC stripe pruning;
    otherwise, it may return a wrong result.
---
 be/src/apache-orc                                  |  2 +-
 be/src/vec/exec/format/orc/vorc_reader.cpp         |  5 +-
 be/src/vec/exec/format/orc/vorc_reader.h           |  2 +
 .../tablefunction/HdfsTableValuedFunction.java     |  2 +-
 .../tvf/test_hdfs_tvf_compression.out              | 18 +++++++
 .../tvf/test_hdfs_tvf_compression.groovy           | 63 ++++++++++++++++++++++
 6 files changed, 88 insertions(+), 4 deletions(-)

diff --git a/be/src/apache-orc b/be/src/apache-orc
index 78bbe2e41f..a7c0af50f8 160000
--- a/be/src/apache-orc
+++ b/be/src/apache-orc
@@ -1 +1 @@
-Subproject commit 78bbe2e41f2140b803855d683fae5e1a4b734a37
+Subproject commit a7c0af50f8ca8ff7cddaf8675473a037f8b13143
diff --git a/be/src/vec/exec/format/orc/vorc_reader.cpp 
b/be/src/vec/exec/format/orc/vorc_reader.cpp
index 37d3e6ac7e..c88ea70dfa 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.cpp
+++ b/be/src/vec/exec/format/orc/vorc_reader.cpp
@@ -245,8 +245,6 @@ Status OrcReader::_create_file_reader() {
         }
         return Status::InternalError("Init OrcReader failed. reason = {}", 
_err_msg);
     }
-    _remaining_rows = _reader->getNumberOfRows();
-
     return Status::OK();
 }
 
@@ -789,6 +787,9 @@ Status OrcReader::set_fill_columns(
         auto& selected_type = _row_reader->getSelectedType();
         int idx = 0;
         _init_select_types(selected_type, idx);
+
+        _remaining_rows = _row_reader->getNumberOfRows();
+
     } catch (std::exception& e) {
         return Status::InternalError("Failed to create orc row reader. reason 
= {}", e.what());
     }
diff --git a/be/src/vec/exec/format/orc/vorc_reader.h 
b/be/src/vec/exec/format/orc/vorc_reader.h
index 05fe7125cd..e14c93244e 100644
--- a/be/src/vec/exec/format/orc/vorc_reader.h
+++ b/be/src/vec/exec/format/orc/vorc_reader.h
@@ -489,6 +489,8 @@ private:
     void set_remaining_rows(int64_t rows) { _remaining_rows = rows; }
 
 private:
+    // This is only for count(*) short circuit read.
+    // save the total number of rows in range
     int64_t _remaining_rows = 0;
     RuntimeProfile* _profile = nullptr;
     RuntimeState* _state = nullptr;
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HdfsTableValuedFunction.java
 
b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HdfsTableValuedFunction.java
index 718b8ae381..eb8e8f70f7 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HdfsTableValuedFunction.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/HdfsTableValuedFunction.java
@@ -70,7 +70,7 @@ public class HdfsTableValuedFunction extends 
ExternalFileTableValuedFunction {
                 // because HADOOP_FS_NAME contains upper and lower case
                 locationProperties.put(HdfsResource.HADOOP_FS_NAME, 
params.get(key));
             } else {
-                throw new AnalysisException(key + " is invalid property");
+                locationProperties.put(key, params.get(key));
             }
         }
 
diff --git 
a/regression-test/data/external_table_p2/tvf/test_hdfs_tvf_compression.out 
b/regression-test/data/external_table_p2/tvf/test_hdfs_tvf_compression.out
index a92e6f28cb..6d92ffffc2 100644
--- a/regression-test/data/external_table_p2/tvf/test_hdfs_tvf_compression.out
+++ b/regression-test/data/external_table_p2/tvf/test_hdfs_tvf_compression.out
@@ -248,3 +248,21 @@ c133       TEXT    Yes     false   \N      NONE
 
 -- !plain_2 --
 
+-- !count_parquet_0 --
+1062734
+
+-- !count_parquet_1 --
+1062734
+
+-- !count_orc_0 --
+2777636
+
+-- !count_orc_1 --
+2777636
+
+-- !count_text_0 --
+144730
+
+-- !count_text_1 --
+144730
+
diff --git 
a/regression-test/suites/external_table_p2/tvf/test_hdfs_tvf_compression.groovy 
b/regression-test/suites/external_table_p2/tvf/test_hdfs_tvf_compression.groovy
index 2f07106957..40dc3c2440 100644
--- 
a/regression-test/suites/external_table_p2/tvf/test_hdfs_tvf_compression.groovy
+++ 
b/regression-test/suites/external_table_p2/tvf/test_hdfs_tvf_compression.groovy
@@ -105,7 +105,70 @@ suite("test_hdfs_tvf_compression", 
"p2,external,tvf,external_remote,external_rem
             "column_separator" = '\001',
             "compress_type" = "plain") where c2="abc" order by c3,c4,c10 limit 
5;
         """
+
+        // test count(*) push down
+        def test_data_dir = "hdfs://${nameNodeHost}:${hdfsPort}"
+        // parquet
+        sql """set file_split_size=0;"""
+        qt_count_parquet_0 """ 
+        select count(*) from 
+        HDFS(
+            "uri" = 
"${test_data_dir}/test_data/ckbench_hits.part-00000.snappy.parquet",
+            "fs.defaultFS" = "${baseFs}",
+            "format" = "parquet"
+        );
+        """
+
+        sql """set file_split_size=388608;"""
+        qt_count_parquet_1 """ 
+        select count(*) from 
+        HDFS(
+            "uri" = 
"${test_data_dir}/test_data/ckbench_hits.part-00000.snappy.parquet",
+            "fs.defaultFS" = "${baseFs}",
+            "format" = "parquet"
+        );
+        """
+
+        // orc
+        sql """set file_split_size=0;"""
+        qt_count_orc_0 """ 
+        select count(*) from 
+        HDFS(
+            "uri" = "${test_data_dir}/test_data/ckbench_hits.000000_0.orc",
+            "fs.defaultFS" = "${baseFs}",
+            "format" = "orc"
+        );
+        """
+
+        sql """set file_split_size=388608;"""
+        qt_count_orc_1 """ 
+        select count(*) from 
+        HDFS(
+            "uri" = "${test_data_dir}/test_data/ckbench_hits.000000_0.orc",
+            "fs.defaultFS" = "${baseFs}",
+            "format" = "orc"
+        );
+        """
         
+        // text
+        sql """set file_split_size=0;"""
+        qt_count_text_0 """ 
+        select count(*) from 
+        HDFS(
+            "uri" = 
"${test_data_dir}/test_data/tpcds_catalog_returns_data-m-00000.txt",
+            "fs.defaultFS" = "${baseFs}",
+            "format" = "csv"
+        );
+        """
 
+        sql """set file_split_size=388608;"""
+        qt_count_text_1 """ 
+        select count(*) from 
+        HDFS(
+            "uri" = 
"${test_data_dir}/test_data/tpcds_catalog_returns_data-m-00000.txt",
+            "fs.defaultFS" = "${baseFs}",
+            "format" = "csv"
+        );
+        """
     }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to