[doris] branch master updated: [Fix](parquet-reader) Fix parquet string column min max statistics issue which caused query result incorrectly. (#21675)

morningman Thu, 13 Jul 2023 09:09:57 -0700

This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git



The following commit(s) were added to refs/heads/master by this push:
     new 6fd8f5cd2f [Fix](parquet-reader) Fix parquet string column min max 
statistics issue which caused query result incorrectly. (#21675)
6fd8f5cd2f is described below

commit 6fd8f5cd2f1a45aa42776283f25d650572e0ffd4
Author: Qi Chen <kaka11.c...@gmail.com>
AuthorDate: Fri Jul 14 00:09:41 2023 +0800

    [Fix](parquet-reader) Fix parquet string column min max statistics issue 
which caused query result incorrectly. (#21675)
    
    In parquet, the legacy min and max statistics may not handle UTF-8 strings correctly.
    The current approach is to use the min_value and max_value statistics introduced by
    PARQUET-1025 when they are set.
    If they are not set, min/max filtering is skipped for string columns for now. A better
    approach would be to still read the min and max statistics when they contain
    only ASCII characters. I will improve this in a future PR.
---
 be/src/vec/exec/format/parquet/parquet_pred_cmp.h  |  14 +-
 be/src/vec/exec/format/parquet/vparquet_reader.cpp |  15 ++-
 .../hive/test_multi_langs.out                      | 148 +++++++++++++++++++++
 .../hive/test_multi_langs.groovy                   |  61 +++++++++
 4 files changed, 231 insertions(+), 7 deletions(-)

diff --git a/be/src/vec/exec/format/parquet/parquet_pred_cmp.h 
b/be/src/vec/exec/format/parquet/parquet_pred_cmp.h
index 8d3057312b..c76fa95f4a 100644
--- a/be/src/vec/exec/format/parquet/parquet_pred_cmp.h
+++ b/be/src/vec/exec/format/parquet/parquet_pred_cmp.h
@@ -120,7 +120,7 @@ private:
     static bool _filter_by_min_max(const ColumnValueRange<primitive_type>& 
col_val_range,
                                    const ScanPredicate& predicate, const 
FieldSchema* col_schema,
                                    const std::string& encoded_min, const 
std::string& encoded_max,
-                                   const cctz::time_zone& ctz) {
+                                   const cctz::time_zone& ctz, bool 
use_min_max_value = false) {
         using CppType = typename PrimitiveTypeTraits<primitive_type>::CppType;
         std::vector<CppType> predicate_values;
         for (const void* v : predicate.values) {
@@ -144,6 +144,13 @@ private:
         case TYPE_CHAR:
             [[fallthrough]];
         case TYPE_STRING:
+            // TODO: In parquet, the legacy min and max statistics may not handle UTF-8 correctly.
+            // The current approach is to use the min_value and max_value statistics introduced by
+            // PARQUET-1025 when they are set. If they are not set, min/max filtering is skipped for now.
+            // A better approach would be to still use min and max when they contain only ASCII characters.
+            if (!use_min_max_value) {
+                return false;
+            }
             if constexpr (std::is_same_v<CppType, StringRef>) {
                 min_value = StringRef(encoded_min);
                 max_value = StringRef(encoded_max);
@@ -372,7 +379,8 @@ public:
     static bool filter_by_stats(const ColumnValueRangeType& col_val_range,
                                 const FieldSchema* col_schema, bool 
is_set_min_max,
                                 const std::string& encoded_min, const 
std::string& encoded_max,
-                                bool is_all_null, const cctz::time_zone& ctz) {
+                                bool is_all_null, const cctz::time_zone& ctz,
+                                bool use_min_max_value = false) {
         bool need_filter = false;
         std::visit(
                 [&](auto&& range) {
@@ -387,7 +395,7 @@ public:
                     }
                     for (auto& filter : filters) {
                         need_filter |= _filter_by_min_max(range, filter, 
col_schema, encoded_min,
-                                                          encoded_max, ctz);
+                                                          encoded_max, ctz, 
use_min_max_value);
                         if (need_filter) {
                             break;
                         }
diff --git a/be/src/vec/exec/format/parquet/vparquet_reader.cpp 
b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
index 9b179384e2..fed33b6d28 100644
--- a/be/src/vec/exec/format/parquet/vparquet_reader.cpp
+++ b/be/src/vec/exec/format/parquet/vparquet_reader.cpp
@@ -836,15 +836,22 @@ Status ParquetReader::_process_column_stat_filter(const 
std::vector<tparquet::Co
         auto& statistic = meta_data.statistics;
         bool is_all_null =
                 (statistic.__isset.null_count && statistic.null_count == 
meta_data.num_values);
-        bool is_set_min_max = (statistic.__isset.max && statistic.__isset.min);
+        bool is_set_min_max = (statistic.__isset.max && statistic.__isset.min) 
||
+                              (statistic.__isset.max_value && 
statistic.__isset.min_value);
         if ((!is_set_min_max) && (!is_all_null)) {
             continue;
         }
         const FieldSchema* col_schema = schema_desc.get_column(col_name);
         // Min-max of statistic is plain-encoded value
-        *filter_group =
-                ParquetPredicate::filter_by_stats(slot_iter->second, 
col_schema, is_set_min_max,
-                                                  statistic.min, 
statistic.max, is_all_null, *_ctz);
+        if (statistic.__isset.min_value) {
+            *filter_group = ParquetPredicate::filter_by_stats(
+                    slot_iter->second, col_schema, is_set_min_max, 
statistic.min_value,
+                    statistic.max_value, is_all_null, *_ctz, true);
+        } else {
+            *filter_group = ParquetPredicate::filter_by_stats(
+                    slot_iter->second, col_schema, is_set_min_max, 
statistic.min, statistic.max,
+                    is_all_null, *_ctz, false);
+        }
         if (*filter_group) {
             break;
         }
diff --git 
a/regression-test/data/external_table_emr_p2/hive/test_multi_langs.out 
b/regression-test/data/external_table_emr_p2/hive/test_multi_langs.out
new file mode 100644
index 0000000000..bebfc26854
--- /dev/null
+++ b/regression-test/data/external_table_emr_p2/hive/test_multi_langs.out
@@ -0,0 +1,148 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !01 --
+2      是
+
+-- !02 --
+1      
+2      是
+3      III类户
+
+-- !03 --
+2      1
+
+-- !04 --
+5      ありがとう
+
+-- !05 --
+1      你好
+2      谢谢
+3      再见
+4      こんにちは
+5      ありがとう
+6      さようなら
+7      안녕하세요
+8      감사합니다
+9      안녕히 가세요
+10     Hola
+11     Gracias
+12     Adiós
+13     Hallo
+14     Danke
+15     Auf Wiedersehen
+16     مرحبا
+17     شكرًا
+18     مع السلامة
+19     Bonjour
+20     Merci
+21     Au revoir
+22     Ciao
+23     Grazie
+24     Arrivederci
+25     Olá
+26     Obrigado
+27     Adeus
+28     Hello
+29     Thank you
+30     Goodbye
+
+-- !06 --
+5      1
+
+-- !01 --
+2      是
+
+-- !02 --
+1      
+2      是
+3      III类户
+
+-- !03 --
+2      1
+
+-- !04 --
+5      ありがとう
+
+-- !05 --
+1      你好
+2      谢谢
+3      再见
+4      こんにちは
+5      ありがとう
+6      さようなら
+7      안녕하세요
+8      감사합니다
+9      안녕히 가세요
+10     Hola
+11     Gracias
+12     Adiós
+13     Hallo
+14     Danke
+15     Auf Wiedersehen
+16     مرحبا
+17     شكرًا
+18     مع السلامة
+19     Bonjour
+20     Merci
+21     Au revoir
+22     Ciao
+23     Grazie
+24     Arrivederci
+25     Olá
+26     Obrigado
+27     Adeus
+28     Hello
+29     Thank you
+30     Goodbye
+
+-- !06 --
+5      1
+
+-- !01 --
+2      是
+
+-- !02 --
+1      
+2      是
+3      III类户
+
+-- !03 --
+2      1
+
+-- !04 --
+5      ありがとう
+
+-- !05 --
+1      你好
+2      谢谢
+3      再见
+4      こんにちは
+5      ありがとう
+6      さようなら
+7      안녕하세요
+8      감사합니다
+9      안녕히 가세요
+10     Hola
+11     Gracias
+12     Adiós
+13     Hallo
+14     Danke
+15     Auf Wiedersehen
+16     مرحبا
+17     شكرًا
+18     مع السلامة
+19     Bonjour
+20     Merci
+21     Au revoir
+22     Ciao
+23     Grazie
+24     Arrivederci
+25     Olá
+26     Obrigado
+27     Adeus
+28     Hello
+29     Thank you
+30     Goodbye
+
+-- !06 --
+5      1
+
diff --git 
a/regression-test/suites/external_table_emr_p2/hive/test_multi_langs.groovy 
b/regression-test/suites/external_table_emr_p2/hive/test_multi_langs.groovy
new file mode 100644
index 0000000000..937fd9039a
--- /dev/null
+++ b/regression-test/suites/external_table_emr_p2/hive/test_multi_langs.groovy
@@ -0,0 +1,61 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_multi_langs", "p2") {
+
+    def formats = ["_parquet", "_orc", "_text"]
+    def q1 = """select * from test_chineseSUFFIX where col1='是' order by id"""
+    def q2 = """select * from test_chineseSUFFIX order by id"""
+    def q3 = """select id, count(col1) from test_chineseSUFFIX where col1='是' 
group by id order by id"""
+    def q4 = """select * from test_multi_langsSUFFIX where col1='ありがとう' order 
by id"""
+    def q5 = """select * from test_multi_langsSUFFIX order by id"""
+    def q6 = """select id, count(col1) from test_multi_langsSUFFIX where 
col1='ありがとう' group by id order by id"""
+
+    String enabled = context.config.otherConfigs.get("enableExternalHiveTest")
+    if (enabled != null && enabled.equalsIgnoreCase("true")) {
+        try {
+            String extHiveHmsHost = 
context.config.otherConfigs.get("extHiveHmsHost")
+            String extHiveHmsPort = 
context.config.otherConfigs.get("extHiveHmsPort")
+            String catalog_name = "test_multi_langs"
+
+            sql """drop catalog if exists ${catalog_name};"""
+            sql """
+                create catalog if not exists ${catalog_name} properties (
+                    'type'='hms',
+                    'hive.metastore.uris' = 
'thrift://${extHiveHmsHost}:${extHiveHmsPort}'
+                );
+            """
+            logger.info("catalog " + catalog_name + " created")
+            sql """switch ${catalog_name};"""
+            logger.info("switched to catalog " + catalog_name)
+            sql """use multi_catalog;"""
+            logger.info("use multi_catalog")
+
+            for (String format in formats) {
+                logger.info("Process format " + format)
+                qt_01 q1.replace("SUFFIX", format)
+                qt_02 q2.replace("SUFFIX", format)
+                qt_03 q3.replace("SUFFIX", format)
+                qt_04 q4.replace("SUFFIX", format)
+                qt_05 q5.replace("SUFFIX", format)
+                qt_06 q6.replace("SUFFIX", format)
+            }
+            sql """drop catalog if exists ${catalog_name}"""
+        } finally {
+        }
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

[doris] branch master updated: [Fix](parquet-reader) Fix parquet string column min max statistics issue which caused query result incorrectly. (#21675)

Reply via email to