This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 35089583cf8 [fix](variant) Handle truncated sparse path stats when reading variant (#64205)
35089583cf8 is described below

commit 35089583cf80e5117fc4cb2917b1daf8ce38a7e2
Author: lihangyu <[email protected]>
AuthorDate: Wed Jun 17 11:05:51 2026 +0800

    [fix](variant) Handle truncated sparse path stats when reading variant (#64205)
---
 .../segment/variant/variant_column_reader.cpp      | 17 +++++++++------
 .../apache/doris/common/util/PropertyAnalyzer.java |  8 ++++----
 .../java/org/apache/doris/qe/SessionVariable.java  |  9 ++++++++
 .../apache/doris/common/PropertyAnalyzerTest.java  | 22 ++++++++++++++++++++
 .../ddl/create_nestedtypes_with_schemachange.out   | 12 +++++------
 .../test_variant_compaction_with_sparse_limit.out  |  6 +++---
 .../create_nestedtypes_with_schemachange.groovy    |  2 +-
 ...est_variant_compaction_with_sparse_limit.groovy | 24 ++++++++++++++++++++--
 8 files changed, 78 insertions(+), 22 deletions(-)

diff --git a/be/src/storage/segment/variant/variant_column_reader.cpp 
b/be/src/storage/segment/variant/variant_column_reader.cpp
index ded364b33c2..df9561ada7c 100644
--- a/be/src/storage/segment/variant/variant_column_reader.cpp
+++ b/be/src/storage/segment/variant/variant_column_reader.cpp
@@ -192,9 +192,9 @@ bool VariantColumnReader::is_exceeded_sparse_column_limit() 
const {
 }
 
 bool VariantColumnReader::_is_exceeded_sparse_column_limit_unlocked() const {
-    bool exceeded_sparse_column_limit = 
!_statistics->sparse_column_non_null_size.empty() &&
-                                        
_statistics->sparse_column_non_null_size.size() >=
-                                                
_variant_sparse_column_statistics_size;
+    const bool exceeded_sparse_column_limit = 
!_statistics->sparse_column_non_null_size.empty() &&
+                                              
_statistics->sparse_column_non_null_size.size() >=
+                                                      
_variant_sparse_column_statistics_size;
     DBUG_EXECUTE_IF("exceeded_sparse_column_limit_must_be_false", {
         if (exceeded_sparse_column_limit) {
             throw doris::Exception(
@@ -882,8 +882,12 @@ Status VariantColumnReader::_build_read_plan(ReadPlan* 
plan, const TabletColumn&
     }
 
     // Check if path is prefix, example sparse columns path: a.b.c, a.b.e, 
access prefix: a.b.
-    // Or access root path
-    if (_has_prefix_path_unlocked(relative_path)) {
+    // Or access root path. If sparse stats reached the configured limit, an 
exact sparse path can
+    // still have unrecorded sparse children such as a.b.c.
+    const bool has_prefix_path = _has_prefix_path_unlocked(relative_path);
+    const bool sparse_stats_may_have_unrecorded_children =
+            exceeded_sparse_column_limit && existed_in_sparse_column;
+    if (has_prefix_path || sparse_stats_may_have_unrecorded_children) {
         // Example {"b" : {"c":456,"e":7.111}}
         // b.c is sparse column, b.e is subcolumn, so b is both the prefix of 
sparse column and
         // subcolumn
@@ -951,7 +955,8 @@ Status VariantColumnReader::_build_read_plan(ReadPlan* 
plan, const TabletColumn&
         }
 
         if (exceeded_sparse_column_limit) {
-            // maybe exist prefix path in sparse column
+            // Sparse stats are truncated, so a missing exact sparse path does 
not prove that the
+            // path is absent. It may still be nested under a recorded sparse 
object.
             plan->kind = ReadKind::HIERARCHICAL;
             plan->type = create_variant_type(target_col);
             plan->relative_path = relative_path;
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java 
b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java
index 2e776e264bf..b27db96bbe1 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/common/util/PropertyAnalyzer.java
@@ -2118,12 +2118,12 @@ public class PropertyAnalyzer {
                     
properties.get(PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE);
             try {
                 maxSparseColumnStatisticsSize = 
Integer.parseInt(maxSparseColumnStatisticsSizeStr);
-                if (maxSparseColumnStatisticsSize < 0 || 
maxSparseColumnStatisticsSize > 50000) {
-                    throw new 
AnalysisException("variant_max_sparse_column_statistics_size must between 0 and 
50000 ");
-                }
-            } catch (Exception e) {
+            } catch (NumberFormatException e) {
                 throw new 
AnalysisException("variant_max_sparse_column_statistics_size format error:" + 
e.getMessage());
             }
+            if (maxSparseColumnStatisticsSize < 1 || 
maxSparseColumnStatisticsSize > 50000) {
+                throw new 
AnalysisException("variant_max_sparse_column_statistics_size must between 1 and 
50000 ");
+            }
 
             
properties.remove(PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE);
         }
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java 
b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index c9a17bc7527..5eacd86e5b0 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -3597,6 +3597,7 @@ public class SessionVariable implements Serializable, 
Writable {
     @VarAttrDef.VarAttr(
             name = DEFAULT_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE,
             needForward = true,
+            checker = "checkDefaultVariantMaxSparseColumnStatisticsSize",
             fuzzy = true
     )
     public int defaultVariantMaxSparseColumnStatisticsSize = 10000;
@@ -6446,6 +6447,14 @@ public class SessionVariable implements Serializable, 
Writable {
         }
     }
 
+    public void checkDefaultVariantMaxSparseColumnStatisticsSize(String 
variantMaxSparseColumnStatisticsSize) {
+        int value = Integer.valueOf(variantMaxSparseColumnStatisticsSize);
+        if (value < 1 || value > 50000) {
+            throw new UnsupportedOperationException("variant max sparse column 
statistics size is: "
+                    + variantMaxSparseColumnStatisticsSize + " it must between 
1 and 50000");
+        }
+    }
+
     public void checkHnswEfSearch(String efSearch) {
         int value = Integer.valueOf(efSearch);
         if (value < 1) {
diff --git 
a/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java 
b/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java
index bd821e4d377..81878c9b1a3 100644
--- a/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java
+++ b/fe/fe-core/src/test/java/org/apache/doris/common/PropertyAnalyzerTest.java
@@ -28,6 +28,7 @@ import org.apache.doris.catalog.ScalarType;
 import org.apache.doris.catalog.Type;
 import org.apache.doris.common.util.PropertyAnalyzer;
 import org.apache.doris.common.util.TimeUtils;
+import org.apache.doris.qe.SessionVariable;
 import org.apache.doris.resource.Tag;
 import org.apache.doris.thrift.TInvertedIndexFileStorageFormat;
 import org.apache.doris.thrift.TStorageFormat;
@@ -345,6 +346,19 @@ public class PropertyAnalyzerTest {
     @Test
     public void testAnalyzeVariantMaxSparseColumnStatisticsSize() throws 
AnalysisException {
         Map<String, String> properties = Maps.newHashMap();
+        
properties.put(PropertyAnalyzer.PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE,
 "0");
+        try {
+            
PropertyAnalyzer.analyzeVariantMaxSparseColumnStatisticsSize(properties, 0);
+            Assertions.fail("Expected AnalysisException was not thrown");
+        } catch (AnalysisException e) {
+            Assertions.assertNotNull(e.getMessage());
+        }
+        properties.clear();
+        
properties.put(PropertyAnalyzer.PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE,
 "1");
+        Assertions.assertEquals(1, 
PropertyAnalyzer.analyzeVariantMaxSparseColumnStatisticsSize(properties, 0));
+        Assertions.assertFalse(properties.containsKey(
+                
PropertyAnalyzer.PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE));
+        properties.clear();
         
properties.put(PropertyAnalyzer.PROPERTIES_VARIANT_MAX_SPARSE_COLUMN_STATISTICS_SIZE,
 "-1");
         try {
             
PropertyAnalyzer.analyzeVariantMaxSparseColumnStatisticsSize(properties, 0);
@@ -370,6 +384,14 @@ public class PropertyAnalyzerTest {
         }
     }
 
+    @Test
+    public void testCheckDefaultVariantMaxSparseColumnStatisticsSize() {
+        SessionVariable sessionVariable = new SessionVariable();
+        Assertions.assertThrows(UnsupportedOperationException.class,
+                () -> 
sessionVariable.checkDefaultVariantMaxSparseColumnStatisticsSize("0"));
+        sessionVariable.checkDefaultVariantMaxSparseColumnStatisticsSize("1");
+    }
+
     @Test
     public void testAnalyzeSequenceMap() throws AnalysisException {
         List<Column> columns = Lists.newArrayList();
diff --git 
a/regression-test/data/datatype_p0/nested_types/ddl/create_nestedtypes_with_schemachange.out
 
b/regression-test/data/datatype_p0/nested_types/ddl/create_nestedtypes_with_schemachange.out
index 86cce3569f7..725ed40d971 100644
--- 
a/regression-test/data/datatype_p0/nested_types/ddl/create_nestedtypes_with_schemachange.out
+++ 
b/regression-test/data/datatype_p0/nested_types/ddl/create_nestedtypes_with_schemachange.out
@@ -53,7 +53,7 @@ col2  int     No      false   \N      NONE
 col3   array<int>      Yes     false   \N      NONE
 col4   map<int,int>    Yes     false   \N      NONE
 col5   struct<f1:int>  Yes     false   \N      NONE
-col6   variant<PROPERTIES ("variant_max_subcolumns_count" = 
"0","variant_enable_typed_paths_to_sparse" = 
"false","variant_max_sparse_column_statistics_size" = 
"0","variant_sparse_hash_shard_count" = "1")>   Yes     false   \N      NONE
+col6   variant<PROPERTIES ("variant_max_subcolumns_count" = 
"0","variant_enable_typed_paths_to_sparse" = 
"false","variant_max_sparse_column_statistics_size" = 
"1","variant_sparse_hash_shard_count" = "1")>   Yes     false   \N      NONE
 
 -- !sql_after --
 1      2       [1, 2]  {1:2}   {"f1":1}        {"a":[1, 2, 3]}
@@ -67,7 +67,7 @@ col2  int     No      false   \N      NONE
 col3   array<int>      Yes     false   \N      NONE
 col4   map<int,int>    No      false   \N      NONE
 col5   struct<f1:int>  No      false   \N      NONE
-col6   variant<PROPERTIES ("variant_max_subcolumns_count" = 
"0","variant_enable_typed_paths_to_sparse" = 
"false","variant_max_sparse_column_statistics_size" = 
"0","variant_sparse_hash_shard_count" = "1")>   No      false   \N      NONE
+col6   variant<PROPERTIES ("variant_max_subcolumns_count" = 
"0","variant_enable_typed_paths_to_sparse" = 
"false","variant_max_sparse_column_statistics_size" = 
"1","variant_sparse_hash_shard_count" = "1")>   No      false   \N      NONE
 
 -- !sql_after --
 1      2       [1, 2]  {1:2}   {"f1":1}        {"a":[1, 2, 3]}
@@ -81,7 +81,7 @@ col2  int     No      false   \N      NONE
 col3   array<int>      Yes     false   \N      NONE
 col4   map<int,int>    Yes     false   \N      NONE
 col5   struct<f1:int>  Yes     false   \N      NONE
-col6   variant<PROPERTIES ("variant_max_subcolumns_count" = 
"0","variant_enable_typed_paths_to_sparse" = 
"false","variant_max_sparse_column_statistics_size" = 
"0","variant_sparse_hash_shard_count" = "1")>   Yes     false   \N      NONE
+col6   variant<PROPERTIES ("variant_max_subcolumns_count" = 
"0","variant_enable_typed_paths_to_sparse" = 
"false","variant_max_sparse_column_statistics_size" = 
"1","variant_sparse_hash_shard_count" = "1")>   Yes     false   \N      NONE
 
 -- !sql_after --
 1      2       [1, 2]  {1:2}   {"f1":1}        {"a":[1, 2, 3]}
@@ -95,7 +95,7 @@ col2  int     No      false   \N      NONE
 col3   array<int>      No      false   \N      NONE
 col4   map<int,int>    Yes     false   \N      NONE
 col5   struct<f1:int>  No      false   \N      NONE
-col6   variant<PROPERTIES ("variant_max_subcolumns_count" = 
"0","variant_enable_typed_paths_to_sparse" = 
"false","variant_max_sparse_column_statistics_size" = 
"0","variant_sparse_hash_shard_count" = "1")>   No      false   \N      NONE
+col6   variant<PROPERTIES ("variant_max_subcolumns_count" = 
"0","variant_enable_typed_paths_to_sparse" = 
"false","variant_max_sparse_column_statistics_size" = 
"1","variant_sparse_hash_shard_count" = "1")>   No      false   \N      NONE
 
 -- !sql_after --
 1      2       [1, 2]  {1:2}   {"f1":1}        {"a":[1, 2, 3]}
@@ -109,7 +109,7 @@ col2        int     No      false   \N      NONE
 col3   array<int>      Yes     false   \N      NONE
 col4   map<int,int>    Yes     false   \N      NONE
 col5   struct<f1:int>  Yes     false   \N      NONE
-col6   variant<PROPERTIES ("variant_max_subcolumns_count" = 
"0","variant_enable_typed_paths_to_sparse" = 
"false","variant_max_sparse_column_statistics_size" = 
"0","variant_sparse_hash_shard_count" = "1")>   Yes     false   \N      NONE
+col6   variant<PROPERTIES ("variant_max_subcolumns_count" = 
"0","variant_enable_typed_paths_to_sparse" = 
"false","variant_max_sparse_column_statistics_size" = 
"1","variant_sparse_hash_shard_count" = "1")>   Yes     false   \N      NONE
 
 -- !sql_after --
 1      2       [1, 2]  {1:2}   {"f1":1}        {"a":[1, 2, 3]}
@@ -123,7 +123,7 @@ col2        int     No      false   \N      NONE
 col3   array<int>      No      false   \N      NONE
 col4   map<int,int>    No      false   \N      NONE
 col5   struct<f1:int>  Yes     false   \N      NONE
-col6   variant<PROPERTIES ("variant_max_subcolumns_count" = 
"0","variant_enable_typed_paths_to_sparse" = 
"false","variant_max_sparse_column_statistics_size" = 
"0","variant_sparse_hash_shard_count" = "1")>   No      false   \N      NONE
+col6   variant<PROPERTIES ("variant_max_subcolumns_count" = 
"0","variant_enable_typed_paths_to_sparse" = 
"false","variant_max_sparse_column_statistics_size" = 
"1","variant_sparse_hash_shard_count" = "1")>   No      false   \N      NONE
 
 -- !sql_after --
 1      2       [1, 2]  {1:2}   {"f1":1}        {"a":[1, 2, 3]}
diff --git 
a/regression-test/data/variant_p0/predefine/test_variant_compaction_with_sparse_limit.out
 
b/regression-test/data/variant_p0/predefine/test_variant_compaction_with_sparse_limit.out
index 42862a35f10..6caed66d80f 100644
--- 
a/regression-test/data/variant_p0/predefine/test_variant_compaction_with_sparse_limit.out
+++ 
b/regression-test/data/variant_p0/predefine/test_variant_compaction_with_sparse_limit.out
@@ -34,8 +34,8 @@
 16     {"a":"1223"}
 17     {"a":[1]}
 17     {"a":[1]}
-18     {"a":["1",2,1.1]}
-18     {"a":["1",2,1.1]}
+18     {"a":["1", 2, 1.1]}
+18     {"a":["1", 2, 1.1]}
 19     {"a":1,"b":{"c":1}}
 19     {"a":1,"b":{"c":1}}
 20     {"a":1,"b":{"c":[{"a":1}]}}
@@ -193,7 +193,7 @@
 15     {"a":1}
 16     {"a":"1223"}
 17     {"a":[1]}
-18     {"a":["1",2,1.1]}
+18     {"a":["1", 2, 1.1]}
 19     {"a":1,"b":{"c":1}}
 20     {"a":1,"b":{"c":[{"a":1}]}}
 21     {"a":1,"b":{"c":[{"a":1}]}}
diff --git 
a/regression-test/suites/datatype_p0/nested_types/ddl/create_nestedtypes_with_schemachange.groovy
 
b/regression-test/suites/datatype_p0/nested_types/ddl/create_nestedtypes_with_schemachange.groovy
index 138c4914c3f..d4333ccb72f 100644
--- 
a/regression-test/suites/datatype_p0/nested_types/ddl/create_nestedtypes_with_schemachange.groovy
+++ 
b/regression-test/suites/datatype_p0/nested_types/ddl/create_nestedtypes_with_schemachange.groovy
@@ -20,7 +20,7 @@ suite("create_nestedtypes_with_schemachange", "p0") {
         // create basic type
         sql "set default_variant_max_subcolumns_count = 0"
         sql "set default_variant_enable_typed_paths_to_sparse = false"
-        sql "set default_variant_max_sparse_column_statistics_size = 0"
+        sql "set default_variant_max_sparse_column_statistics_size = 1"
         sql "set default_variant_sparse_hash_shard_count = 0"
         sql "set default_variant_enable_doc_mode = false"
         sql "DROP TABLE IF EXISTS $testTablex"
diff --git 
a/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy
 
b/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy
index 1a51f065c0b..5660171b5fa 100644
--- 
a/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy
+++ 
b/regression-test/suites/variant_p0/predefine/test_variant_compaction_with_sparse_limit.groovy
@@ -41,7 +41,24 @@ suite("test_compaction_variant_predefine_with_sparse_limit", 
"nonConcurrent") {
         }
 
         int max_sparse_column_statistics_size = 2
-        def create_table = { tableName, buckets="auto", key_type="DUPLICATE" ->
+        test {
+            sql """ set default_variant_max_sparse_column_statistics_size = 0 
"""
+            exception "variant max sparse column statistics size"
+        }
+        sql "DROP TABLE IF EXISTS variant_sparse_stats_zero"
+        test {
+            sql """
+                CREATE TABLE variant_sparse_stats_zero (
+                    k bigint,
+                    v variant 
<properties("variant_max_sparse_column_statistics_size" = "0")>
+                )
+                DUPLICATE KEY(`k`)
+                DISTRIBUTED BY HASH(k) BUCKETS 1
+                properties("replication_num" = "1");
+            """
+            exception "variant_max_sparse_column_statistics_size must between 
1 and 50000"
+        }
+        def create_table = { tableName, buckets="auto", key_type="DUPLICATE", 
max_subcolumns_count=2048 ->
             sql "DROP TABLE IF EXISTS ${tableName}"
             def var_def = "variant <MATCH_NAME 'sala' : int, MATCH_NAME 'ddd' 
: double, MATCH_NAME 'z' : double, 
properties(\"variant_max_sparse_column_statistics_size\" = 
\"${max_sparse_column_statistics_size}\")>"
             if (key_type == "AGGREGATE") {
@@ -60,13 +77,16 @@ 
suite("test_compaction_variant_predefine_with_sparse_limit", "nonConcurrent") {
             def create_tbl_res = sql """ show create table ${tableName} """
             logger.info("${create_tbl_res}")
             
assertTrue(create_tbl_res.toString().contains("variant_max_sparse_column_statistics_size"))
+            
assertTrue(create_tbl_res.toString().contains("\"variant_max_subcolumns_count\" 
= \"${max_subcolumns_count}\""))
         }
         def key_types = ["DUPLICATE", "UNIQUE", "AGGREGATE"]
         // def key_types = ["AGGREGATE"]
         for (int i = 0; i < key_types.size(); i++) {
+            def max_subcolumns_count = key_types[i] == "AGGREGATE" ? 2048 : 1
+            sql """ set default_variant_max_subcolumns_count = 
${max_subcolumns_count} """
             def tableName = "simple_variant_${key_types[i]}"
             // 1. simple cases
-            create_table.call(tableName, "1", key_types[i])
+            create_table.call(tableName, "1", key_types[i], 
max_subcolumns_count)
             def insert1 = {
                 sql """insert into ${tableName} values (1,  '{"x" : 
[1]}'),(13,  '{"a" : 1}');"""
                 sql """insert into ${tableName} values (2,  '{"a" : 
"1"}'),(14,  '{"a" : [[[1]]]}');"""


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to