This is an automated email from the ASF dual-hosted git repository. lijibing pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push: new 31603089082 Limit the max string length to 1024 while collecting column stats to control BE memory usage. (#32470) (#33084) 31603089082 is described below commit 31603089082ca2a511227473727a6756f9b85e1d Author: Jibing-Li <64681310+jibing...@users.noreply.github.com> AuthorDate: Mon Apr 1 12:01:23 2024 +0800 Limit the max string length to 1024 while collecting column stats to control BE memory usage. (#32470) (#33084) --- .../apache/doris/statistics/BaseAnalysisTask.java | 24 ++++++++++++++++++++++ .../apache/doris/statistics/HMSAnalysisTask.java | 6 +++++- .../apache/doris/statistics/OlapAnalysisTask.java | 6 +++++- .../doris/statistics/OlapAnalysisTaskTest.java | 12 +++++------ 4 files changed, 40 insertions(+), 8 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java index 68767843507..f871e8761a5 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java @@ -82,6 +82,30 @@ public abstract class BaseAnalysisTask { + "NOW() " + "FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} ${sampleHints} ${limit}"; + protected static final String DUJ1_ANALYZE_STRING_TEMPLATE = "SELECT " + + "CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}') AS `id`, " + + "${catalogId} AS `catalog_id`, " + + "${dbId} AS `db_id`, " + + "${tblId} AS `tbl_id`, " + + "${idxId} AS `idx_id`, " + + "'${colId}' AS `col_id`, " + + "NULL AS `part_id`, " + + "${rowCount} AS `row_count`, " + + "${ndvFunction} as `ndv`, " + + "IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * ${scaleFactor} as `null_count`, " + + "SUBSTRING(CAST(${min} AS STRING), 1, 1024) AS `min`, " + + "SUBSTRING(CAST(${max} AS STRING), 1, 1024) AS `max`, " + + "${dataSizeFunction} * ${scaleFactor} AS `data_size`, " + + "NOW() " + + "FROM ( " + + " SELECT t0.`colValue` as `column_key`, COUNT(1) as `count` " + + " FROM " + + " (SELECT SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024) AS `colValue` " + + " FROM `${catalogName}`.`${dbName}`.`${tblName}` ${index} " + + " ${sampleHints} ${limit}) as `t0` " + + " GROUP BY `t0`.`colValue` " + + ") as `t1` "; + protected static final String DUJ1_ANALYZE_TEMPLATE = "SELECT " + "CONCAT('${tblId}', '-', '${idxId}', '-', '${colId}') AS `id`, " + "${catalogId} AS `catalog_id`, " diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java index 1fe827420c9..a90308a8000 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/HMSAnalysisTask.java @@ -148,7 +148,11 @@ public class HMSAnalysisTask extends BaseAnalysisTask { params.put("ndvFunction", "ROUND(NDV(`${colName}`) * ${scaleFactor})"); params.put("rowCount", "ROUND(count(1) * ${scaleFactor})"); } else { - sb.append(DUJ1_ANALYZE_TEMPLATE); + if (col.getType().isStringType()) { + sb.append(DUJ1_ANALYZE_STRING_TEMPLATE); + } else { + sb.append(DUJ1_ANALYZE_TEMPLATE); + } params.put("dataSizeFunction", getDataSizeFunction(col, true)); params.put("ndvFunction", getNdvFunction("ROUND(SUM(t1.count) * ${scaleFactor})")); params.put("rowCount", "ROUND(SUM(t1.count) * ${scaleFactor})"); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java index e6dd46e9fc7..60a3528afc8 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java @@ -163,7 +163,11 @@ public class OlapAnalysisTask extends BaseAnalysisTask { sql = stringSubstitutor.replace(LINEAR_ANALYZE_TEMPLATE); } else { params.put("dataSizeFunction", getDataSizeFunction(col, true)); - sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE); + if (col.getType().isStringType()) { + sql = stringSubstitutor.replace(DUJ1_ANALYZE_STRING_TEMPLATE); + } else { + sql = stringSubstitutor.replace(DUJ1_ANALYZE_TEMPLATE); + } } LOG.info("Sample for column [{}]. Total rows [{}], rows to sample [{}], scale factor [{}], " + "limited [{}], distribute column [{}], partition column [{}], key column [{}], " diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java index 5d9d57406a3..75506b1c85a 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java @@ -159,11 +159,10 @@ public class OlapAnalysisTaskTest { + " IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, " + "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`," + " SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, " - + "SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() " + + "SUM(t1.count) * 4 * 5.0 AS `data_size`, NOW() " + "FROM ( SELECT t0.`${colName}` as `column_key`, COUNT(1) " - + "as `count` FROM (SELECT `${colName}` FROM " - + "`catalogName`.`${dbName}`.`${tblName}` " - + " limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql); + + "as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}`" + + " limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql); return; } }; @@ -183,7 +182,7 @@ public class OlapAnalysisTaskTest { }; OlapAnalysisTask olapAnalysisTask = new OlapAnalysisTask(); - olapAnalysisTask.col = new Column("test", PrimitiveType.STRING); + olapAnalysisTask.col = new Column("test", PrimitiveType.INT); olapAnalysisTask.tbl = tableIf; AnalysisInfoBuilder analysisInfoBuilder = new AnalysisInfoBuilder(); analysisInfoBuilder.setJobType(AnalysisInfo.JobType.MANUAL); @@ -322,7 +321,8 @@ public class OlapAnalysisTaskTest { + "SUBSTRING(CAST('1' AS STRING), 1, 1024) AS `min`, " + "SUBSTRING(CAST('2' AS STRING), 1, 1024) AS `max`, " + "SUM(LENGTH(`column_key`) * count) * 5.0 AS `data_size`, NOW() " - + "FROM ( SELECT t0.`${colName}` as `column_key`, COUNT(1) as `count` FROM (SELECT `${colName}` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`${colName}` ) as `t1` ", sql); + + "FROM ( SELECT t0.`colValue` as `column_key`, COUNT(1) as `count` FROM " + + "(SELECT SUBSTRING(CAST(`${colName}` AS STRING), 1, 1024) AS `colValue` FROM `catalogName`.`${dbName}`.`${tblName}` limit 100) as `t0` GROUP BY `t0`.`colValue` ) as `t1` ", sql); return; } }; --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org