This is an automated email from the ASF dual-hosted git repository. kxiao pushed a commit to branch branch-2.0 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.0 by this push: new c3d81cb3a0d [opt](stats) Use escape rather than base64 for min/max value #27746 (#27748) c3d81cb3a0d is described below commit c3d81cb3a0d84db1b006c7499df4909bcc8277a5 Author: AKIRA <33112463+kikyou1...@users.noreply.github.com> AuthorDate: Wed Nov 29 20:59:02 2023 +0800 [opt](stats) Use escape rather than base64 for min/max value #27746 (#27748) --- .../org/apache/doris/statistics/BaseAnalysisTask.java | 6 +++--- .../java/org/apache/doris/statistics/ColStatsData.java | 16 +++------------- .../org/apache/doris/statistics/ColumnStatistic.java | 4 ---- .../org/apache/doris/statistics/OlapAnalysisTask.java | 4 ++-- .../apache/doris/statistics/StatisticsRepository.java | 4 ++-- .../org/apache/doris/statistics/util/StatisticsUtil.java | 4 +++- .../apache/doris/statistics/BaseAnalysisTaskTest.java | 4 ++-- .../apache/doris/statistics/OlapAnalysisTaskTest.java | 6 +++--- .../java/org/apache/doris/statistics/StatsMockUtil.java | 2 +- .../apache/doris/statistics/util/StatisticsUtilTest.java | 8 ++++++++ 10 files changed, 27 insertions(+), 31 deletions(-) diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java index 824e3f74abd..4c0e07ce6cb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/BaseAnalysisTask.java @@ -252,7 +252,7 @@ public abstract class BaseAnalysisTask { protected String getMinFunction() { if (tableSample == null) { - return "to_base64(CAST(MIN(`${colName}`) as ${type})) "; + return "CAST(MIN(`${colName}`) as ${type}) "; } else { // Min value is not accurate while sample, so set it to NULL to avoid optimizer generate bad plan. return "NULL"; @@ -276,7 +276,7 @@ public abstract class BaseAnalysisTask { // Max value is not accurate while sample, so set it to NULL to avoid optimizer generate bad plan. protected String getMaxFunction() { if (tableSample == null) { - return "to_base64(CAST(MAX(`${colName}`) as ${type})) "; + return "CAST(MAX(`${colName}`) as ${type}) "; } else { return "NULL"; } @@ -315,7 +315,7 @@ public abstract class BaseAnalysisTask { long startTime = System.currentTimeMillis(); try (AutoCloseConnectContext a = StatisticsUtil.buildConnectContext()) { stmtExecutor = new StmtExecutor(a.connectContext, sql); - ColStatsData colStatsData = new ColStatsData(stmtExecutor.executeInternalQuery().get(0), needEncode); + ColStatsData colStatsData = new ColStatsData(stmtExecutor.executeInternalQuery().get(0)); job.appendBuf(this, Collections.singletonList(colStatsData)); } finally { LOG.debug("End cost time in secs: " + (System.currentTimeMillis() - startTime) / 1000); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java index fa7546abd5b..7878a065488 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColStatsData.java @@ -21,8 +21,6 @@ import org.apache.doris.statistics.util.StatisticsUtil; import com.google.common.annotations.VisibleForTesting; -import java.nio.charset.StandardCharsets; -import java.util.Base64; import java.util.StringJoiner; /** @@ -56,8 +54,6 @@ public class ColStatsData { public final String updateTime; - public final boolean needEncode; - @VisibleForTesting public ColStatsData() { statsId = new StatsId(); @@ -68,10 +64,9 @@ public class ColStatsData { maxLit = null; dataSizeInBytes = 0; updateTime = null; - needEncode = true; } - public ColStatsData(ResultRow row, boolean needEncode) { + public ColStatsData(ResultRow row) { this.statsId = new StatsId(row); this.count = (long) Double.parseDouble(row.get(7)); this.ndv = (long) Double.parseDouble(row.getWithDefault(8, "0")); @@ -80,7 +75,6 @@ public class ColStatsData { this.maxLit = row.get(11); this.dataSizeInBytes = (long) Double.parseDouble(row.getWithDefault(12, "0")); this.updateTime = row.get(13); - this.needEncode = needEncode; } public String toSQL(boolean roundByParentheses) { @@ -93,12 +87,8 @@ public class ColStatsData { sj.add(String.valueOf(count)); sj.add(String.valueOf(ndv)); sj.add(String.valueOf(nullCount)); - sj.add(minLit == null ? "NULL" : needEncode - ? "'" + Base64.getEncoder().encodeToString(minLit.getBytes(StandardCharsets.UTF_8)) + "'" - : "'" + minLit + "'"); - sj.add(maxLit == null ? "NULL" : needEncode - ? "'" + Base64.getEncoder().encodeToString(maxLit.getBytes(StandardCharsets.UTF_8)) + "'" - : "'" + maxLit + "'"); + sj.add(minLit == null ? "NULL" : "'" + StatisticsUtil.escapeSQL(minLit) + "'"); + sj.add(maxLit == null ? "NULL" : "'" + StatisticsUtil.escapeSQL(maxLit) + "'"); sj.add(String.valueOf(dataSizeInBytes)); sj.add(StatisticsUtil.quote(updateTime)); return sj.toString(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java index eefaf7badeb..5ea1f9097bb 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/ColumnStatistic.java @@ -31,8 +31,6 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.json.JSONObject; -import java.nio.charset.StandardCharsets; -import java.util.Base64; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -175,7 +173,6 @@ public class ColumnStatistic { String min = row.get(10); String max = row.get(11); if (min != null && !min.equalsIgnoreCase("NULL")) { - min = new String(Base64.getDecoder().decode(min), StandardCharsets.UTF_8); // Internal catalog get the min/max value using a separate SQL, // and the value is already encoded by base64. Need to handle internal and external catalog separately. if (catalogId != InternalCatalog.INTERNAL_CATALOG_ID && min.equalsIgnoreCase("NULL")) { @@ -193,7 +190,6 @@ public class ColumnStatistic { columnStatisticBuilder.setMinValue(Double.NEGATIVE_INFINITY); } if (max != null && !max.equalsIgnoreCase("NULL")) { - max = new String(Base64.getDecoder().decode(max), StandardCharsets.UTF_8); if (catalogId != InternalCatalog.INTERNAL_CATALOG_ID && max.equalsIgnoreCase("NULL")) { columnStatisticBuilder.setMaxValue(Double.POSITIVE_INFINITY); } else { diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java index 04a763b6884..45ccdbb0d68 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/OlapAnalysisTask.java @@ -92,8 +92,8 @@ public class OlapAnalysisTask extends BaseAnalysisTask { // Get basic stats, including min and max. ResultRow basicStats = collectBasicStat(r); long rowCount = tbl.getRowCount(); - String min = StatisticsUtil.encodeValue(basicStats, 0); - String max = StatisticsUtil.encodeValue(basicStats, 1); + String min = StatisticsUtil.escapeSQL(basicStats.get(0)); + String max = StatisticsUtil.escapeSQL(basicStats.get(1)); boolean limitFlag = false; long rowsToSample = pair.second; diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java index 12ca6b4aa1f..29e11ac75ad 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/StatisticsRepository.java @@ -269,8 +269,8 @@ public class StatisticsRepository { params.put("count", String.valueOf(columnStatistic.count)); params.put("ndv", String.valueOf(columnStatistic.ndv)); params.put("nullCount", String.valueOf(columnStatistic.numNulls)); - params.put("min", StatisticsUtil.encodeString(min)); - params.put("max", StatisticsUtil.encodeString(max)); + params.put("min", StatisticsUtil.escapeSQL(min)); + params.put("max", StatisticsUtil.escapeSQL(max)); params.put("dataSize", String.valueOf(columnStatistic.dataSize)); if (partitionIds.isEmpty()) { diff --git a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java index 66b13ff87f1..3f9abcad5f9 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java +++ b/fe/fe-core/src/main/java/org/apache/doris/statistics/util/StatisticsUtil.java @@ -786,7 +786,9 @@ public class StatisticsUtil { if (str == null) { return null; } - return org.apache.commons.lang3.StringUtils.replace(str, "'", "''"); + return str.replace("'", "''") + .replace("\\", "\\\\") + .replace("\"", "\"\""); } public static boolean isExternalTable(String catalogName, String dbName, String tblName) { diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java index fe81c055e0d..187c4d207df 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/BaseAnalysisTaskTest.java @@ -42,14 +42,14 @@ public class BaseAnalysisTaskTest { Assertions.assertEquals("SUM(t1.count) * 4", dataSizeFunction); String minFunction = olapAnalysisTask.getMinFunction(); - Assertions.assertEquals("to_base64(CAST(MIN(`${colName}`) as ${type})) ", minFunction); + Assertions.assertEquals("CAST(MIN(`${colName}`) as ${type}) ", minFunction); olapAnalysisTask.tableSample = new TableSample(true, 20L); minFunction = olapAnalysisTask.getMinFunction(); Assertions.assertEquals("NULL", minFunction); olapAnalysisTask.tableSample = null; String maxFunction = olapAnalysisTask.getMaxFunction(); - Assertions.assertEquals("to_base64(CAST(MAX(`${colName}`) as ${type})) ", maxFunction); + Assertions.assertEquals("CAST(MAX(`${colName}`) as ${type}) ", maxFunction); olapAnalysisTask.tableSample = new TableSample(true, 20L); maxFunction = olapAnalysisTask.getMaxFunction(); Assertions.assertEquals("NULL", maxFunction); diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java index c174795b36b..7b7894e54b4 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/OlapAnalysisTaskTest.java @@ -151,7 +151,7 @@ public class OlapAnalysisTaskTest { @Mock public void runQuery(String sql, boolean needEncode) { Assertions.assertFalse(needEncode); - Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, 'MQ==' AS `min`, 'M [...] + Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, '1' AS `min`, '2' A [...] return; } }; @@ -218,7 +218,7 @@ public class OlapAnalysisTaskTest { @Mock public void runQuery(String sql, boolean needEncode) { Assertions.assertFalse(needEncode); - Assertions.assertEquals(" SELECT CONCAT(30001, '-', -1, '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, ROUND(NDV(`${colName}`) * 5.0) as `ndv`, ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 5.0) AS `null_count`, 'MQ==' AS `min`, 'Mg==' AS `max`, SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() FROM `catalogName`.`${dbName}`.`${tblName}` limit [...] + Assertions.assertEquals(" SELECT CONCAT(30001, '-', -1, '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, ROUND(NDV(`${colName}`) * 5.0) as `ndv`, ROUND(SUM(CASE WHEN `${colName}` IS NULL THEN 1 ELSE 0 END) * 5.0) AS `null_count`, '1' AS `min`, '2' AS `max`, SUM(LENGTH(`${colName}`)) * 5.0 AS `data_size`, NOW() FROM `catalogName`.`${dbName}`.`${tblName}` limit 100", sql); return; } }; @@ -292,7 +292,7 @@ public class OlapAnalysisTaskTest { @Mock public void runQuery(String sql, boolean needEncode) { Assertions.assertFalse(needEncode); - Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, 'MQ==' AS `min`, 'M [...] + Assertions.assertEquals("SELECT CONCAT('30001', '-', '-1', '-', 'null') AS `id`, 10001 AS `catalog_id`, 20001 AS `db_id`, 30001 AS `tbl_id`, -1 AS `idx_id`, 'null' AS `col_id`, NULL AS `part_id`, 500 AS `row_count`, SUM(`t1`.`count`) * COUNT(1) / (SUM(`t1`.`count`) - SUM(IF(`t1`.`count` = 1, 1, 0)) + SUM(IF(`t1`.`count` = 1, 1, 0)) * SUM(`t1`.`count`) / 500) as `ndv`, IFNULL(SUM(IF(`t1`.`column_key` IS NULL, `t1`.`count`, 0)), 0) * 5.0 as `null_count`, '1' AS `min`, '2' A [...] return; } }; diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java index 84e1112d216..45c78f665b0 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/StatsMockUtil.java @@ -40,7 +40,7 @@ public class StatsMockUtil { add("0"); add("10"); // 11 - add("MTE="); + add("11"); add("12"); add(String.valueOf(System.currentTimeMillis())); }}; diff --git a/fe/fe-core/src/test/java/org/apache/doris/statistics/util/StatisticsUtilTest.java b/fe/fe-core/src/test/java/org/apache/doris/statistics/util/StatisticsUtilTest.java index 2c0854dcf21..c827a7d1690 100644 --- a/fe/fe-core/src/test/java/org/apache/doris/statistics/util/StatisticsUtilTest.java +++ b/fe/fe-core/src/test/java/org/apache/doris/statistics/util/StatisticsUtilTest.java @@ -142,4 +142,12 @@ public class StatisticsUtilTest { .encodeToString("a".getBytes(StandardCharsets.UTF_8)), StatisticsUtil.encodeValue(row, 1)); Assertions.assertEquals("NULL", StatisticsUtil.encodeValue(row, 2)); } + + @Test + public void testEscape() { + // \'" + String origin = "\\'\""; + // \\''"" + Assertions.assertEquals("\\\\''\"\"", StatisticsUtil.escapeSQL(origin)); + } } --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org