This is an automated email from the ASF dual-hosted git repository. morningman pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new b0c5250bf9 [Enhancement](tvf) support trim_double_quotes and skip_lines for S3 and HDFS table valued function (#17224) b0c5250bf9 is described below commit b0c5250bf9624f0d6f5438e405462530157fdffb Author: gitccl <60637740+git...@users.noreply.github.com> AuthorDate: Wed Mar 1 23:41:31 2023 +0800 [Enhancement](tvf) support trim_double_quotes and skip_lines for S3 and HDFS table valued function (#17224) support trim_double_quotes and skip_lines for S3 and HDFS table valued function --- docs/en/docs/sql-manual/sql-functions/table-functions/hdfs.md | 5 +++++ docs/en/docs/sql-manual/sql-functions/table-functions/s3.md | 5 +++++ .../docs/sql-manual/sql-functions/table-functions/hdfs.md | 5 +++++ .../zh-CN/docs/sql-manual/sql-functions/table-functions/s3.md | 5 +++++ .../doris/tablefunction/ExternalFileTableValuedFunction.java | 11 ++++++++++- 5 files changed, 30 insertions(+), 1 deletion(-) diff --git a/docs/en/docs/sql-manual/sql-functions/table-functions/hdfs.md b/docs/en/docs/sql-manual/sql-functions/table-functions/hdfs.md index 7742f676f4..8c671f7d14 100644 --- a/docs/en/docs/sql-manual/sql-functions/table-functions/hdfs.md +++ b/docs/en/docs/sql-manual/sql-functions/table-functions/hdfs.md @@ -76,6 +76,11 @@ File format parameters: - `num_as_string`: (optional) default `false` - `fuzzy_parse`: (optional) default `false` + <version since="dev">The following 2 parameters are used for loading in csv format</version> + +- `trim_double_quotes`: Boolean type (optional), the default value is `false`. True means that the outermost double quotes of each field in the csv file are trimmed. +- `skip_lines`: Integer type (optional), the default value is 0. It specifies the number of lines to skip at the head of the csv file. It has no effect when the format is `csv_with_names` or `csv_with_names_and_types`. + ### Examples Read and access csv format files on hdfs storage. 
diff --git a/docs/en/docs/sql-manual/sql-functions/table-functions/s3.md b/docs/en/docs/sql-manual/sql-functions/table-functions/s3.md index 488a91020c..5e716d6980 100644 --- a/docs/en/docs/sql-manual/sql-functions/table-functions/s3.md +++ b/docs/en/docs/sql-manual/sql-functions/table-functions/s3.md @@ -78,6 +78,11 @@ file format parameter: - `num_as_string`: (optional) default `"false"` - `fuzzy_parse`: (optional) default `"false"` + <version since="dev">The following 2 parameters are used for loading in csv format</version> + +- `trim_double_quotes`: Boolean type (optional), the default value is `false`. True means that the outermost double quotes of each field in the csv file are trimmed. +- `skip_lines`: Integer type (optional), the default value is 0. It specifies the number of lines to skip at the head of the csv file. It has no effect when the format is `csv_with_names` or `csv_with_names_and_types`. + ### Example Read and access csv format files on S3-compatible object storage. diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/table-functions/hdfs.md b/docs/zh-CN/docs/sql-manual/sql-functions/table-functions/hdfs.md index 5ac5061240..3929579e42 100644 --- a/docs/zh-CN/docs/sql-manual/sql-functions/table-functions/hdfs.md +++ b/docs/zh-CN/docs/sql-manual/sql-functions/table-functions/hdfs.md @@ -78,6 +78,11 @@ hdfs( - `num_as_string`: (选填) 默认为 `false` - `fuzzy_parse`: (选填) 默认为 `false` + <version since="dev">下面2个参数是用于csv格式的导入</version> + +- `trim_double_quotes`: 布尔类型,选填,默认值为 `false`,为 `true` 时表示裁剪掉 csv 文件每个字段最外层的双引号 +- `skip_lines`: 整数类型,选填,默认值为0,含义为跳过csv文件的前几行。当format设置为 `csv_with_names` 或 `csv_with_names_and_types` 时,该参数会失效 + ### Examples 读取并访问 HDFS 存储上的csv格式文件 diff --git a/docs/zh-CN/docs/sql-manual/sql-functions/table-functions/s3.md b/docs/zh-CN/docs/sql-manual/sql-functions/table-functions/s3.md index 542bfeb8b4..6854c92f6a 100644 --- a/docs/zh-CN/docs/sql-manual/sql-functions/table-functions/s3.md +++ 
b/docs/zh-CN/docs/sql-manual/sql-functions/table-functions/s3.md @@ -80,6 +80,11 @@ S3 tvf中的每一个参数都是一个 `"key"="value"` 对。 - `num_as_string`: (选填) 默认为 `false` - `fuzzy_parse`: (选填) 默认为 `false` + <version since="dev">下面2个参数是用于csv格式的导入</version> + +- `trim_double_quotes`: 布尔类型,选填,默认值为 `false`,为 `true` 时表示裁剪掉 csv 文件每个字段最外层的双引号 +- `skip_lines`: 整数类型,选填,默认值为0,含义为跳过csv文件的前几行。当format设置为 `csv_with_names` 或 `csv_with_names_and_types` 时,该参数会失效 + ### Example 读取并访问 S3 兼容的对象存储上的csv格式文件 diff --git a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java index adfbeceafb..3cc34f34e7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java +++ b/fe/fe-core/src/main/java/org/apache/doris/tablefunction/ExternalFileTableValuedFunction.java @@ -80,6 +80,8 @@ public abstract class ExternalFileTableValuedFunction extends TableValuedFunctio protected static final String READ_JSON_BY_LINE = "read_json_by_line"; protected static final String NUM_AS_STRING = "num_as_string"; protected static final String FUZZY_PARSE = "fuzzy_parse"; + protected static final String TRIM_DOUBLE_QUOTES = "trim_double_quotes"; + protected static final String SKIP_LINES = "skip_lines"; protected static final ImmutableSet<String> FILE_FORMAT_PROPERTIES = new ImmutableSet.Builder<String>() .add(FORMAT) @@ -91,6 +93,8 @@ public abstract class ExternalFileTableValuedFunction extends TableValuedFunctio .add(FUZZY_PARSE) .add(COLUMN_SEPARATOR) .add(LINE_DELIMITER) + .add(TRIM_DOUBLE_QUOTES) + .add(SKIP_LINES) .build(); @@ -109,7 +113,8 @@ public abstract class ExternalFileTableValuedFunction extends TableValuedFunctio private boolean readJsonByLine; private boolean numAsString; private boolean fuzzyParse; - + private boolean trimDoubleQuotes; + private int skipLines; public abstract TFileType getTFileType(); @@ -180,6 +185,8 @@ public 
abstract class ExternalFileTableValuedFunction extends TableValuedFunctio stripOuterArray = Boolean.valueOf(validParams.get(STRIP_OUTER_ARRAY)).booleanValue(); numAsString = Boolean.valueOf(validParams.get(NUM_AS_STRING)).booleanValue(); fuzzyParse = Boolean.valueOf(validParams.get(FUZZY_PARSE)).booleanValue(); + trimDoubleQuotes = Boolean.valueOf(validParams.get(TRIM_DOUBLE_QUOTES)).booleanValue(); + skipLines = Integer.valueOf(validParams.getOrDefault(SKIP_LINES, "0")).intValue(); } public List<TBrokerFileStatus> getFileStatuses() { @@ -194,6 +201,8 @@ public abstract class ExternalFileTableValuedFunction extends TableValuedFunctio fileAttributes.setTextParams(fileTextScanRangeParams); if (this.fileFormatType == TFileFormatType.FORMAT_CSV_PLAIN) { fileAttributes.setHeaderType(this.headerType); + fileAttributes.setTrimDoubleQuotes(trimDoubleQuotes); + fileAttributes.setSkipLines(skipLines); } else if (this.fileFormatType == TFileFormatType.FORMAT_JSON) { fileAttributes.setJsonRoot(jsonRoot); fileAttributes.setJsonpaths(jsonPaths); --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org