This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push:
     new 30d14b66231 [enhancement](hive)Initial support for Hive org.openx.data.jsonserde.JsonSerDe (#49209)
30d14b66231 is described below

commit 30d14b66231df36ee19e9d28713f88721235d8a9
Author: daidai <changyu...@selectdb.com>
AuthorDate: Wed Apr 9 11:57:19 2025 +0800

    [enhancement](hive)Initial support for Hive org.openx.data.jsonserde.JsonSerDe (#49209)

    ### What problem does this PR solve?

    Problem Summary:
    Initial support for Hive `org.openx.data.jsonserde.JsonSerDe`
    (https://github.com/rcongiu/Hive-JSON-Serde). The read behavior is similar to
    PR #43469. Referring to the description in that link:

    Supported:
    1. Querying complex fields
    2. Importing malformed data (serde property: ignore.malformed.json)

    Not supported; these properties do not affect query results:
    1. dots.in.keys
    2. Case sensitivity in mappings
    3. Mapping Hive keywords

    Not supported; an error is reported:
    1. Using arrays
    2. Promoting a scalar to an array
    Error: [DATA_QUALITY_ERROR]JSON data is array-object, `strip_outer_array` must be TRUE.

    To let users handle JSON strings that cannot be parsed, this PR introduces the
    session variable `read_hive_json_in_one_column` (default: false). When it is true,
    a whole line of JSON is read into the first column, and users can process the raw
    line themselves, for example with JSON_PARSE (see the usage sketch at the end of
    this message). The first column of the table must be of type string. Currently this
    only applies to org.openx.data.jsonserde.JsonSerDe.
---
 be/src/vec/exec/format/json/new_json_reader.cpp    | 33 +++++++-
 be/src/vec/exec/format/json/new_json_reader.h      |  8 +-
 .../hive/scripts/auxlib/json-serde-1.3.9.tar.gz    | Bin 0 -> 78992 bytes
 .../scripts/create_preinstalled_scripts/run76.hql  | 56 ++++++++++++++
 .../docker-compose/hive/scripts/hive-metastore.sh  | 11 +++
 .../json/openx_json/json_data_arrays_tb/1          |  2 +
 .../json/openx_json/json_one_column_table/1        |  5 ++
 .../preinstalled_data/json/openx_json/json_table/1 |  2 +
 .../preinstalled_data/json/openx_json/json_table/2 | 11 +++
 .../json/openx_json/scalar_to_array_tb/1           |  1 +
 .../doris/datasource/hive/HMSExternalTable.java    |  8 ++
 .../datasource/hive/HiveMetaStoreClientHelper.java |  1 +
 .../doris/datasource/hive/HiveProperties.java      | 12 +++
 .../doris/datasource/hive/source/HiveScanNode.java | 48 +++++++++++-
 .../java/org/apache/doris/qe/SessionVariable.java  | 21 ++++++
 gensrc/thrift/PlanNodes.thrift                     |  3 +
 .../hive/test_hive_openx_json.out                  | Bin 0 -> 709 bytes
 .../hive/test_hive_openx_json.groovy               | 84 +++++++++++++++++++++
 18 files changed, 297 insertions(+), 9 deletions(-)

diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp b/be/src/vec/exec/format/json/new_json_reader.cpp
index 5820174e7fa..a125654eada 100644
--- a/be/src/vec/exec/format/json/new_json_reader.cpp
+++ b/be/src/vec/exec/format/json/new_json_reader.cpp
@@ -399,6 +399,20 @@ Status NewJsonReader::_get_range_params() {
     if (_range.table_format_params.table_format_type == "hive") {
         _is_hive_table = true;
     }
+    if (_params.file_attributes.__isset.openx_json_ignore_malformed) {
+        _openx_json_ignore_malformed = _params.file_attributes.openx_json_ignore_malformed;
+    }
+    return Status::OK();
+}
+
+static Status ignore_malformed_json_append_null(Block& block) {
+    for (auto& column : block.get_columns()) {
+        if (!column->is_nullable()) [[unlikely]] {
+            return Status::DataQualityError("malformed json, but the column `{}` is not nullable.",
+                                            column->get_name());
+        }
+
static_cast<ColumnNullable*>(column->assume_mutable().get())->insert_default(); + } return Status::OK(); } @@ -486,8 +500,13 @@ Status NewJsonReader::_vhandle_simple_json(RuntimeState* /*state*/, Block& block bool valid = false; if (_next_row >= _total_rows) { // parse json and generic document Status st = _parse_json(is_empty_row, eof); - if (_is_load && st.is<DATA_QUALITY_ERROR>()) { - continue; // continue to read next (for load, after this , already append error to file.) + if (st.is<DATA_QUALITY_ERROR>()) { + if (_is_load) { + continue; // continue to read next (for load, after this , already append error to file.) + } else if (_openx_json_ignore_malformed) { + RETURN_IF_ERROR(ignore_malformed_json_append_null(block)); + continue; + } } RETURN_IF_ERROR(st); if (*is_empty_row) { @@ -1296,9 +1315,15 @@ Status NewJsonReader::_simdjson_handle_simple_json(RuntimeState* /*state*/, Bloc // step2: get json value by json doc Status st = _get_json_value(&size, eof, &error, is_empty_row); - if (_is_load && st.is<DATA_QUALITY_ERROR>()) { - return Status::OK(); + if (st.is<DATA_QUALITY_ERROR>()) { + if (_is_load) { + return Status::OK(); + } else if (_openx_json_ignore_malformed) { + RETURN_IF_ERROR(ignore_malformed_json_append_null(block)); + return Status::OK(); + } } + RETURN_IF_ERROR(st); if (*is_empty_row || *eof) { return Status::OK(); diff --git a/be/src/vec/exec/format/json/new_json_reader.h b/be/src/vec/exec/format/json/new_json_reader.h index 430f8c7af18..6b42ca23b4f 100644 --- a/be/src/vec/exec/format/json/new_json_reader.h +++ b/be/src/vec/exec/format/json/new_json_reader.h @@ -293,18 +293,22 @@ private: int32_t skip_bitmap_col_idx {-1}; - bool _is_load = true; //Used to indicate whether it is a stream load. When loading, only data will be inserted into columnString. //If an illegal value is encountered during the load process, `_append_error_msg` should be called //instead of directly returning `Status::DataQualityError` + bool _is_load = true; - bool _is_hive_table = false; // In hive : create table xxx ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'; // Hive will not allow you to create columns with the same name but different case, including field names inside // structs, and will automatically convert uppercase names in create sql to lowercase.However, when Hive loads data // to table, the column names in the data may be uppercase,and there may be multiple columns with // the same name but different capitalization.We refer to the behavior of hive, convert all column names // in the data to lowercase,and use the last one as the insertion value + bool _is_hive_table = false; + + // hive : org.openx.data.jsonserde.JsonSerDe, `ignore.malformed.json` prop. + // If the variable is true, `null` will be inserted for llegal json format instead of return error. 
+ bool _openx_json_ignore_malformed = false; DataTypeSerDeSPtrs _serdes; vectorized::DataTypeSerDe::FormatOptions _serde_options; diff --git a/docker/thirdparties/docker-compose/hive/scripts/auxlib/json-serde-1.3.9.tar.gz b/docker/thirdparties/docker-compose/hive/scripts/auxlib/json-serde-1.3.9.tar.gz new file mode 100644 index 00000000000..1eb63aa7727 Binary files /dev/null and b/docker/thirdparties/docker-compose/hive/scripts/auxlib/json-serde-1.3.9.tar.gz differ diff --git a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run76.hql b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run76.hql new file mode 100644 index 00000000000..1f83f932445 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run76.hql @@ -0,0 +1,56 @@ +create database if not exists openx_json; +use openx_json; + + +CREATE TABLE IF NOT EXISTS json_table ( + id INT, + name STRING, + numbers ARRAY<INT>, + scores MAP<STRING, INT>, + details STRUCT<a:INT, b:STRING, c:BIGINT> +) +ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe' +LOCATION '/user/doris/preinstalled_data/json/openx_json/json_table'; + + +CREATE TABLE IF NOT EXISTS json_table_ignore_malformed ( + id INT, + name STRING, + numbers ARRAY<INT>, + scores MAP<STRING, INT>, + details STRUCT<a:INT, b:STRING, c:BIGINT> +) +ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe' +WITH SERDEPROPERTIES ("ignore.malformed.json" = "true" ) +LOCATION '/user/doris/preinstalled_data/json/openx_json/json_table'; + + +CREATE TABLE json_data_arrays_tb ( + name string, age int) +ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe' +LOCATION '/user/doris/preinstalled_data/json/openx_json/json_data_arrays_tb'; + + +CREATE TABLE IF NOT EXISTS scalar_to_array_tb( + id INT, + name STRING, + tags ARRAY<STRING> +)ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe' +LOCATION '/user/doris/preinstalled_data/json/openx_json/scalar_to_array_tb'; + + +CREATE TABLE IF NOT EXISTS json_one_column_table ( + name STRING, + id INT, + numbers ARRAY<INT>, + scores MAP<STRING, INT>, + details STRUCT<a:INT, b:STRING, c:BIGINT> +) +ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe' +LOCATION '/user/doris/preinstalled_data/json/openx_json/json_one_column_table'; + +msck repair table json_table; +msck repair table json_table_ignore_malformed; +msck repair table json_data_arrays_tb; +msck repair table scalar_to_array_tb; +msck repair table json_one_column_table; \ No newline at end of file diff --git a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh index ac4c9ae4480..04ddf42c70c 100755 --- a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh +++ b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh @@ -20,6 +20,17 @@ set -e -x parallel=$(getconf _NPROCESSORS_ONLN) + + +AUX_LIB="/mnt/scripts/auxlib" +for file in "${AUX_LIB}"/*.tar.gz; do + [ -e "$file" ] || continue + tar -xzvf "$file" -C "$AUX_LIB" + echo "file = ${file}" +done +ls "${AUX_LIB}/" +mv "${AUX_LIB}"/ /opt/hive + nohup /opt/hive/bin/hive --service metastore & # wait lockfile diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_data_arrays_tb/1 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_data_arrays_tb/1 new file mode 100644 index 00000000000..098bb346b50 --- /dev/null +++ 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_data_arrays_tb/1 @@ -0,0 +1,2 @@ +["John", 26 ] +["Mary", 23 ] \ No newline at end of file diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_one_column_table/1 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_one_column_table/1 new file mode 100644 index 00000000000..d396f66a079 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_one_column_table/1 @@ -0,0 +1,5 @@ + +{"name":"bad1","id":5,"numbers":[1,2,3] +[1,2,3] +"just a string" +{"name":"bad2","id":6,"numbers":"not an array","scores":{"key4":40},"details":{"a":4,"b":"text","c":4000000}} diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_table/1 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_table/1 new file mode 100644 index 00000000000..11a3edf6e80 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_table/1 @@ -0,0 +1,2 @@ +{"id": 1, "name": "Alice", "numbers": [1, 2, 3], "scores": {"math": 90, "english": 85}, "details": {"a": 100, "b": "test1", "c": 1234567890}} +{"id": 2, "name": "Bob", "numbers": [4, 5], "scores": {"math": 80, "science": 95}, "details": {"a": 200, "b": "test2", "c": 9876543210}} \ No newline at end of file diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_table/2 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_table/2 new file mode 100644 index 00000000000..e77c1f49d85 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_table/2 @@ -0,0 +1,11 @@ +{"id" 3 "name" "Bob", +"numbers": [ + 4 5 +], +"scores": { +"math": 80 +}, +"details" +: { +"a": 200 , "b" } "test2", "c": 9876543210 +}} \ No newline at end of file diff --git a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/scalar_to_array_tb/1 b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/scalar_to_array_tb/1 new file mode 100644 index 00000000000..24a9acc63a3 --- /dev/null +++ b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/scalar_to_array_tb/1 @@ -0,0 +1 @@ +{"name":"Charlie","id":4,"tags":"flink"} \ No newline at end of file diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java index dfd7d6fe4b8..33c945272e7 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java @@ -1084,6 +1084,14 @@ public class HMSExternalTable extends ExternalTable implements MTMVRelatedTableI public void beforeMTMVRefresh(MTMV mtmv) throws DdlException { } + public boolean firstColumnIsString() { + List<Column> columns = getColumns(); + if (columns == null || columns.isEmpty()) { + return false; + } + return columns.get(0).getType().isScalarType(PrimitiveType.STRING); + } + public HoodieTableMetaClient getHudiClient() { return Env.getCurrentEnv() .getExtMetaCacheMgr() diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java index 9bb09225607..d98bf8227e1 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java @@ -95,6 +95,7 @@ public class HiveMetaStoreClientHelper { public static final String HIVE_JSON_SERDE = "org.apache.hive.hcatalog.data.JsonSerDe"; public static final String LEGACY_HIVE_JSON_SERDE = "org.apache.hadoop.hive.serde2.JsonSerDe"; + public static final String OPENX_JSON_SERDE = "org.openx.data.jsonserde.JsonSerDe"; public enum HiveFileFormat { TEXT_FILE(0, "text"), diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java index 74f3dcc1a9d..b06ea772fab 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java @@ -55,6 +55,11 @@ public class HiveProperties { public static final String PROP_ESCAPE_CHAR = OpenCSVSerde.ESCAPECHAR; public static final String DEFAULT_ESCAPE_CHAR = "\\"; + // org.openx.data.jsonserde.JsonSerDe + public static final String PROP_OPENX_IGNORE_MALFORMED_JSON = "ignore.malformed.json"; + public static final String DEFAULT_OPENX_IGNORE_MALFORMED_JSON = "false"; + + public static final Set<String> HIVE_SERDE_PROPERTIES = ImmutableSet.of( PROP_FIELD_DELIMITER, PROP_COLLECTION_DELIMITER_HIVE2, @@ -131,6 +136,13 @@ public class HiveProperties { return HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_ESCAPE_CHAR, escapeChar); } + public static String getOpenxJsonIgnoreMalformed(Table table) { + Optional<String> escapeChar = HiveMetaStoreClientHelper.getSerdeProperty(table, + PROP_OPENX_IGNORE_MALFORMED_JSON); + return HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_OPENX_IGNORE_MALFORMED_JSON, escapeChar); + } + + // Set properties to table public static void setTableProperties(Table table, Map<String, String> properties) { HashMap<String, String> serdeProps = new HashMap<>(); diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java index 890f6147f33..c2503789580 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java +++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java @@ -125,7 +125,7 @@ public class HiveScanNode extends FileQueryScanNode { this.hiveTransaction = new HiveTransaction(DebugUtil.printId(ConnectContext.get().queryId()), ConnectContext.get().getQualifiedUser(), hmsTable, hmsTable.isFullAcidTable()); Env.getCurrentHiveTransactionMgr().register(hiveTransaction); - skipCheckingAcidVersionFile = ConnectContext.get().getSessionVariable().skipCheckingAcidVersionFile; + skipCheckingAcidVersionFile = sessionVariable.skipCheckingAcidVersionFile; } } @@ -413,6 +413,17 @@ public class HiveScanNode extends FileQueryScanNode { if (serDeLib.equals(HiveMetaStoreClientHelper.HIVE_JSON_SERDE) || serDeLib.equals(HiveMetaStoreClientHelper.LEGACY_HIVE_JSON_SERDE)) { type = TFileFormatType.FORMAT_JSON; + } else if (serDeLib.equals(HiveMetaStoreClientHelper.OPENX_JSON_SERDE)) { + if (!sessionVariable.isReadHiveJsonInOneColumn()) { + type = TFileFormatType.FORMAT_JSON; + } else if 
(sessionVariable.isReadHiveJsonInOneColumn() + && hmsTable.firstColumnIsString()) { + type = TFileFormatType.FORMAT_CSV_PLAIN; + } else { + throw new UserException("You set read_hive_json_in_one_column = true, but the first column of " + + "table " + hmsTable.getName() + + " is not a string column."); + } } else { type = TFileFormatType.FORMAT_CSV_PLAIN; } @@ -449,7 +460,7 @@ public class HiveScanNode extends FileQueryScanNode { fileAttributes.setTextParams(textParams); fileAttributes.setHeaderType(""); fileAttributes.setEnableTextValidateUtf8( - ConnectContext.get().getSessionVariable().enableTextValidateUtf8); + sessionVariable.enableTextValidateUtf8); } else if (serDeLib.equals("org.apache.hadoop.hive.serde2.OpenCSVSerde")) { TFileTextScanRangeParams textParams = new TFileTextScanRangeParams(); // set set properties of OpenCSVSerde @@ -467,7 +478,7 @@ public class HiveScanNode extends FileQueryScanNode { fileAttributes.setTrimDoubleQuotes(true); } fileAttributes.setEnableTextValidateUtf8( - ConnectContext.get().getSessionVariable().enableTextValidateUtf8); + sessionVariable.enableTextValidateUtf8); } else if (serDeLib.equals("org.apache.hive.hcatalog.data.JsonSerDe")) { TFileTextScanRangeParams textParams = new TFileTextScanRangeParams(); textParams.setColumnSeparator("\t"); @@ -481,6 +492,37 @@ public class HiveScanNode extends FileQueryScanNode { fileAttributes.setReadJsonByLine(true); fileAttributes.setStripOuterArray(false); fileAttributes.setHeaderType(""); + } else if (serDeLib.equals("org.openx.data.jsonserde.JsonSerDe")) { + if (!sessionVariable.isReadHiveJsonInOneColumn()) { + TFileTextScanRangeParams textParams = new TFileTextScanRangeParams(); + textParams.setColumnSeparator("\t"); + textParams.setLineDelimiter("\n"); + fileAttributes.setTextParams(textParams); + + fileAttributes.setJsonpaths(""); + fileAttributes.setJsonRoot(""); + fileAttributes.setNumAsString(true); + fileAttributes.setFuzzyParse(false); + fileAttributes.setReadJsonByLine(true); + fileAttributes.setStripOuterArray(false); + fileAttributes.setHeaderType(""); + + fileAttributes.setOpenxJsonIgnoreMalformed( + Boolean.parseBoolean(HiveProperties.getOpenxJsonIgnoreMalformed(table))); + } else if (sessionVariable.isReadHiveJsonInOneColumn() + && hmsTable.firstColumnIsString()) { + TFileTextScanRangeParams textParams = new TFileTextScanRangeParams(); + textParams.setLineDelimiter("\n"); + textParams.setColumnSeparator("\n"); + //First, perform row splitting according to `\n`. When performing column splitting, + // since there is no `\n`, only one column of data will be generated. 
+ fileAttributes.setTextParams(textParams); + fileAttributes.setHeaderType(""); + } else { + throw new UserException("You set read_hive_json_in_one_column = true, but the first column of table " + + hmsTable.getName() + + " is not a string column."); + } } else { throw new UserException( "unsupported hive table serde: " + serDeLib); diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java index 86c47de612f..48ac010e830 100644 --- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java +++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java @@ -708,6 +708,8 @@ public class SessionVariable implements Serializable, Writable { "enable_cooldown_replica_affinity"; public static final String SKIP_CHECKING_ACID_VERSION_FILE = "skip_checking_acid_version_file"; + public static final String READ_HIVE_JSON_IN_ONE_COLUMN = "read_hive_json_in_one_column"; + /** * Inserting overwrite for auto partition table allows creating partition for * datas which cannot find partition to overwrite. @@ -1220,6 +1222,17 @@ public class SessionVariable implements Serializable, Writable { @VariableMgr.VarAttr(name = PARALLEL_PREPARE_THRESHOLD, fuzzy = true) public int parallelPrepareThreshold = 32; + @VariableMgr.VarAttr(name = READ_HIVE_JSON_IN_ONE_COLUMN, + description = {"在读取hive json的时候,由于存在一些不支持的json格式,我们默认会报错。为了让用户使用体验更好," + + "当该变量为true的时候,将一整行json读取到第一列中,用户可以自行选择对一整行json进行处理,例如JSON_PARSE。" + + "需要表的第一列的数据类型为string.", + "When reading hive json, we will report an error by default because there are some unsupported " + + "json formats. In order to provide users with a better experience, when this variable is true," + + "a whole line of json is read into the first column. Users can choose to process a whole line" + + "of json, such as JSON_PARSE. 
The data type of the first column of the table needs to" + + "be string."}) + private boolean readHiveJsonInOneColumn = false; + @VariableMgr.VarAttr(name = ENABLE_COST_BASED_JOIN_REORDER) private boolean enableJoinReorderBasedCost = false; @@ -3771,6 +3784,14 @@ public class SessionVariable implements Serializable, Writable { this.keepCarriageReturn = keepCarriageReturn; } + public boolean isReadHiveJsonInOneColumn() { + return readHiveJsonInOneColumn; + } + + public void setReadHiveJsonInOneColumn(boolean readHiveJsonInOneColumn) { + this.readHiveJsonInOneColumn = readHiveJsonInOneColumn; + } + public boolean isDropTableIfCtasFailed() { return dropTableIfCtasFailed; } diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift index 407c77fc340..3165a6ac764 100644 --- a/gensrc/thrift/PlanNodes.thrift +++ b/gensrc/thrift/PlanNodes.thrift @@ -271,6 +271,9 @@ struct TFileAttributes { 11: optional i32 skip_lines; //For text type file reading, whether to enable utf8 encoding check.(Catalog && TVF) 12: optional bool enable_text_validate_utf8 = true; + // org.openx.data.jsonserde.JsonSerDe + 13: optional bool openx_json_ignore_malformed = false; + // for cloud copy into 1001: optional bool ignore_csv_redundant_col; } diff --git a/regression-test/data/external_table_p0/hive/test_hive_openx_json.out b/regression-test/data/external_table_p0/hive/test_hive_openx_json.out new file mode 100644 index 00000000000..6eadea56694 Binary files /dev/null and b/regression-test/data/external_table_p0/hive/test_hive_openx_json.out differ diff --git a/regression-test/suites/external_table_p0/hive/test_hive_openx_json.groovy b/regression-test/suites/external_table_p0/hive/test_hive_openx_json.groovy new file mode 100644 index 00000000000..b9698809c4d --- /dev/null +++ b/regression-test/suites/external_table_p0/hive/test_hive_openx_json.groovy @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +suite("test_hive_openx_json", "p0,external,hive,external_docker,external_docker_hive") { + + + String enabled = context.config.otherConfigs.get("enableHiveTest") + if (enabled == null || !enabled.equalsIgnoreCase("true")) { + logger.info("diable Hive test.") + return; + } + + for (String hivePrefix : ["hive3"]) { + try { + sql """set enable_fallback_to_original_planner=false""" + String externalEnvIp = context.config.otherConfigs.get("externalEnvIp") + String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort") + String catalog_name = "${hivePrefix}_test_hive_openx_json" + String broker_name = "hdfs" + + sql """drop catalog if exists ${catalog_name}""" + sql """create catalog if not exists ${catalog_name} properties ( + 'type'='hms', + 'hive.metastore.uris'='thrift://${externalEnvIp}:${hms_port}' + );""" + sql """use `${catalog_name}`.`openx_json`""" + + try { + sql """ select * from json_table """; + } catch (Exception e) { + log.info(e.getMessage()) + assertTrue(e.getMessage().contains("DATA_QUALITY_ERROR")) + } + + order_qt_q1 """ select * from json_table_ignore_malformed """ + + + try{ + sql """ select * from json_data_arrays_tb """; + } catch (Exception e) { + log.info(e.getMessage()) + assertTrue(e.getMessage().contains("DATA_QUALITY_ERROR")) + } + + + try{ + sql """ select * from scalar_to_array_tb """; + } catch (Exception e) { + log.info(e.getMessage()) + assertTrue(e.getMessage().contains("DATA_QUALITY_ERROR")) + } + + sql """ set read_hive_json_in_one_column = true; """ + + order_qt_2 """ select * from json_data_arrays_tb """ + order_qt_3 """ select * from json_one_column_table """ + + try{ + sql """ select * from scalar_to_array_tb """; + } catch (Exception e) { + log.info(e.getMessage()) + assertTrue(e.getMessage().contains("is not a string column.")) + } + + + sql """drop catalog if exists ${catalog_name}""" + } finally { + } + } +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org