This is an automated email from the ASF dual-hosted git repository.

morningman pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 30d14b66231 [enhancement](hive)Initial support for Hive 
org.openx.data.jsonserde.JsonSerDe (#49209)
30d14b66231 is described below

commit 30d14b66231df36ee19e9d28713f88721235d8a9
Author: daidai <changyu...@selectdb.com>
AuthorDate: Wed Apr 9 11:57:19 2025 +0800

    [enhancement](hive)Initial support for Hive 
org.openx.data.jsonserde.JsonSerDe (#49209)
    
    ### What problem does this PR solve?
    
    Problem Summary:
    Initial support for Hive
    
`org.openx.data.jsonserde.JsonSerDe`(https://github.com/rcongiu/Hive-JSON-Serde).
    The read behavior is similar to that of PR #43469.
    
    By referring to the description in the link, here are some explanations:
    Support:
    1. Querying Complex Fields
    2. Importing Malformed Data (serde prop: ignore.malformed.json)
    
    Not supported; these options will not affect the query results:
    1. dots.in.keys
    2. Case Sensitivity in mappings
    3. Mapping Hive Keywords
    
    Not supported, but will report an error:
    1. Using Arrays
    2. Promoting a Scalar to an Array
    error : [DATA_QUALITY_ERROR]JSON data is array-object,
    `strip_outer_array` must be TRUE.
    
    In order to allow some json strings that do not support parsing to be
    processed by users, a session variable is introduced:
    `read_hive_json_in_one_column` (default is false). When this variable is
    true, a whole line of json is read into the first column, and users can
    choose to process a whole line of json, such as JSON_PARSE. The data
    type of the first column of the table needs to be string. Currently only
    valid for org.openx.data.jsonserde.JsonSerDe.
---
 be/src/vec/exec/format/json/new_json_reader.cpp    |  33 +++++++-
 be/src/vec/exec/format/json/new_json_reader.h      |   8 +-
 .../hive/scripts/auxlib/json-serde-1.3.9.tar.gz    | Bin 0 -> 78992 bytes
 .../scripts/create_preinstalled_scripts/run76.hql  |  56 ++++++++++++++
 .../docker-compose/hive/scripts/hive-metastore.sh  |  11 +++
 .../json/openx_json/json_data_arrays_tb/1          |   2 +
 .../json/openx_json/json_one_column_table/1        |   5 ++
 .../preinstalled_data/json/openx_json/json_table/1 |   2 +
 .../preinstalled_data/json/openx_json/json_table/2 |  11 +++
 .../json/openx_json/scalar_to_array_tb/1           |   1 +
 .../doris/datasource/hive/HMSExternalTable.java    |   8 ++
 .../datasource/hive/HiveMetaStoreClientHelper.java |   1 +
 .../doris/datasource/hive/HiveProperties.java      |  12 +++
 .../doris/datasource/hive/source/HiveScanNode.java |  48 +++++++++++-
 .../java/org/apache/doris/qe/SessionVariable.java  |  21 ++++++
 gensrc/thrift/PlanNodes.thrift                     |   3 +
 .../hive/test_hive_openx_json.out                  | Bin 0 -> 709 bytes
 .../hive/test_hive_openx_json.groovy               |  84 +++++++++++++++++++++
 18 files changed, 297 insertions(+), 9 deletions(-)

diff --git a/be/src/vec/exec/format/json/new_json_reader.cpp 
b/be/src/vec/exec/format/json/new_json_reader.cpp
index 5820174e7fa..a125654eada 100644
--- a/be/src/vec/exec/format/json/new_json_reader.cpp
+++ b/be/src/vec/exec/format/json/new_json_reader.cpp
@@ -399,6 +399,20 @@ Status NewJsonReader::_get_range_params() {
     if (_range.table_format_params.table_format_type == "hive") {
         _is_hive_table = true;
     }
+    if (_params.file_attributes.__isset.openx_json_ignore_malformed) {
+        _openx_json_ignore_malformed = 
_params.file_attributes.openx_json_ignore_malformed;
+    }
+    return Status::OK();
+}
+
+static Status ignore_malformed_json_append_null(Block& block) {
+    for (auto& column : block.get_columns()) {
+        if (!column->is_nullable()) [[unlikely]] {
+            return Status::DataQualityError("malformed json, but the column 
`{}` is not nullable.",
+                                            column->get_name());
+        }
+        
static_cast<ColumnNullable*>(column->assume_mutable().get())->insert_default();
+    }
     return Status::OK();
 }
 
@@ -486,8 +500,13 @@ Status NewJsonReader::_vhandle_simple_json(RuntimeState* 
/*state*/, Block& block
         bool valid = false;
         if (_next_row >= _total_rows) { // parse json and generic document
             Status st = _parse_json(is_empty_row, eof);
-            if (_is_load && st.is<DATA_QUALITY_ERROR>()) {
-                continue; // continue to read next (for load, after this , 
already append error to file.)
+            if (st.is<DATA_QUALITY_ERROR>()) {
+                if (_is_load) {
+                    continue; // continue to read next (for load, after this , 
already append error to file.)
+                } else if (_openx_json_ignore_malformed) {
+                    RETURN_IF_ERROR(ignore_malformed_json_append_null(block));
+                    continue;
+                }
             }
             RETURN_IF_ERROR(st);
             if (*is_empty_row) {
@@ -1296,9 +1315,15 @@ Status 
NewJsonReader::_simdjson_handle_simple_json(RuntimeState* /*state*/, Bloc
 
         // step2: get json value by json doc
         Status st = _get_json_value(&size, eof, &error, is_empty_row);
-        if (_is_load && st.is<DATA_QUALITY_ERROR>()) {
-            return Status::OK();
+        if (st.is<DATA_QUALITY_ERROR>()) {
+            if (_is_load) {
+                return Status::OK();
+            } else if (_openx_json_ignore_malformed) {
+                RETURN_IF_ERROR(ignore_malformed_json_append_null(block));
+                return Status::OK();
+            }
         }
+
         RETURN_IF_ERROR(st);
         if (*is_empty_row || *eof) {
             return Status::OK();
diff --git a/be/src/vec/exec/format/json/new_json_reader.h 
b/be/src/vec/exec/format/json/new_json_reader.h
index 430f8c7af18..6b42ca23b4f 100644
--- a/be/src/vec/exec/format/json/new_json_reader.h
+++ b/be/src/vec/exec/format/json/new_json_reader.h
@@ -293,18 +293,22 @@ private:
 
     int32_t skip_bitmap_col_idx {-1};
 
-    bool _is_load = true;
     //Used to indicate whether it is a stream load. When loading, only data 
will be inserted into columnString.
     //If an illegal value is encountered during the load process, 
`_append_error_msg` should be called
     //instead of directly returning `Status::DataQualityError`
+    bool _is_load = true;
 
-    bool _is_hive_table = false;
     // In hive : create table xxx ROW FORMAT SERDE 
'org.apache.hive.hcatalog.data.JsonSerDe';
     // Hive will not allow you to create columns with the same name but 
different case, including field names inside
     // structs, and will automatically convert uppercase names in create sql 
to lowercase.However, when Hive loads data
     // to table, the column names in the data may be uppercase,and there may 
be multiple columns with
     // the same name but different capitalization.We refer to the behavior of 
hive, convert all column names
     // in the data to lowercase,and use the last one as the insertion value
+    bool _is_hive_table = false;
+
+    // hive : org.openx.data.jsonserde.JsonSerDe, `ignore.malformed.json` prop.
+    // If the variable is true, `null` will be inserted for illegal json format 
instead of returning an error.
+    bool _openx_json_ignore_malformed = false;
 
     DataTypeSerDeSPtrs _serdes;
     vectorized::DataTypeSerDe::FormatOptions _serde_options;
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/auxlib/json-serde-1.3.9.tar.gz
 
b/docker/thirdparties/docker-compose/hive/scripts/auxlib/json-serde-1.3.9.tar.gz
new file mode 100644
index 00000000000..1eb63aa7727
Binary files /dev/null and 
b/docker/thirdparties/docker-compose/hive/scripts/auxlib/json-serde-1.3.9.tar.gz
 differ
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run76.hql
 
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run76.hql
new file mode 100644
index 00000000000..1f83f932445
--- /dev/null
+++ 
b/docker/thirdparties/docker-compose/hive/scripts/create_preinstalled_scripts/run76.hql
@@ -0,0 +1,56 @@
+create database if not exists openx_json;
+use openx_json;
+
+
+CREATE TABLE IF NOT EXISTS json_table (
+    id INT,
+    name STRING,
+    numbers ARRAY<INT>,
+    scores MAP<STRING, INT>,
+    details STRUCT<a:INT, b:STRING, c:BIGINT>
+)
+ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
+LOCATION '/user/doris/preinstalled_data/json/openx_json/json_table';
+
+
+CREATE TABLE IF NOT EXISTS json_table_ignore_malformed (
+    id INT,
+    name STRING,
+    numbers ARRAY<INT>,
+    scores MAP<STRING, INT>,
+    details STRUCT<a:INT, b:STRING, c:BIGINT>
+)
+ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
+WITH SERDEPROPERTIES ("ignore.malformed.json" = "true" )
+LOCATION '/user/doris/preinstalled_data/json/openx_json/json_table';
+
+
+CREATE TABLE json_data_arrays_tb (
+    name string, age int)
+ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
+LOCATION '/user/doris/preinstalled_data/json/openx_json/json_data_arrays_tb';
+
+
+CREATE TABLE IF NOT EXISTS scalar_to_array_tb(
+    id INT,
+    name STRING,
+    tags ARRAY<STRING>
+)ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
+LOCATION '/user/doris/preinstalled_data/json/openx_json/scalar_to_array_tb';
+
+
+CREATE TABLE IF NOT EXISTS json_one_column_table (
+    name STRING,    
+    id INT,
+    numbers ARRAY<INT>,
+    scores MAP<STRING, INT>,
+    details STRUCT<a:INT, b:STRING, c:BIGINT>
+)
+ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
+LOCATION '/user/doris/preinstalled_data/json/openx_json/json_one_column_table';
+
+msck repair table json_table;
+msck repair table json_table_ignore_malformed;
+msck repair table json_data_arrays_tb;
+msck repair table scalar_to_array_tb;
+msck repair table json_one_column_table;
\ No newline at end of file
diff --git a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh 
b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
index ac4c9ae4480..04ddf42c70c 100755
--- a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
+++ b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
@@ -20,6 +20,17 @@ set -e -x
 
 parallel=$(getconf _NPROCESSORS_ONLN)
 
+
+
+AUX_LIB="/mnt/scripts/auxlib"
+for file in "${AUX_LIB}"/*.tar.gz; do
+    [ -e "$file" ] || continue
+    tar -xzvf "$file" -C "$AUX_LIB"
+    echo "file = ${file}"
+done
+ls "${AUX_LIB}/"
+mv "${AUX_LIB}"/ /opt/hive
+
 nohup /opt/hive/bin/hive --service metastore &
 
 # wait lockfile
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_data_arrays_tb/1
 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_data_arrays_tb/1
new file mode 100644
index 00000000000..098bb346b50
--- /dev/null
+++ 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_data_arrays_tb/1
@@ -0,0 +1,2 @@
+["John", 26 ]
+["Mary", 23 ]
\ No newline at end of file
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_one_column_table/1
 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_one_column_table/1
new file mode 100644
index 00000000000..d396f66a079
--- /dev/null
+++ 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_one_column_table/1
@@ -0,0 +1,5 @@
+
+{"name":"bad1","id":5,"numbers":[1,2,3]
+[1,2,3]
+"just a                 string"
+{"name":"bad2","id":6,"numbers":"not an 
array","scores":{"key4":40},"details":{"a":4,"b":"text","c":4000000}}
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_table/1
 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_table/1
new file mode 100644
index 00000000000..11a3edf6e80
--- /dev/null
+++ 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_table/1
@@ -0,0 +1,2 @@
+{"id": 1, "name": "Alice", "numbers": [1, 2, 3], "scores": {"math": 90, 
"english": 85}, "details": {"a": 100, "b": "test1", "c": 1234567890}}
+{"id": 2, "name": "Bob", "numbers": [4, 5], "scores": {"math": 80, "science": 
95}, "details": {"a": 200, "b": "test2", "c": 9876543210}}
\ No newline at end of file
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_table/2
 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_table/2
new file mode 100644
index 00000000000..e77c1f49d85
--- /dev/null
+++ 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/json_table/2
@@ -0,0 +1,11 @@
+{"id"  3   "name"  "Bob",
+"numbers": [
+    4 5
+],
+"scores": {
+"math": 80
+},
+"details"
+: {
+"a": 200  , "b"  } "test2", "c": 9876543210
+}}
\ No newline at end of file
diff --git 
a/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/scalar_to_array_tb/1
 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/scalar_to_array_tb/1
new file mode 100644
index 00000000000..24a9acc63a3
--- /dev/null
+++ 
b/docker/thirdparties/docker-compose/hive/scripts/preinstalled_data/json/openx_json/scalar_to_array_tb/1
@@ -0,0 +1 @@
+{"name":"Charlie","id":4,"tags":"flink"}
\ No newline at end of file
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java
 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java
index dfd7d6fe4b8..33c945272e7 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HMSExternalTable.java
@@ -1084,6 +1084,14 @@ public class HMSExternalTable extends ExternalTable 
implements MTMVRelatedTableI
     public void beforeMTMVRefresh(MTMV mtmv) throws DdlException {
     }
 
+    public boolean firstColumnIsString() {
+        List<Column> columns = getColumns();
+        if (columns == null || columns.isEmpty()) {
+            return false;
+        }
+        return columns.get(0).getType().isScalarType(PrimitiveType.STRING);
+    }
+
     public HoodieTableMetaClient getHudiClient() {
         return Env.getCurrentEnv()
             .getExtMetaCacheMgr()
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java
 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java
index 9bb09225607..d98bf8227e1 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java
@@ -95,6 +95,7 @@ public class HiveMetaStoreClientHelper {
 
     public static final String HIVE_JSON_SERDE = 
"org.apache.hive.hcatalog.data.JsonSerDe";
     public static final String LEGACY_HIVE_JSON_SERDE = 
"org.apache.hadoop.hive.serde2.JsonSerDe";
+    public static final String OPENX_JSON_SERDE = 
"org.openx.data.jsonserde.JsonSerDe";
 
     public enum HiveFileFormat {
         TEXT_FILE(0, "text"),
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java
index 74f3dcc1a9d..b06ea772fab 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java
@@ -55,6 +55,11 @@ public class HiveProperties {
     public static final String PROP_ESCAPE_CHAR = OpenCSVSerde.ESCAPECHAR;
     public static final String DEFAULT_ESCAPE_CHAR = "\\";
 
+    // org.openx.data.jsonserde.JsonSerDe
+    public static final String PROP_OPENX_IGNORE_MALFORMED_JSON = 
"ignore.malformed.json";
+    public static final String DEFAULT_OPENX_IGNORE_MALFORMED_JSON = "false";
+
+
     public static final Set<String> HIVE_SERDE_PROPERTIES = ImmutableSet.of(
             PROP_FIELD_DELIMITER,
             PROP_COLLECTION_DELIMITER_HIVE2,
@@ -131,6 +136,13 @@ public class HiveProperties {
         return 
HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_ESCAPE_CHAR, 
escapeChar);
     }
 
+    public static String getOpenxJsonIgnoreMalformed(Table table) {
+        Optional<String> escapeChar = 
HiveMetaStoreClientHelper.getSerdeProperty(table,
+                PROP_OPENX_IGNORE_MALFORMED_JSON);
+        return 
HiveMetaStoreClientHelper.firstPresentOrDefault(DEFAULT_OPENX_IGNORE_MALFORMED_JSON,
 escapeChar);
+    }
+
+
     // Set properties to table
     public static void setTableProperties(Table table, Map<String, String> 
properties) {
         HashMap<String, String> serdeProps = new HashMap<>();
diff --git 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
index 890f6147f33..c2503789580 100644
--- 
a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
+++ 
b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
@@ -125,7 +125,7 @@ public class HiveScanNode extends FileQueryScanNode {
             this.hiveTransaction = new 
HiveTransaction(DebugUtil.printId(ConnectContext.get().queryId()),
                     ConnectContext.get().getQualifiedUser(), hmsTable, 
hmsTable.isFullAcidTable());
             Env.getCurrentHiveTransactionMgr().register(hiveTransaction);
-            skipCheckingAcidVersionFile = 
ConnectContext.get().getSessionVariable().skipCheckingAcidVersionFile;
+            skipCheckingAcidVersionFile = 
sessionVariable.skipCheckingAcidVersionFile;
         }
     }
 
@@ -413,6 +413,17 @@ public class HiveScanNode extends FileQueryScanNode {
             if (serDeLib.equals(HiveMetaStoreClientHelper.HIVE_JSON_SERDE)
                     || 
serDeLib.equals(HiveMetaStoreClientHelper.LEGACY_HIVE_JSON_SERDE)) {
                 type = TFileFormatType.FORMAT_JSON;
+            } else if 
(serDeLib.equals(HiveMetaStoreClientHelper.OPENX_JSON_SERDE)) {
+                if (!sessionVariable.isReadHiveJsonInOneColumn()) {
+                    type = TFileFormatType.FORMAT_JSON;
+                } else if (sessionVariable.isReadHiveJsonInOneColumn()
+                        && hmsTable.firstColumnIsString()) {
+                    type = TFileFormatType.FORMAT_CSV_PLAIN;
+                } else {
+                    throw new UserException("You set 
read_hive_json_in_one_column = true, but the first column of "
+                            + "table " + hmsTable.getName()
+                            + " is not a string column.");
+                }
             } else {
                 type = TFileFormatType.FORMAT_CSV_PLAIN;
             }
@@ -449,7 +460,7 @@ public class HiveScanNode extends FileQueryScanNode {
             fileAttributes.setTextParams(textParams);
             fileAttributes.setHeaderType("");
             fileAttributes.setEnableTextValidateUtf8(
-                    
ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
+                    sessionVariable.enableTextValidateUtf8);
         } else if 
(serDeLib.equals("org.apache.hadoop.hive.serde2.OpenCSVSerde")) {
             TFileTextScanRangeParams textParams = new 
TFileTextScanRangeParams();
             // set set properties of OpenCSVSerde
@@ -467,7 +478,7 @@ public class HiveScanNode extends FileQueryScanNode {
                 fileAttributes.setTrimDoubleQuotes(true);
             }
             fileAttributes.setEnableTextValidateUtf8(
-                    
ConnectContext.get().getSessionVariable().enableTextValidateUtf8);
+                    sessionVariable.enableTextValidateUtf8);
         } else if (serDeLib.equals("org.apache.hive.hcatalog.data.JsonSerDe")) 
{
             TFileTextScanRangeParams textParams = new 
TFileTextScanRangeParams();
             textParams.setColumnSeparator("\t");
@@ -481,6 +492,37 @@ public class HiveScanNode extends FileQueryScanNode {
             fileAttributes.setReadJsonByLine(true);
             fileAttributes.setStripOuterArray(false);
             fileAttributes.setHeaderType("");
+        } else if (serDeLib.equals("org.openx.data.jsonserde.JsonSerDe")) {
+            if (!sessionVariable.isReadHiveJsonInOneColumn()) {
+                TFileTextScanRangeParams textParams = new 
TFileTextScanRangeParams();
+                textParams.setColumnSeparator("\t");
+                textParams.setLineDelimiter("\n");
+                fileAttributes.setTextParams(textParams);
+
+                fileAttributes.setJsonpaths("");
+                fileAttributes.setJsonRoot("");
+                fileAttributes.setNumAsString(true);
+                fileAttributes.setFuzzyParse(false);
+                fileAttributes.setReadJsonByLine(true);
+                fileAttributes.setStripOuterArray(false);
+                fileAttributes.setHeaderType("");
+
+                fileAttributes.setOpenxJsonIgnoreMalformed(
+                        
Boolean.parseBoolean(HiveProperties.getOpenxJsonIgnoreMalformed(table)));
+            } else if (sessionVariable.isReadHiveJsonInOneColumn()
+                    && hmsTable.firstColumnIsString()) {
+                TFileTextScanRangeParams textParams = new 
TFileTextScanRangeParams();
+                textParams.setLineDelimiter("\n");
+                textParams.setColumnSeparator("\n");
+                //First, perform row splitting according to `\n`. When 
performing column splitting,
+                // since there is no `\n`, only one column of data will be 
generated.
+                fileAttributes.setTextParams(textParams);
+                fileAttributes.setHeaderType("");
+            } else {
+                throw new UserException("You set read_hive_json_in_one_column 
= true, but the first column of table "
+                        + hmsTable.getName()
+                        + " is not a string column.");
+            }
         } else {
             throw new UserException(
                     "unsupported hive table serde: " + serDeLib);
diff --git a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java 
b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
index 86c47de612f..48ac010e830 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/qe/SessionVariable.java
@@ -708,6 +708,8 @@ public class SessionVariable implements Serializable, 
Writable {
             "enable_cooldown_replica_affinity";
     public static final String SKIP_CHECKING_ACID_VERSION_FILE = 
"skip_checking_acid_version_file";
 
+    public static final String READ_HIVE_JSON_IN_ONE_COLUMN = 
"read_hive_json_in_one_column";
+
     /**
      * Inserting overwrite for auto partition table allows creating partition 
for
      * datas which cannot find partition to overwrite.
@@ -1220,6 +1222,17 @@ public class SessionVariable implements Serializable, 
Writable {
     @VariableMgr.VarAttr(name = PARALLEL_PREPARE_THRESHOLD, fuzzy = true)
     public int parallelPrepareThreshold = 32;
 
+    @VariableMgr.VarAttr(name = READ_HIVE_JSON_IN_ONE_COLUMN,
+            description = {"在读取hive 
json的时候,由于存在一些不支持的json格式,我们默认会报错。为了让用户使用体验更好,"
+                    + 
"当该变量为true的时候,将一整行json读取到第一列中,用户可以自行选择对一整行json进行处理,例如JSON_PARSE。"
+                    + "需要表的第一列的数据类型为string.",
+                    "When reading hive json, we will report an error by 
default because there are some unsupported "
+                    + "json formats. In order to provide users with a better 
experience, when this variable is true,"
+                    + "a whole line of json is read into the first column. 
Users can choose to process a whole line"
+                    + "of json, such as JSON_PARSE. The data type of the first 
column of the table needs to"
+                    + "be string."})
+    private boolean readHiveJsonInOneColumn = false;
+
     @VariableMgr.VarAttr(name = ENABLE_COST_BASED_JOIN_REORDER)
     private boolean enableJoinReorderBasedCost = false;
 
@@ -3771,6 +3784,14 @@ public class SessionVariable implements Serializable, 
Writable {
         this.keepCarriageReturn = keepCarriageReturn;
     }
 
+    public boolean isReadHiveJsonInOneColumn() {
+        return readHiveJsonInOneColumn;
+    }
+
+    public void setReadHiveJsonInOneColumn(boolean readHiveJsonInOneColumn) {
+        this.readHiveJsonInOneColumn = readHiveJsonInOneColumn;
+    }
+
     public boolean isDropTableIfCtasFailed() {
         return dropTableIfCtasFailed;
     }
diff --git a/gensrc/thrift/PlanNodes.thrift b/gensrc/thrift/PlanNodes.thrift
index 407c77fc340..3165a6ac764 100644
--- a/gensrc/thrift/PlanNodes.thrift
+++ b/gensrc/thrift/PlanNodes.thrift
@@ -271,6 +271,9 @@ struct TFileAttributes {
     11: optional i32 skip_lines;
     //For text type file reading, whether to enable utf8 encoding 
check.(Catalog && TVF)
     12: optional bool enable_text_validate_utf8 = true;
+    // org.openx.data.jsonserde.JsonSerDe
+    13: optional bool openx_json_ignore_malformed = false;
+
     // for cloud copy into
     1001: optional bool ignore_csv_redundant_col;
 }
diff --git 
a/regression-test/data/external_table_p0/hive/test_hive_openx_json.out 
b/regression-test/data/external_table_p0/hive/test_hive_openx_json.out
new file mode 100644
index 00000000000..6eadea56694
Binary files /dev/null and 
b/regression-test/data/external_table_p0/hive/test_hive_openx_json.out differ
diff --git 
a/regression-test/suites/external_table_p0/hive/test_hive_openx_json.groovy 
b/regression-test/suites/external_table_p0/hive/test_hive_openx_json.groovy
new file mode 100644
index 00000000000..b9698809c4d
--- /dev/null
+++ b/regression-test/suites/external_table_p0/hive/test_hive_openx_json.groovy
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_hive_openx_json",  
"p0,external,hive,external_docker,external_docker_hive") {
+
+
+    String enabled = context.config.otherConfigs.get("enableHiveTest")
+    if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+        logger.info("diable Hive test.")
+        return;
+    }
+
+    for (String hivePrefix : ["hive3"]) {
+        try {
+            sql """set enable_fallback_to_original_planner=false"""
+            String externalEnvIp = 
context.config.otherConfigs.get("externalEnvIp")
+            String hms_port = context.config.otherConfigs.get(hivePrefix + 
"HmsPort")
+            String catalog_name = "${hivePrefix}_test_hive_openx_json"
+            String broker_name = "hdfs"
+
+            sql """drop catalog if exists ${catalog_name}"""
+            sql """create catalog if not exists ${catalog_name} properties (
+                'type'='hms',
+                'hive.metastore.uris'='thrift://${externalEnvIp}:${hms_port}'
+            );"""
+            sql """use `${catalog_name}`.`openx_json`"""
+
+            try {
+                sql  """ select * from json_table """;
+            } catch (Exception e) {
+                log.info(e.getMessage())
+                assertTrue(e.getMessage().contains("DATA_QUALITY_ERROR"))
+            }
+
+            order_qt_q1  """  select * from json_table_ignore_malformed """
+
+
+            try{
+                sql  """ select * from json_data_arrays_tb """;
+            } catch (Exception e) {
+                log.info(e.getMessage())
+                assertTrue(e.getMessage().contains("DATA_QUALITY_ERROR"))
+            }
+    
+    
+            try{
+                sql  """ select * from scalar_to_array_tb """;
+            } catch (Exception e) {
+                log.info(e.getMessage())
+                assertTrue(e.getMessage().contains("DATA_QUALITY_ERROR"))
+            }
+
+            sql """ set read_hive_json_in_one_column = true; """
+
+            order_qt_2 """ select * from json_data_arrays_tb """
+            order_qt_3 """ select * from json_one_column_table """
+
+            try{
+                sql  """ select * from scalar_to_array_tb """;
+            } catch (Exception e) {
+                log.info(e.getMessage())
+                assertTrue(e.getMessage().contains("is not a string column."))
+            }
+
+
+            sql """drop catalog if exists ${catalog_name}"""
+        } finally {
+        }
+    }
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to