This is an automated email from the ASF dual-hosted git repository.
morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new 27bb79b33bc branch-3.1: [feat](catalog) Support reading Hive table with MultiDelimitSerDe #51936 (#52772)
27bb79b33bc is described below
commit 27bb79b33bcdff6e6ed857c43168ff3c762f07b7
Author: Mingyu Chen (Rayner) <[email protected]>
AuthorDate: Sat Jul 5 20:24:29 2025 +0800
branch-3.1: [feat](catalog) Support reading Hive table with MultiDelimitSerDe #51936 (#52772)
bp #51936
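
For context, a minimal sketch of what this enables (table and column names are taken from the regression suite added below; the catalog name is illustrative):

    -- Hive side: a text table declared with MultiDelimitSerDe and a multi-character delimiter
    CREATE TABLE multi_delimit_test (k1 int, k2 int, name string)
    ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
    WITH SERDEPROPERTIES ('field.delim'='|+|')
    STORED AS TEXTFILE;

    -- Doris side: the table becomes readable (and writable) through an HMS catalog
    SWITCH hive_catalog;
    USE regression;
    SELECT * FROM multi_delimit_test ORDER BY k1;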
Co-authored-by: lw112 <[email protected]>
---
be/src/vec/exec/format/text/text_reader.cpp | 56 ++++++++++++
be/src/vec/exec/format/text/text_reader.h | 3 +
.../format/text/hive_text_field_splitter_test.cpp | 97 +++++++++++++++++++++
.../multi_delimit_serde/create_table.hql | 78 +++++++++++++++++
.../datasource/hive/HiveMetaStoreClientHelper.java | 1 +
.../doris/datasource/hive/HiveProperties.java | 9 +-
.../doris/datasource/hive/source/HiveScanNode.java | 12 ++-
.../org/apache/doris/planner/HiveTableSink.java | 10 ++-
.../hive/test_multi_delimit_serde.out | Bin 0 -> 484 bytes
.../hive/test_multi_delimit_serde.groovy | 84 ++++++++++++++++++
10 files changed, 343 insertions(+), 7 deletions(-)
diff --git a/be/src/vec/exec/format/text/text_reader.cpp b/be/src/vec/exec/format/text/text_reader.cpp
index cf33623d320..7913a9bdb2b 100644
--- a/be/src/vec/exec/format/text/text_reader.cpp
+++ b/be/src/vec/exec/format/text/text_reader.cpp
@@ -21,6 +21,9 @@
#include <gen_cpp/Types_types.h>
#include <glog/logging.h>
+#include <cstddef>
+#include <vector>
+
#include "common/compiler_util.h" // IWYU pragma: keep
#include "common/status.h"
#include "exec/line_reader.h"
@@ -39,6 +42,15 @@ namespace doris::vectorized {
#include "common/compile_check_begin.h"
void HiveTextFieldSplitter::do_split(const Slice& line, std::vector<Slice>* splitted_values) {
+ if (_value_sep_len == 1) {
+ _split_field_single_char(line, splitted_values);
+ } else {
+ _split_field_multi_char(line, splitted_values);
+ }
+}
+
+void HiveTextFieldSplitter::_split_field_single_char(const Slice& line,
+                                                     std::vector<Slice>* splitted_values) {
const char* data = line.data;
const size_t size = line.size;
size_t value_start = 0;
@@ -55,6 +67,50 @@ void HiveTextFieldSplitter::do_split(const Slice& line, std::vector<Slice>* spli
     process_value_func(data, value_start, size - value_start, _trimming_char, splitted_values);
}
+void HiveTextFieldSplitter::_split_field_multi_char(const Slice& line,
+                                                    std::vector<Slice>* splitted_values) {
+ const char* data = line.data;
+ const size_t size = line.size;
+ size_t start = 0;
+
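+    // Build the KMP failure function for the delimiter: next[i] is the length of
+    // the longest proper border (prefix that is also a suffix) of
+    // _value_sep[0..i], minus one, or -1 when no border exists.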
+ std::vector<int> next(_value_sep_len);
+ next[0] = -1;
+ for (int i = 1, j = -1; i < (int)_value_sep_len; i++) {
+ while (j >= 0 && _value_sep[i] != _value_sep[j + 1]) {
+ j = next[j];
+ }
+ if (_value_sep[i] == _value_sep[j + 1]) {
+ j++;
+ }
+ next[i] = j;
+ }
+
+    // KMP search: emit a field at each non-overlapping, unescaped delimiter match
+ for (int i = 0, j = -1; i < (int)size; i++) {
+ while (j >= 0 && data[i] != _value_sep[j + 1]) {
+ j = next[j];
+ }
+ if (data[i] == _value_sep[j + 1]) {
+ j++;
+ }
+ if (j == (int)_value_sep_len - 1) {
+ size_t curpos = i - _value_sep_len + 1;
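+            // A match immediately preceded by the escape character is
+            // literal data, not a field boundary.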
+            if (_escape_char != 0 && curpos > 0 && data[curpos - 1] == _escape_char) {
+ j = next[j];
+ continue;
+ }
+
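+            // curpos < start means this match overlaps bytes already
+            // consumed by the previous delimiter, so it is skipped.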
+ if (curpos >= start) {
+                process_value_func(data, start, curpos - start, _trimming_char, splitted_values);
+ start = curpos + _value_sep_len;
+ }
+
+ j = next[j];
+ }
+ }
+    process_value_func(data, start, size - start, _trimming_char, splitted_values);
+}
+
TextReader::TextReader(RuntimeState* state, RuntimeProfile* profile, ScannerCounter* counter,
                       const TFileScanRangeParams& params, const TFileRangeDesc& range,
                       const std::vector<SlotDescriptor*>& file_slot_descs, io::IOContext* io_ctx)
diff --git a/be/src/vec/exec/format/text/text_reader.h b/be/src/vec/exec/format/text/text_reader.h
index c1a873de5f2..88d6746a19f 100644
--- a/be/src/vec/exec/format/text/text_reader.h
+++ b/be/src/vec/exec/format/text/text_reader.h
@@ -43,6 +43,9 @@ public:
void do_split(const Slice& line, std::vector<Slice>* splitted_values);
private:
+    void _split_field_single_char(const Slice& line, std::vector<Slice>* splitted_values);
+    void _split_field_multi_char(const Slice& line, std::vector<Slice>* splitted_values);
+
std::string _value_sep;
char _escape_char;
};
diff --git a/be/test/vec/exec/format/text/hive_text_field_splitter_test.cpp b/be/test/vec/exec/format/text/hive_text_field_splitter_test.cpp
new file mode 100644
index 00000000000..814af4554e4
--- /dev/null
+++ b/be/test/vec/exec/format/text/hive_text_field_splitter_test.cpp
@@ -0,0 +1,97 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <string>
+#include <vector>
+
+#include "vec/exec/format/text/text_reader.h"
+
+namespace doris::vectorized {
+
+class HiveTextFieldSplitterTest : public testing::Test {
+protected:
+    void verify_field_split(const std::string& input, const std::string& delimiter,
+                            const std::vector<std::string>& expected_fields, char escape_char = 0) {
+        HiveTextFieldSplitter splitter(false, false, delimiter, delimiter.size(), 0, escape_char);
+ Slice line(input.data(), input.size());
+ std::vector<Slice> splitted_values;
+
+ splitter.do_split(line, &splitted_values);
+
+ ASSERT_EQ(expected_fields.size(), splitted_values.size())
+ << "Input: " << input << ", Delimiter: " << delimiter;
+
+ for (size_t i = 0; i < expected_fields.size(); ++i) {
+            std::string actual(splitted_values[i].data, splitted_values[i].size);
+            EXPECT_EQ(expected_fields[i], actual) << "Field " << i << " mismatch. Input: " << input
+                                                  << ", Delimiter: " << delimiter;
+ }
+ }
+};
+
+// Test single character delimiter (basic functionality)
+TEST_F(HiveTextFieldSplitterTest, SingleCharDelimiter) {
+ verify_field_split("a,b,c", ",", {"a", "b", "c"});
+ verify_field_split("1|2|3|4", "|", {"1", "2", "3", "4"});
+ verify_field_split("", ",", {""});
+ verify_field_split(",", ",", {"", ""});
+ verify_field_split("a,", ",", {"a", ""});
+ verify_field_split(",b", ",", {"", "b"});
+}
+
+// Test multi-character delimiter (core functionality for MultiDelimitSerDe)
+TEST_F(HiveTextFieldSplitterTest, MultiCharDelimiter) {
+ verify_field_split("a||b||c", "||", {"a", "b", "c"});
+ verify_field_split("1|+|2|+|3", "|+|", {"1", "2", "3"});
+    verify_field_split("field1|+|field2|+|field3", "|+|", {"field1", "field2", "field3"});
+
+ verify_field_split("", "||", {""});
+ verify_field_split("||", "||", {"", ""});
+ verify_field_split("a||", "||", {"a", ""});
+ verify_field_split("||b", "||", {"", "b"});
+}
+
+// Test overlapping patterns in delimiter - these are the problematic cases
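+// Matches are found left to right and consumed without overlap: e.g. "aaaaaaa"
+// splits on "aaa" at offsets 0 and 3, leaving the trailing "a" as the last field.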
+TEST_F(HiveTextFieldSplitterTest, OverlappingPatterns) {
+ verify_field_split("ab\\ababab", "abab", {"ab\\", "ab"});
+
+ verify_field_split("aaaaaaa", "aaa", {"", "", "a"});
+
+ verify_field_split("abcabcabc", "abcabc", {"", "abc"});
+
+ verify_field_split("ababababab", "abab", {"", "", "ab"});
+}
+
+// Test escape character functionality
+TEST_F(HiveTextFieldSplitterTest, EscapeCharacter) {
+ verify_field_split("a\\,b,c", ",", {"a\\,b", "c"}, '\\');
+ verify_field_split("a\\||b||c", "||", {"a\\||b", "c"}, '\\');
+    verify_field_split("field1\\|+|field2|+|field3", "|+|", {"field1\\|+|field2", "field3"}, '\\');
+}
+
+// Test real-world scenarios
+TEST_F(HiveTextFieldSplitterTest, RealWorldScenarios) {
+ verify_field_split("1|+|100|+|test1", "|+|", {"1", "100", "test1"});
+    verify_field_split("[email protected]|+|John Doe|+|Manager", "|+|",
+                       {"[email protected]", "John Doe", "Manager"});
+ verify_field_split("|+||+|", "|+|", {"", "", ""});
+ verify_field_split("a|+||+|c", "|+|", {"a", "", "c"});
+}
+
+} // namespace doris::vectorized
\ No newline at end of file
diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/multi_delimit_serde/create_table.hql b/docker/thirdparties/docker-compose/hive/scripts/data/regression/multi_delimit_serde/create_table.hql
new file mode 100644
index 00000000000..cdaead8edf9
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/multi_delimit_serde/create_table.hql
@@ -0,0 +1,78 @@
+CREATE DATABASE IF NOT EXISTS regression;
+USE regression;
+
+CREATE TABLE `multi_delimit_test`(
+ `k1` int,
+ `k2` int,
+ `name` string)
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES (
+ 'field.delim'='|+|',
+ 'mapkey.delim'='@',
+ 'collection.delim'=':',
+ 'serialization.format'='1',
+ 'serialization.encoding'='UTF-8')
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.mapred.TextInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
+LOCATION '/user/doris/suites/regression/multi_delimit_test'
+TBLPROPERTIES (
+ 'transient_lastDdlTime'='1692719456');
+
+CREATE TABLE `multi_delimit_test2`(
+ `id` int,
+ `value` double,
+ `description` string)
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES (
+ 'field.delim'='||',
+ 'serialization.format'='1')
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.mapred.TextInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
+LOCATION '/user/doris/suites/regression/multi_delimit_test2'
+TBLPROPERTIES (
+ 'transient_lastDdlTime'='1692719456');
+
+-- Test table with array and map types to test collection.delim and mapkey.delim
+CREATE TABLE `multi_delimit_complex_test`(
+ `id` int,
+ `name` string,
+ `tags` array<string>,
+ `properties` map<string,string>,
+ `nested_array` array<array<int>>)
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES (
+ 'field.delim'='|+|',
+ 'mapkey.delim'='@',
+ 'collection.delim'=':',
+ 'serialization.format'='1',
+ 'serialization.encoding'='UTF-8')
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.mapred.TextInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
+LOCATION '/user/doris/suites/regression/multi_delimit_complex_test'
+TBLPROPERTIES (
+ 'transient_lastDdlTime'='1692719456');
+
+INSERT INTO multi_delimit_test VALUES
+ (1, 100, 'test1'),
+ (2, 200, 'test2'),
+ (3, 300, 'test3');
+
+INSERT INTO multi_delimit_test2 VALUES
+ (1, 1.5, 'description1'),
+ (2, 2.5, 'description2'),
+ (3, 3.5, 'description3');
+
+-- Insert test data with complex types
+-- Format: id|+|name|+|array_elements:separated:by:colon|+|key1@value1:key2@value2|+|nested_array_format
+INSERT INTO multi_delimit_complex_test VALUES
+  (1, 'user1', array('tag1', 'tag2', 'tag3'), map('key1', 'value1', 'key2', 'value2'), array(array(1, 2), array(3, 4))),
+  (2, 'user2', array('tagA', 'tagB'), map('prop1', 'val1'), array(array(5, 6)));
\ No newline at end of file
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java
index f59143961e6..eec4349669c 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java
@@ -98,6 +98,7 @@ public class HiveMetaStoreClientHelper {
    public static final String OPENX_JSON_SERDE = "org.openx.data.jsonserde.JsonSerDe";
    public static final String HIVE_TEXT_SERDE = "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe";
    public static final String HIVE_CSV_SERDE = "org.apache.hadoop.hive.serde2.OpenCSVSerde";
+    public static final String HIVE_MULTI_DELIMIT_SERDE = "org.apache.hadoop.hive.serde2.MultiDelimitSerDe";
public enum HiveFileFormat {
TEXT_FILE(0, "text"),
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java
index bdc8e0cacd9..1be78e41b89 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java
@@ -81,11 +81,16 @@ public class HiveProperties {
PROP_SKIP_FOOTER_COUNT);
public static String getFieldDelimiter(Table table) {
+ return getFieldDelimiter(table, false);
+ }
+
+    public static String getFieldDelimiter(Table table, boolean supportMultiChar) {
// This method is used for text format.
        Optional<String> fieldDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_FIELD_DELIMITER);
        Optional<String> serFormat = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SERIALIZATION_FORMAT);
-        return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
-            DEFAULT_FIELD_DELIMITER, fieldDelim, serFormat));
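+        // MultiDelimitSerDe allows a multi-character field delimiter; the
+        // legacy path reduces the property value to a single byte.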
+ String delimiter = HiveMetaStoreClientHelper.firstPresentOrDefault(
+ DEFAULT_FIELD_DELIMITER, fieldDelim, serFormat);
+        return supportMultiChar ? delimiter : HiveMetaStoreClientHelper.getByte(delimiter);
}
public static String getSeparatorChar(Table table) {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
index e096ee30e72..0b316efaa3a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
@@ -431,6 +431,8 @@ public class HiveScanNode extends FileQueryScanNode {
type = TFileFormatType.FORMAT_TEXT;
            } else if (serDeLib.equals(HiveMetaStoreClientHelper.HIVE_CSV_SERDE)) {
type = TFileFormatType.FORMAT_CSV_PLAIN;
+            } else if (serDeLib.equals(HiveMetaStoreClientHelper.HIVE_MULTI_DELIMIT_SERDE)) {
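+                // MultiDelimitSerDe files are plain text; reuse the TEXT reader
+                // and split on the multi-character delimiter in the BE.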
+ type = TFileFormatType.FORMAT_TEXT;
} else {
                throw new UserException("Unsupported hive table serde: " + serDeLib);
}
@@ -451,11 +453,13 @@ public class HiveScanNode extends FileQueryScanNode {
// TODO: support skip footer count
fileAttributes.setSkipLines(HiveProperties.getSkipHeaderCount(table));
String serDeLib = table.getSd().getSerdeInfo().getSerializationLib();
-        if (serDeLib.equals("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) {
+        if (serDeLib.equals("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")
+                || serDeLib.equals(HiveMetaStoreClientHelper.HIVE_MULTI_DELIMIT_SERDE)) {
            TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
-            // set properties of LazySimpleSerDe
-            // 1. set column separator
-            textParams.setColumnSeparator(HiveProperties.getFieldDelimiter(table));
+            // set properties of LazySimpleSerDe and MultiDelimitSerDe
+            // 1. set column separator (MultiDelimitSerDe supports multi-character delimiters)
+            boolean supportMultiChar = serDeLib.equals(HiveMetaStoreClientHelper.HIVE_MULTI_DELIMIT_SERDE);
+            textParams.setColumnSeparator(HiveProperties.getFieldDelimiter(table, supportMultiChar));
// 2. set line delimiter
textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table));
// 3. set mapkv delimiter
diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/HiveTableSink.java b/fe/fe-core/src/main/java/org/apache/doris/planner/HiveTableSink.java
index bb4786f226a..fdbc1ffc948 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/planner/HiveTableSink.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/planner/HiveTableSink.java
@@ -25,6 +25,7 @@ import org.apache.doris.common.AnalysisException;
import org.apache.doris.common.util.LocationPath;
import org.apache.doris.datasource.hive.HMSExternalCatalog;
import org.apache.doris.datasource.hive.HMSExternalTable;
+import org.apache.doris.datasource.hive.HiveMetaStoreClientHelper;
import org.apache.doris.datasource.hive.HiveProperties;
import org.apache.doris.nereids.trees.plans.commands.insert.HiveInsertCommandContext;
import org.apache.doris.nereids.trees.plans.commands.insert.InsertCommandContext;
@@ -59,6 +60,7 @@ public class HiveTableSink extends BaseExternalTableDataSink {
add(TFileFormatType.FORMAT_CSV_PLAIN);
add(TFileFormatType.FORMAT_ORC);
add(TFileFormatType.FORMAT_PARQUET);
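+            // FORMAT_TEXT is the write path for MultiDelimitSerDe tables.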
+ add(TFileFormatType.FORMAT_TEXT);
}};
public HiveTableSink(HMSExternalTable targetTable) {
@@ -175,6 +177,7 @@ public class HiveTableSink extends BaseExternalTableDataSink {
                compressType = targetTable.getRemoteTable().getParameters().get("parquet.compression");
break;
case FORMAT_CSV_PLAIN:
+ case FORMAT_TEXT:
                compressType = targetTable.getRemoteTable().getParameters().get("text.compression");
if (Strings.isNullOrEmpty(compressType)) {
                    compressType = ConnectContext.get().getSessionVariable().hiveTextCompression();
@@ -213,8 +216,13 @@ public class HiveTableSink extends BaseExternalTableDataSink {
private void setSerDeProperties(THiveTableSink tSink) {
THiveSerDeProperties serDeProperties = new THiveSerDeProperties();
Table table = targetTable.getRemoteTable();
+ String serDeLib = table.getSd().getSerdeInfo().getSerializationLib();
// 1. set field delimiter
- serDeProperties.setFieldDelim(HiveProperties.getFieldDelimiter(table));
+        if (HiveMetaStoreClientHelper.HIVE_MULTI_DELIMIT_SERDE.equals(serDeLib)) {
+            serDeProperties.setFieldDelim(HiveProperties.getFieldDelimiter(table, true));
+        } else {
+            serDeProperties.setFieldDelim(HiveProperties.getFieldDelimiter(table));
+        }
// 2. set line delimiter
serDeProperties.setLineDelim(HiveProperties.getLineDelimiter(table));
// 3. set collection delimiter
diff --git a/regression-test/data/external_table_p0/hive/test_multi_delimit_serde.out b/regression-test/data/external_table_p0/hive/test_multi_delimit_serde.out
new file mode 100644
index 00000000000..f2bac26a2a1
Binary files /dev/null and b/regression-test/data/external_table_p0/hive/test_multi_delimit_serde.out differ
diff --git a/regression-test/suites/external_table_p0/hive/test_multi_delimit_serde.groovy b/regression-test/suites/external_table_p0/hive/test_multi_delimit_serde.groovy
new file mode 100644
index 00000000000..8823a169ff5
--- /dev/null
+++ b/regression-test/suites/external_table_p0/hive/test_multi_delimit_serde.groovy
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_multi_delimit_serde", "p0,external,hive,external_docker,external_docker_hive") {
+ String enabled = context.config.otherConfigs.get("enableHiveTest")
+ if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+ logger.info("disable Hive test.")
+ return;
+ }
+
+ for (String hivePrefix : ["hive2", "hive3"]) {
+        String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort")
+ String catalog_name = "${hivePrefix}_test_multi_delimit_serde"
+ String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+
+ sql """drop catalog if exists ${catalog_name}"""
+ sql """create catalog if not exists ${catalog_name} properties (
+ "type"="hms",
+ 'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
+ );"""
+
+ logger.info("catalog " + catalog_name + " created")
+ sql """switch ${catalog_name};"""
+ logger.info("switched to catalog " + catalog_name)
+
+ sql """use regression;"""
+
+ try {
+            // Test 1: MultiDelimitSerDe with |+| delimiter - using pre-created table
+ qt_01 """SELECT * FROM multi_delimit_test ORDER BY k1"""
+
+            // Test 2: Different multi-character delimiter - using pre-created table
+ qt_02 """SELECT * FROM multi_delimit_test2 ORDER BY id"""
+
+            // Test 3: Complex types with array and map to test collection.delim and mapkey.delim
+            logger.info("Test 3: Using pre-created table with array and map types")
+            qt_03 """SELECT id, name, tags, properties FROM multi_delimit_complex_test ORDER BY id"""
+
+            // Test 4: Insert data using Doris to write to Hive MultiDelimitSerDe tables
+            logger.info("Test 4: Testing Doris INSERT to Hive MultiDelimitSerDe tables")
+
+ // Test 4.1: Insert to basic multi-delimit table
+            sql """INSERT INTO multi_delimit_test VALUES (4, 400, 'test4'), (5, 500, 'test5')"""
+            qt_04 """SELECT * FROM multi_delimit_test WHERE k1 >= 4 ORDER BY k1"""
+
+ // Test 4.2: Insert to double-pipe delimited table
+            sql """INSERT INTO multi_delimit_test2 VALUES (4, 4.5, 'description4'), (5, 5.5, 'description5')"""
+            qt_05 """SELECT * FROM multi_delimit_test2 WHERE id >= 4 ORDER BY id"""
+
+ // Test 4.3: Insert to complex types table with arrays and maps
+ sql """INSERT INTO multi_delimit_complex_test VALUES
+                (3, 'user3', ARRAY('tagX', 'tagY'), MAP('newkey', 'newvalue'), ARRAY(ARRAY(7, 8)))"""
+            qt_06 """SELECT id, name, tags, properties FROM multi_delimit_complex_test WHERE id = 3 ORDER BY id"""
+
+ // Test 5: Show create table to check SerDe properties
+ logger.info("Test 5: Checking show create table")
+            def createTableResult = sql """SHOW CREATE TABLE multi_delimit_test"""
+ logger.info("Create table result: " + createTableResult.toString())
+
+            assertTrue(createTableResult.toString().contains("MultiDelimitSerDe"))
+ assertTrue(createTableResult.toString().contains("field.delim"))
+ } catch (Exception e) {
+            logger.warn("Test failed, this might be expected if Hive version doesn't support MultiDelimitSerDe: " + e.getMessage())
+ if (e.getMessage().contains("Unsupported hive table serde")) {
+                logger.info("Got expected 'Unsupported hive table serde' error before implementing MultiDelimitSerDe support")
+ }
+ }
+ sql """drop catalog if exists ${catalog_name}"""
+ }
+}
\ No newline at end of file