This is an automated email from the ASF dual-hosted git repository.
morrysnow pushed a commit to branch branch-3.1
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.1 by this push:
new 27bb79b33bc branch-3.1: [feat](catalog) Support reading Hive table with MultiDelimitSerDe #51936 (#52772)
27bb79b33bc is described below
commit 27bb79b33bcdff6e6ed857c43168ff3c762f07b7
Author: Mingyu Chen (Rayner) <[email protected]>
AuthorDate: Sat Jul 5 20:24:29 2025 +0800
branch-3.1: [feat](catalog) Support reading Hive table with MultiDelimitSerDe #51936 (#52772)
bp #51936
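
For context, a minimal sketch of what this enables (table and column names are taken from the regression suite added below; the catalog name is illustrative):

    -- Hive side: a text table declared with MultiDelimitSerDe and a multi-character delimiter
    CREATE TABLE multi_delimit_test (k1 int, k2 int, name string)
    ROW FORMAT SERDE 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
    WITH SERDEPROPERTIES ('field.delim'='|+|')
    STORED AS TEXTFILE;

    -- Doris side: the table becomes readable (and writable) through an HMS catalog
    SWITCH hive_catalog;
    USE regression;
    SELECT * FROM multi_delimit_test ORDER BY k1;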
Co-authored-by: lw112 <[email protected]>
---
be/src/vec/exec/format/text/text_reader.cpp | 56 ++++++++++++
be/src/vec/exec/format/text/text_reader.h | 3 +
.../format/text/hive_text_field_splitter_test.cpp | 97 +++++++++++++++++++++
.../multi_delimit_serde/create_table.hql | 78 +++++++++++++++++
.../datasource/hive/HiveMetaStoreClientHelper.java | 1 +
.../doris/datasource/hive/HiveProperties.java | 9 +-
.../doris/datasource/hive/source/HiveScanNode.java | 12 ++-
.../org/apache/doris/planner/HiveTableSink.java | 10 ++-
.../hive/test_multi_delimit_serde.out | Bin 0 -> 484 bytes
.../hive/test_multi_delimit_serde.groovy | 84 ++++++++++++++++++
10 files changed, 343 insertions(+), 7 deletions(-)
diff --git a/be/src/vec/exec/format/text/text_reader.cpp b/be/src/vec/exec/format/text/text_reader.cpp
index cf33623d320..7913a9bdb2b 100644
--- a/be/src/vec/exec/format/text/text_reader.cpp
+++ b/be/src/vec/exec/format/text/text_reader.cpp
@@ -21,6 +21,9 @@
#include <gen_cpp/Types_types.h>
#include <glog/logging.h>
+#include <cstddef>
+#include <vector>
+
#include "common/compiler_util.h" // IWYU pragma: keep
#include "common/status.h"
#include "exec/line_reader.h"
@@ -39,6 +42,15 @@ namespace doris::vectorized {
#include "common/compile_check_begin.h"
void HiveTextFieldSplitter::do_split(const Slice& line, std::vector<Slice>* splitted_values) {
+ if (_value_sep_len == 1) {
+ _split_field_single_char(line, splitted_values);
+ } else {
+ _split_field_multi_char(line, splitted_values);
+ }
+}
+
+void HiveTextFieldSplitter::_split_field_single_char(const Slice& line,
+                                                     std::vector<Slice>* splitted_values) {
const char* data = line.data;
const size_t size = line.size;
size_t value_start = 0;
@@ -55,6 +67,50 @@ void HiveTextFieldSplitter::do_split(const Slice& line, std::vector<Slice>* spli
     process_value_func(data, value_start, size - value_start, _trimming_char, splitted_values);
}
+void HiveTextFieldSplitter::_split_field_multi_char(const Slice& line,
+                                                    std::vector<Slice>* splitted_values) {
+ const char* data = line.data;
+ const size_t size = line.size;
+ size_t start = 0;
+
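+    // Build the KMP failure function for the delimiter: next[i] is the length of
+    // the longest proper border (prefix that is also a suffix) of
+    // _value_sep[0..i], minus one, or -1 when no border exists.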
+ std::vector<int> next(_value_sep_len);
+ next[0] = -1;
+ for (int i = 1, j = -1; i < (int)_value_sep_len; i++) {
+ while (j >= 0 && _value_sep[i] != _value_sep[j + 1]) {
+ j = next[j];
+ }
+ if (_value_sep[i] == _value_sep[j + 1]) {
+ j++;
+ }
+ next[i] = j;
+ }
+
+    // KMP search: emit a field at each non-overlapping, unescaped delimiter match
+ for (int i = 0, j = -1; i < (int)size; i++) {
+ while (j >= 0 && data[i] != _value_sep[j + 1]) {
+ j = next[j];
+ }
+ if (data[i] == _value_sep[j + 1]) {
+ j++;
+ }
+ if (j == (int)_value_sep_len - 1) {
+ size_t curpos = i - _value_sep_len + 1;
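+            // A match immediately preceded by the escape character is
+            // literal data, not a field boundary.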
+            if (_escape_char != 0 && curpos > 0 && data[curpos - 1] == _escape_char) {
+ j = next[j];
+ continue;
+ }
+
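+            // curpos < start means this match overlaps bytes already
+            // consumed by the previous delimiter, so it is skipped.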
+ if (curpos >= start) {
+                process_value_func(data, start, curpos - start, _trimming_char, splitted_values);
+ start = curpos + _value_sep_len;
+ }
+
+ j = next[j];
+ }
+ }
+    process_value_func(data, start, size - start, _trimming_char, splitted_values);
+}
+
TextReader::TextReader(RuntimeState* state, RuntimeProfile* profile, ScannerCounter* counter,
                       const TFileScanRangeParams& params, const TFileRangeDesc& range,
                       const std::vector<SlotDescriptor*>& file_slot_descs, io::IOContext* io_ctx)
diff --git a/be/src/vec/exec/format/text/text_reader.h b/be/src/vec/exec/format/text/text_reader.h
index c1a873de5f2..88d6746a19f 100644
--- a/be/src/vec/exec/format/text/text_reader.h
+++ b/be/src/vec/exec/format/text/text_reader.h
@@ -43,6 +43,9 @@ public:
void do_split(const Slice& line, std::vector<Slice>* splitted_values);
private:
+    void _split_field_single_char(const Slice& line, std::vector<Slice>* splitted_values);
+    void _split_field_multi_char(const Slice& line, std::vector<Slice>* splitted_values);
+
std::string _value_sep;
char _escape_char;
};
diff --git a/be/test/vec/exec/format/text/hive_text_field_splitter_test.cpp b/be/test/vec/exec/format/text/hive_text_field_splitter_test.cpp
new file mode 100644
index 00000000000..814af4554e4
--- /dev/null
+++ b/be/test/vec/exec/format/text/hive_text_field_splitter_test.cpp
@@ -0,0 +1,97 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include <gtest/gtest.h>
+
+#include <string>
+#include <vector>
+
+#include "vec/exec/format/text/text_reader.h"
+
+namespace doris::vectorized {
+
+class HiveTextFieldSplitterTest : public testing::Test {
+protected:
+    void verify_field_split(const std::string& input, const std::string& delimiter,
+                            const std::vector<std::string>& expected_fields, char escape_char = 0) {
+        HiveTextFieldSplitter splitter(false, false, delimiter, delimiter.size(), 0, escape_char);
+ Slice line(input.data(), input.size());
+ std::vector<Slice> splitted_values;
+
+ splitter.do_split(line, &splitted_values);
+
+ ASSERT_EQ(expected_fields.size(), splitted_values.size())
+ << "Input: " << input << ", Delimiter: " << delimiter;
+
+ for (size_t i = 0; i < expected_fields.size(); ++i) {
+            std::string actual(splitted_values[i].data, splitted_values[i].size);
+            EXPECT_EQ(expected_fields[i], actual) << "Field " << i << " mismatch. Input: " << input
+                                                  << ", Delimiter: " << delimiter;
+ }
+ }
+};
+
+// Test single character delimiter (basic functionality)
+TEST_F(HiveTextFieldSplitterTest, SingleCharDelimiter) {
+ verify_field_split("a,b,c", ",", {"a", "b", "c"});
+ verify_field_split("1|2|3|4", "|", {"1", "2", "3", "4"});
+ verify_field_split("", ",", {""});
+ verify_field_split(",", ",", {"", ""});
+ verify_field_split("a,", ",", {"a", ""});
+ verify_field_split(",b", ",", {"", "b"});
+}
+
+// Test multi-character delimiter (core functionality for MultiDelimitSerDe)
+TEST_F(HiveTextFieldSplitterTest, MultiCharDelimiter) {
+ verify_field_split("a||b||c", "||", {"a", "b", "c"});
+ verify_field_split("1|+|2|+|3", "|+|", {"1", "2", "3"});
+    verify_field_split("field1|+|field2|+|field3", "|+|", {"field1", "field2", "field3"});
+
+ verify_field_split("", "||", {""});
+ verify_field_split("||", "||", {"", ""});
+ verify_field_split("a||", "||", {"a", ""});
+ verify_field_split("||b", "||", {"", "b"});
+}
+
+// Test overlapping patterns in delimiter - these are the problematic cases
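+// Matches are found left to right and consumed without overlap: e.g. "aaaaaaa"
+// splits on "aaa" at offsets 0 and 3, leaving the trailing "a" as the last field.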
+TEST_F(HiveTextFieldSplitterTest, OverlappingPatterns) {
+ verify_field_split("ab\\ababab", "abab", {"ab\\", "ab"});
+
+ verify_field_split("aaaaaaa", "aaa", {"", "", "a"});
+
+ verify_field_split("abcabcabc", "abcabc", {"", "abc"});
+
+ verify_field_split("ababababab", "abab", {"", "", "ab"});
+}
+
+// Test escape character functionality
+TEST_F(HiveTextFieldSplitterTest, EscapeCharacter) {
+ verify_field_split("a\\,b,c", ",", {"a\\,b", "c"}, '\\');
+ verify_field_split("a\\||b||c", "||", {"a\\||b", "c"}, '\\');
+    verify_field_split("field1\\|+|field2|+|field3", "|+|", {"field1\\|+|field2", "field3"}, '\\');
+}
+
+// Test real-world scenarios
+TEST_F(HiveTextFieldSplitterTest, RealWorldScenarios) {
+ verify_field_split("1|+|100|+|test1", "|+|", {"1", "100", "test1"});
+    verify_field_split("[email protected]|+|John Doe|+|Manager", "|+|",
+                       {"[email protected]", "John Doe", "Manager"});
+ verify_field_split("|+||+|", "|+|", {"", "", ""});
+ verify_field_split("a|+||+|c", "|+|", {"a", "", "c"});
+}
+
+} // namespace doris::vectorized
\ No newline at end of file
diff --git a/docker/thirdparties/docker-compose/hive/scripts/data/regression/multi_delimit_serde/create_table.hql b/docker/thirdparties/docker-compose/hive/scripts/data/regression/multi_delimit_serde/create_table.hql
new file mode 100644
index 00000000000..cdaead8edf9
--- /dev/null
+++ b/docker/thirdparties/docker-compose/hive/scripts/data/regression/multi_delimit_serde/create_table.hql
@@ -0,0 +1,78 @@
+CREATE DATABASE IF NOT EXISTS regression;
+USE regression;
+
+CREATE TABLE `multi_delimit_test`(
+ `k1` int,
+ `k2` int,
+ `name` string)
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES (
+ 'field.delim'='|+|',
+ 'mapkey.delim'='@',
+ 'collection.delim'=':',
+ 'serialization.format'='1',
+ 'serialization.encoding'='UTF-8')
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.mapred.TextInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
+LOCATION '/user/doris/suites/regression/multi_delimit_test'
+TBLPROPERTIES (
+ 'transient_lastDdlTime'='1692719456');
+
+CREATE TABLE `multi_delimit_test2`(
+ `id` int,
+ `value` double,
+ `description` string)
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES (
+ 'field.delim'='||',
+ 'serialization.format'='1')
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.mapred.TextInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
+LOCATION '/user/doris/suites/regression/multi_delimit_test2'
+TBLPROPERTIES (
+ 'transient_lastDdlTime'='1692719456');
+
+-- Test table with array and map types to test collection.delim and mapkey.delim
+CREATE TABLE `multi_delimit_complex_test`(
+ `id` int,
+ `name` string,
+ `tags` array<string>,
+ `properties` map<string,string>,
+ `nested_array` array<array<int>>)
+ROW FORMAT SERDE
+ 'org.apache.hadoop.hive.serde2.MultiDelimitSerDe'
+WITH SERDEPROPERTIES (
+ 'field.delim'='|+|',
+ 'mapkey.delim'='@',
+ 'collection.delim'=':',
+ 'serialization.format'='1',
+ 'serialization.encoding'='UTF-8')
+STORED AS INPUTFORMAT
+ 'org.apache.hadoop.mapred.TextInputFormat'
+OUTPUTFORMAT
+ 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
+LOCATION '/user/doris/suites/regression/multi_delimit_complex_test'
+TBLPROPERTIES (
+ 'transient_lastDdlTime'='1692719456');
+
+INSERT INTO multi_delimit_test VALUES
+ (1, 100, 'test1'),
+ (2, 200, 'test2'),
+ (3, 300, 'test3');
+
+INSERT INTO multi_delimit_test2 VALUES
+ (1, 1.5, 'description1'),
+ (2, 2.5, 'description2'),
+ (3, 3.5, 'description3');
+
+-- Insert test data with complex types
+-- Format: id|+|name|+|array_elements:separated:by:colon|+|key1@value1:key2@value2|+|nested_array_format
+INSERT INTO multi_delimit_complex_test VALUES
+  (1, 'user1', array('tag1', 'tag2', 'tag3'), map('key1', 'value1', 'key2', 'value2'), array(array(1, 2), array(3, 4))),
+  (2, 'user2', array('tagA', 'tagB'), map('prop1', 'val1'), array(array(5, 6)));
\ No newline at end of file
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java
index f59143961e6..eec4349669c 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveMetaStoreClientHelper.java
@@ -98,6 +98,7 @@ public class HiveMetaStoreClientHelper {
    public static final String OPENX_JSON_SERDE = "org.openx.data.jsonserde.JsonSerDe";
    public static final String HIVE_TEXT_SERDE = "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe";
    public static final String HIVE_CSV_SERDE = "org.apache.hadoop.hive.serde2.OpenCSVSerde";
+    public static final String HIVE_MULTI_DELIMIT_SERDE = "org.apache.hadoop.hive.serde2.MultiDelimitSerDe";
public enum HiveFileFormat {
TEXT_FILE(0, "text"),
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java
index bdc8e0cacd9..1be78e41b89 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/HiveProperties.java
@@ -81,11 +81,16 @@ public class HiveProperties {
PROP_SKIP_FOOTER_COUNT);
public static String getFieldDelimiter(Table table) {
+ return getFieldDelimiter(table, false);
+ }
+
+    public static String getFieldDelimiter(Table table, boolean supportMultiChar) {
// This method is used for text format.
        Optional<String> fieldDelim = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_FIELD_DELIMITER);
        Optional<String> serFormat = HiveMetaStoreClientHelper.getSerdeProperty(table, PROP_SERIALIZATION_FORMAT);
-        return HiveMetaStoreClientHelper.getByte(HiveMetaStoreClientHelper.firstPresentOrDefault(
-            DEFAULT_FIELD_DELIMITER, fieldDelim, serFormat));
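+        // MultiDelimitSerDe allows a multi-character field delimiter; the
+        // legacy path reduces the property value to a single byte.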
+ String delimiter = HiveMetaStoreClientHelper.firstPresentOrDefault(
+ DEFAULT_FIELD_DELIMITER, fieldDelim, serFormat);
+        return supportMultiChar ? delimiter : HiveMetaStoreClientHelper.getByte(delimiter);
}
public static String getSeparatorChar(Table table) {
diff --git a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
index e096ee30e72..0b316efaa3a 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/datasource/hive/source/HiveScanNode.java
@@ -431,6 +431,8 @@ public class HiveScanNode extends FileQueryScanNode {
type = TFileFormatType.FORMAT_TEXT;
            } else if (serDeLib.equals(HiveMetaStoreClientHelper.HIVE_CSV_SERDE)) {
type = TFileFormatType.FORMAT_CSV_PLAIN;
+            } else if (serDeLib.equals(HiveMetaStoreClientHelper.HIVE_MULTI_DELIMIT_SERDE)) {
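+                // MultiDelimitSerDe files are plain text; reuse the TEXT reader
+                // and split on the multi-character delimiter in the BE.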
+ type = TFileFormatType.FORMAT_TEXT;
} else {
                throw new UserException("Unsupported hive table serde: " + serDeLib);
}
@@ -451,11 +453,13 @@ public class HiveScanNode extends FileQueryScanNode {
// TODO: support skip footer count
fileAttributes.setSkipLines(HiveProperties.getSkipHeaderCount(table));
String serDeLib = table.getSd().getSerdeInfo().getSerializationLib();
-        if (serDeLib.equals("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")) {
+        if (serDeLib.equals("org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe")
+                || serDeLib.equals(HiveMetaStoreClientHelper.HIVE_MULTI_DELIMIT_SERDE)) {
            TFileTextScanRangeParams textParams = new TFileTextScanRangeParams();
-            // set properties of LazySimpleSerDe
-            // 1. set column separator
-            textParams.setColumnSeparator(HiveProperties.getFieldDelimiter(table));
+            // set properties of LazySimpleSerDe and MultiDelimitSerDe
+            // 1. set column separator (MultiDelimitSerDe supports multi-character delimiters)
+            boolean supportMultiChar = serDeLib.equals(HiveMetaStoreClientHelper.HIVE_MULTI_DELIMIT_SERDE);
+            textParams.setColumnSeparator(HiveProperties.getFieldDelimiter(table, supportMultiChar));
// 2. set line delimiter
textParams.setLineDelimiter(HiveProperties.getLineDelimiter(table));
// 3. set mapkv delimiter
diff --git a/fe/fe-core/src/main/java/org/apache/doris/planner/HiveTableSink.java b/fe/fe-core/src/main/java/org/apache/doris/planner/HiveTableSink.java
index bb4786f226a..fdbc1ffc948 100644
--- a/fe/fe-core/src/main/java/org/apache/doris/planner/HiveTableSink.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/planner/HiveTableSink.java
@@ -25,6 +25,7 @@ import org.apache.doris.common.AnalysisException;
import org.apache.doris.common.util.LocationPath;
import org.apache.doris.datasource.hive.HMSExternalCatalog;
import org.apache.doris.datasource.hive.HMSExternalTable;
+import org.apache.doris.datasource.hive.HiveMetaStoreClientHelper;
import org.apache.doris.datasource.hive.HiveProperties;
import org.apache.doris.nereids.trees.plans.commands.insert.HiveInsertCommandContext;
import org.apache.doris.nereids.trees.plans.commands.insert.InsertCommandContext;
@@ -59,6 +60,7 @@ public class HiveTableSink extends BaseExternalTableDataSink {
add(TFileFormatType.FORMAT_CSV_PLAIN);
add(TFileFormatType.FORMAT_ORC);
add(TFileFormatType.FORMAT_PARQUET);
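+            // FORMAT_TEXT is the write path for MultiDelimitSerDe tables.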
+ add(TFileFormatType.FORMAT_TEXT);
}};
public HiveTableSink(HMSExternalTable targetTable) {
@@ -175,6 +177,7 @@ public class HiveTableSink extends BaseExternalTableDataSink {
                compressType = targetTable.getRemoteTable().getParameters().get("parquet.compression");
break;
case FORMAT_CSV_PLAIN:
+ case FORMAT_TEXT:
                compressType = targetTable.getRemoteTable().getParameters().get("text.compression");
if (Strings.isNullOrEmpty(compressType)) {
                    compressType = ConnectContext.get().getSessionVariable().hiveTextCompression();
@@ -213,8 +216,13 @@ public class HiveTableSink extends BaseExternalTableDataSink {
private void setSerDeProperties(THiveTableSink tSink) {
THiveSerDeProperties serDeProperties = new THiveSerDeProperties();
Table table = targetTable.getRemoteTable();
+ String serDeLib = table.getSd().getSerdeInfo().getSerializationLib();
// 1. set field delimiter
- serDeProperties.setFieldDelim(HiveProperties.getFieldDelimiter(table));
+        if (HiveMetaStoreClientHelper.HIVE_MULTI_DELIMIT_SERDE.equals(serDeLib)) {
+            serDeProperties.setFieldDelim(HiveProperties.getFieldDelimiter(table, true));
+        } else {
+            serDeProperties.setFieldDelim(HiveProperties.getFieldDelimiter(table));
+        }
// 2. set line delimiter
serDeProperties.setLineDelim(HiveProperties.getLineDelimiter(table));
// 3. set collection delimiter
diff --git a/regression-test/data/external_table_p0/hive/test_multi_delimit_serde.out b/regression-test/data/external_table_p0/hive/test_multi_delimit_serde.out
new file mode 100644
index 00000000000..f2bac26a2a1
Binary files /dev/null and b/regression-test/data/external_table_p0/hive/test_multi_delimit_serde.out differ
diff --git a/regression-test/suites/external_table_p0/hive/test_multi_delimit_serde.groovy b/regression-test/suites/external_table_p0/hive/test_multi_delimit_serde.groovy
new file mode 100644
index 00000000000..8823a169ff5
--- /dev/null
+++ b/regression-test/suites/external_table_p0/hive/test_multi_delimit_serde.groovy
@@ -0,0 +1,84 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+suite("test_multi_delimit_serde", "p0,external,hive,external_docker,external_docker_hive") {
+ String enabled = context.config.otherConfigs.get("enableHiveTest")
+ if (enabled == null || !enabled.equalsIgnoreCase("true")) {
+ logger.info("disable Hive test.")
+ return;
+ }
+
+ for (String hivePrefix : ["hive2", "hive3"]) {
+        String hms_port = context.config.otherConfigs.get(hivePrefix + "HmsPort")
+ String catalog_name = "${hivePrefix}_test_multi_delimit_serde"
+ String externalEnvIp = context.config.otherConfigs.get("externalEnvIp")
+
+ sql """drop catalog if exists ${catalog_name}"""
+ sql """create catalog if not exists ${catalog_name} properties (
+ "type"="hms",
+ 'hive.metastore.uris' = 'thrift://${externalEnvIp}:${hms_port}'
+ );"""
+
+ logger.info("catalog " + catalog_name + " created")
+ sql """switch ${catalog_name};"""
+ logger.info("switched to catalog " + catalog_name)
+
+ sql """use regression;"""
+
+ try {
+            // Test 1: MultiDelimitSerDe with |+| delimiter - using pre-created table
+ qt_01 """SELECT * FROM multi_delimit_test ORDER BY k1"""
+
+            // Test 2: Different multi-character delimiter - using pre-created table
+ qt_02 """SELECT * FROM multi_delimit_test2 ORDER BY id"""
+
+            // Test 3: Complex types with array and map to test collection.delim and mapkey.delim
+            logger.info("Test 3: Using pre-created table with array and map types")
+            qt_03 """SELECT id, name, tags, properties FROM multi_delimit_complex_test ORDER BY id"""
+
+            // Test 4: Insert data using Doris to write to Hive MultiDelimitSerDe tables
+            logger.info("Test 4: Testing Doris INSERT to Hive MultiDelimitSerDe tables")
+
+ // Test 4.1: Insert to basic multi-delimit table
+            sql """INSERT INTO multi_delimit_test VALUES (4, 400, 'test4'), (5, 500, 'test5')"""
+            qt_04 """SELECT * FROM multi_delimit_test WHERE k1 >= 4 ORDER BY k1"""
+
+ // Test 4.2: Insert to double-pipe delimited table
+            sql """INSERT INTO multi_delimit_test2 VALUES (4, 4.5, 'description4'), (5, 5.5, 'description5')"""
+            qt_05 """SELECT * FROM multi_delimit_test2 WHERE id >= 4 ORDER BY id"""
+
+ // Test 4.3: Insert to complex types table with arrays and maps
+ sql """INSERT INTO multi_delimit_complex_test VALUES
+                (3, 'user3', ARRAY('tagX', 'tagY'), MAP('newkey', 'newvalue'), ARRAY(ARRAY(7, 8)))"""
+            qt_06 """SELECT id, name, tags, properties FROM multi_delimit_complex_test WHERE id = 3 ORDER BY id"""
+
+ // Test 5: Show create table to check SerDe properties
+ logger.info("Test 5: Checking show create table")
+            def createTableResult = sql """SHOW CREATE TABLE multi_delimit_test"""
+ logger.info("Create table result: " + createTableResult.toString())
+
+            assertTrue(createTableResult.toString().contains("MultiDelimitSerDe"))
+ assertTrue(createTableResult.toString().contains("field.delim"))
+ } catch (Exception e) {
+            logger.warn("Test failed, this might be expected if Hive version doesn't support MultiDelimitSerDe: " + e.getMessage())
+ if (e.getMessage().contains("Unsupported hive table serde")) {
+                logger.info("Got expected 'Unsupported hive table serde' error before implementing MultiDelimitSerDe support")
+ }
+ }
+ sql """drop catalog if exists ${catalog_name}"""
+ }
+}
\ No newline at end of file