This is an automated email from the ASF dual-hosted git repository.
yiguolei pushed a commit to branch branch-4.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-4.0 by this push:
new f5b8f317208 branch-4.0: [fix](load) fix multi byte char load #56353
(#56517)
f5b8f317208 is described below
commit f5b8f3172084776f4718077c518595ee399812fc
Author: github-actions[bot]
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Sat Sep 27 21:47:28 2025 +0800
branch-4.0: [fix](load) fix multi byte char load #56353 (#56517)
Cherry-picked from #56353
Co-authored-by: TengJianPing <[email protected]>
---
be/src/vec/sink/vtablet_block_convertor.cpp | 70 +++++++++++--------
.../test_insert_strict_mode_and_filter_ratio.out | 12 ++++
...test_insert_strict_mode_and_filter_ratio.groovy | 78 ++++++++++++++++++++++
3 files changed, 131 insertions(+), 29 deletions(-)
diff --git a/be/src/vec/sink/vtablet_block_convertor.cpp
b/be/src/vec/sink/vtablet_block_convertor.cpp
index c88de1a5dd7..521febba99a 100644
--- a/be/src/vec/sink/vtablet_block_convertor.cpp
+++ b/be/src/vec/sink/vtablet_block_convertor.cpp
@@ -235,35 +235,12 @@ Status OlapTableBlockConvertor::_internal_validate_column(
}
if (invalid_count) {
- if (state->enable_insert_strict()) {
- for (size_t j = 0; j < row_count; ++j) {
- auto row = rows ? (*rows)[j] : j;
- if (need_to_validate(j, row)) {
- auto str_val = column_string->get_data_at(j);
- bool invalid = str_val.size > limit;
- if (invalid) {
- if (str_val.size > len) {
- fmt::format_to(error_msg, "{}",
- "the length of input is too
long than schema. ");
- fmt::format_to(error_msg, "first 32 bytes of
input str: [{}] ",
- str_val.to_prefix(32));
- fmt::format_to(error_msg, "schema length: {};
", len);
- fmt::format_to(error_msg, "actual length: {};
", str_val.size);
- } else if (str_val.size > limit) {
- fmt::format_to(
- error_msg, "{}",
- "the length of input string is too
long than vec schema. ");
- fmt::format_to(error_msg, "first 32 bytes of
input str: [{}] ",
- str_val.to_prefix(32));
- fmt::format_to(error_msg, "schema length: {};
", len);
- fmt::format_to(error_msg, "limit length: {};
", limit);
- fmt::format_to(error_msg, "actual length: {};
", str_val.size);
- }
-
RETURN_IF_ERROR(set_invalid_and_append_error_msg(row));
- }
- }
- }
- } else if (type_str) {
+ // For string columns in non-strict load mode (both insert stmt and stream load),
+ // truncate the string to the schema length.
+ // After truncation, we still need to check whether the byte length of each row exceeds the schema length,
+ // because the schema length is currently defined in bytes, while substring works in units of chars.
+ // This is a workaround for now; it needs to be improved once multi-byte chars are better supported.
+ if (type_str && !state->enable_insert_strict()) {
ColumnsWithTypeAndName argument_template;
auto pos_type = DataTypeFactory::instance().create_data_type(
FieldType::OLAP_FIELD_TYPE_INT, 0, 0);
@@ -287,6 +264,41 @@ Status OlapTableBlockConvertor::_internal_validate_column(
RETURN_IF_ERROR(func->execute(nullptr, tmp_block, {0, 1, 2},
3, row_count));
block->get_by_position(slot_index).column =
std::move(tmp_block.get_by_position(3).column);
+ const auto* tmp_column_ptr =
+
vectorized::check_and_get_column<vectorized::ColumnNullable>(
+ *block->get_by_position(slot_index).column);
+ const auto& tmp_real_column_ptr =
+ tmp_column_ptr == nullptr ?
block->get_by_position(slot_index).column
+ :
(tmp_column_ptr->get_nested_column_ptr());
+ column_string =
+ assert_cast<const
vectorized::ColumnString*>(tmp_real_column_ptr.get());
+ }
+ for (size_t j = 0; j < row_count; ++j) {
+ auto row = rows ? (*rows)[j] : j;
+ if (need_to_validate(j, row)) {
+ auto str_val = column_string->get_data_at(j);
+ bool invalid = str_val.size > limit;
+ if (invalid) {
+ if (str_val.size > len) {
+ fmt::format_to(error_msg, "{}",
+ "the length of input is too long
than schema. ");
+ fmt::format_to(error_msg, "first 32 bytes of input
str: [{}] ",
+ str_val.to_prefix(32));
+ fmt::format_to(error_msg, "schema length: {}; ",
len);
+ fmt::format_to(error_msg, "actual length: {}; ",
str_val.size);
+ } else if (str_val.size > limit) {
+ fmt::format_to(
+ error_msg, "{}",
+ "the length of input string is too long
than vec schema. ");
+ fmt::format_to(error_msg, "first 32 bytes of input
str: [{}] ",
+ str_val.to_prefix(32));
+ fmt::format_to(error_msg, "schema length: {}; ",
len);
+ fmt::format_to(error_msg, "limit length: {}; ",
limit);
+ fmt::format_to(error_msg, "actual length: {}; ",
str_val.size);
+ }
+ RETURN_IF_ERROR(set_invalid_and_append_error_msg(row));
+ }
+ }
}
}
return Status::OK();
diff --git
a/regression-test/data/load_p0/insert/test_insert_strict_mode_and_filter_ratio.out
b/regression-test/data/load_p0/insert/test_insert_strict_mode_and_filter_ratio.out
index 1f431345896..7872fa4bef8 100644
---
a/regression-test/data/load_p0/insert/test_insert_strict_mode_and_filter_ratio.out
+++
b/regression-test/data/load_p0/insert/test_insert_strict_mode_and_filter_ratio.out
@@ -78,3 +78,15 @@
-- !sql_string_exceed_len_strict1 --
+-- !sql_mb_string_exceed_len_non_strict0 --
+
+-- !sql_mb_string_exceed_len_non_strict1 --
+1 a
+2 b
+3 c
+4 d
+5 e
+6 f
+
+-- !sql_mb_string_exceed_len_strict0 --
+
diff --git
a/regression-test/suites/load_p0/insert/test_insert_strict_mode_and_filter_ratio.groovy
b/regression-test/suites/load_p0/insert/test_insert_strict_mode_and_filter_ratio.groovy
index 893dfc5dec0..5fc9a00d086 100644
---
a/regression-test/suites/load_p0/insert/test_insert_strict_mode_and_filter_ratio.groovy
+++
b/regression-test/suites/load_p0/insert/test_insert_strict_mode_and_filter_ratio.groovy
@@ -406,4 +406,82 @@ suite("test_insert_strict_mode_and_filter_ratio","p0") {
exception """url"""
}
qt_sql_string_exceed_len_strict1 "select * from
test_insert_strict_mode_and_filter_ratio order by 1"
+
+ // TODO: change the following test cases when BE supports mbstring length check
+ // 6 test Chinese char
+ // 6.1 string exceed schema length, enable_insert_strict=false,
insert_max_filter_ratio=0.3, load fail
+ sql """
+ drop table if exists test_insert_strict_mode_and_filter_ratio;
+ """
+ sql """
+ create table test_insert_strict_mode_and_filter_ratio (
+ id int,
+ name char(1)
+ ) properties ('replication_num' = '1');
+ """
+ sql "set enable_insert_strict=false"
+ sql "set enable_strict_cast=true"
+ sql "set insert_max_filter_ratio=0.3"
+ test {
+ sql """
+ insert into test_insert_strict_mode_and_filter_ratio values
+ (1, "a"),
+ (2, "b"),
+ (3, "c"),
+ (4, "d"),
+ (5, "e"),
+ (6, "f"),
+ (7, "宅z"),
+ (8, "兹z"),
+ (9, "中z"),
+ (10, "国g");
+ """
+ exception """Insert has too many filtered data"""
+ }
+ qt_sql_mb_string_exceed_len_non_strict0 "select * from
test_insert_strict_mode_and_filter_ratio order by 1"
+
+ // 6.2 string exceed schema length, enable_insert_strict=false,
insert_max_filter_ratio=0.4, load success
+ sql """
+ truncate table test_insert_strict_mode_and_filter_ratio;
+ """
+ sql "set insert_max_filter_ratio=0.4"
+ sql """
+ insert into test_insert_strict_mode_and_filter_ratio values
+ (1, "a"),
+ (2, "b"),
+ (3, "c"),
+ (4, "d"),
+ (5, "e"),
+ (6, "f"),
+ (7, "宅z"),
+ (8, "兹z"),
+ (9, "中z"),
+ (10, "国g");
+ """
+ qt_sql_mb_string_exceed_len_non_strict1 "select * from
test_insert_strict_mode_and_filter_ratio order by 1"
+
+ // 6.3 string exceed schema length, enable_insert_strict=true,
insert_max_filter_ratio=1, load fail
+ sql """
+ truncate table test_insert_strict_mode_and_filter_ratio;
+ """
+ sql "set enable_insert_strict=true"
+ sql "set enable_strict_cast=false"
+ sql "set insert_max_filter_ratio=1"
+ test {
+ sql """
+ insert into test_insert_strict_mode_and_filter_ratio values
+ (1, "a"),
+ (2, "b"),
+ (3, "c"),
+ (4, "d"),
+ (5, "e"),
+ (6, "f"),
+ (7, "宅z"),
+ (8, "兹z"),
+ (9, "中z"),
+ (10, "国g");
+ """
+ exception """Insert has filtered data in strict mode"""
+ }
+ qt_sql_mb_string_exceed_len_strict0 "select * from
test_insert_strict_mode_and_filter_ratio order by 1"
}
\ No newline at end of file
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]