This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new 582be130dd [Feature] (ODBC) support read/write emoji of utf16 via odbc table (#11863) 582be130dd is described below commit 582be130dd08cfe1d5ab390d7237e67aa2a0837f Author: HappenLee <happen...@hotmail.com> AuthorDate: Thu Aug 18 09:09:02 2022 +0800 [Feature] (ODBC) support read/write emoji of utf16 via odbc table (#11863) Co-authored-by: lihaopeng <lihaop...@baidu.com> --- be/src/exec/odbc_connector.cpp | 26 +++++++--------------- .../docs/ecosystem/external-table/odbc-of-doris.md | 5 ++++- .../docs/ecosystem/external-table/odbc-of-doris.md | 4 ++++ .../main/java/org/apache/doris/catalog/Env.java | 1 + 4 files changed, 17 insertions(+), 19 deletions(-) diff --git a/be/src/exec/odbc_connector.cpp b/be/src/exec/odbc_connector.cpp index 5ca74080df..169b626726 100644 --- a/be/src/exec/odbc_connector.cpp +++ b/be/src/exec/odbc_connector.cpp @@ -48,14 +48,9 @@ static constexpr uint32_t BIG_COLUMN_SIZE_BUFFER = 65535; // Default max buffer size use in insert to: 50MB, normally a batch is smaller than the size static constexpr uint32_t INSERT_BUFFER_SIZE = 1024l * 1024 * 50; -static doris::Status utf8_to_wstring(const std::string& str, std::u16string& out) { - std::wstring_convert<std::codecvt_utf8<char16_t>, char16_t> utf8_ucs2_cvt; - try { - out = utf8_ucs2_cvt.from_bytes(str); - } catch (std::range_error& e) { - return doris::Status::InternalError("UNICODE out of supported range"); - } - return doris::Status::OK(); +static std::u16string utf8_to_u16string(const char* first, const char* last) { + std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> utf8_utf16_cvt; + return utf8_utf16_cvt.from_bytes(first, last); } namespace doris { @@ -133,8 +128,7 @@ Status ODBCConnector::query() { "alloc statement"); // Translate utf8 string to utf16 to use unicode encoding - std::u16string wquery; - RETURN_IF_ERROR(utf8_to_wstring(_sql_str, wquery)); + auto wquery = 
utf8_to_u16string(_sql_str.c_str(), _sql_str.c_str() + _sql_str.length()); ODBC_DISPOSE(_stmt, SQL_HANDLE_STMT, SQLExecDirectW(_stmt, (SQLWCHAR*)(wquery.c_str()), SQL_NTS), "exec direct"); @@ -313,10 +307,8 @@ Status ODBCConnector::append(const std::string& table_name, RowBatch* batch, } } // Translate utf8 string to utf16 to use unicode encodeing - RETURN_IF_ERROR(utf8_to_wstring( - std::string(_insert_stmt_buffer.data(), - _insert_stmt_buffer.data() + _insert_stmt_buffer.size()), - insert_stmt)); + insert_stmt = utf8_to_u16string(_insert_stmt_buffer.data(), + _insert_stmt_buffer.data() + _insert_stmt_buffer.size()); } { @@ -499,10 +491,8 @@ Status ODBCConnector::append(const std::string& table_name, vectorized::Block* b } } // Translate utf8 string to utf16 to use unicode encodeing - RETURN_IF_ERROR(utf8_to_wstring( - std::string(_insert_stmt_buffer.data(), - _insert_stmt_buffer.data() + _insert_stmt_buffer.size()), - insert_stmt)); + insert_stmt = utf8_to_u16string(_insert_stmt_buffer.data(), + _insert_stmt_buffer.data() + _insert_stmt_buffer.size()); } { diff --git a/docs/en/docs/ecosystem/external-table/odbc-of-doris.md b/docs/en/docs/ecosystem/external-table/odbc-of-doris.md index 2d850f8073..5c3c35595b 100644 --- a/docs/en/docs/ecosystem/external-table/odbc-of-doris.md +++ b/docs/en/docs/ecosystem/external-table/odbc-of-doris.md @@ -381,5 +381,8 @@ This is the compatibility problem between MySQL database ODBC driver and existin Connection to the database fails. The` Err: part` represents the error of different database connection failures. This is usually a configuration problem. You should check whether the IP address, port or account password are mismatched. - + 11. Garbled text appears when reading and writing emoji in a MySQL ODBC table + + The default encoding used by Doris when connecting to ODBC tables is utf8. Since the default utf8 encoding in MySQL is utf8mb3, it cannot represent emoji, which need 4-byte encoding. 
You need to set `charset`=`utf8mb4` when you create ODBC MySQL tables; then you can read and write emoji normally 😀. + diff --git a/docs/zh-CN/docs/ecosystem/external-table/odbc-of-doris.md b/docs/zh-CN/docs/ecosystem/external-table/odbc-of-doris.md index 8d1df916d6..05cb3d81a1 100644 --- a/docs/zh-CN/docs/ecosystem/external-table/odbc-of-doris.md +++ b/docs/zh-CN/docs/ecosystem/external-table/odbc-of-doris.md @@ -371,3 +371,7 @@ sudo alien -i oracle-instantclient19.13-sqlplus-19.13.0.0.0-2.x86_64.rpm 10. 报错`driver connect Err: xxx` 通常是连接数据库失败,Err部分代表了不同的数据库连接失败的报错。这种情况通常是配置存在问题。可以检查是否错配了ip地址,端口或账号密码。 + +11. 读写mysql外表的emoji表情出现乱码 + + Doris进行odbc外表连接时,默认采用的编码为utf8,由于mysql之中默认的utf8编码为utf8mb3,无法表示需要4字节编码的emoji表情。这里需要在建立mysql外表时设置`charset`=`utf8mb4`,便可以正常读写emoji表情😀。 diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java index 53ddeab48d..75208057c5 100755 --- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java +++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java @@ -2954,6 +2954,7 @@ public class Env { sb.append("\"password\" = \"").append(hidePassword ? "" : odbcTable.getPasswd()).append("\",\n"); sb.append("\"driver\" = \"").append(odbcTable.getOdbcDriver()).append("\",\n"); sb.append("\"odbc_type\" = \"").append(odbcTable.getOdbcTableTypeName()).append("\",\n"); + sb.append("\"charest\" = \"").append(odbcTable.getCharset()).append("\",\n"); } else { sb.append("\"odbc_catalog_resource\" = \"").append(odbcTable.getOdbcCatalogResourceName()) .append("\",\n"); --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org