This is an automated email from the ASF dual-hosted git repository.

yiguolei pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 582be130dd [Feature] (ODBC) support read/write emoji of utf16 via odbc 
table (#11863)
582be130dd is described below

commit 582be130dd08cfe1d5ab390d7237e67aa2a0837f
Author: HappenLee <happen...@hotmail.com>
AuthorDate: Thu Aug 18 09:09:02 2022 +0800

    [Feature] (ODBC) support read/write emoji of utf16 via odbc table (#11863)
    
    
    Co-authored-by: lihaopeng <lihaop...@baidu.com>
---
 be/src/exec/odbc_connector.cpp                     | 26 +++++++---------------
 .../docs/ecosystem/external-table/odbc-of-doris.md |  5 ++++-
 .../docs/ecosystem/external-table/odbc-of-doris.md |  4 ++++
 .../main/java/org/apache/doris/catalog/Env.java    |  1 +
 4 files changed, 17 insertions(+), 19 deletions(-)

diff --git a/be/src/exec/odbc_connector.cpp b/be/src/exec/odbc_connector.cpp
index 5ca74080df..169b626726 100644
--- a/be/src/exec/odbc_connector.cpp
+++ b/be/src/exec/odbc_connector.cpp
@@ -48,14 +48,9 @@ static constexpr uint32_t BIG_COLUMN_SIZE_BUFFER = 65535;
 // Default max buffer size use in insert to: 50MB, normally a batch is smaller 
than the size
 static constexpr uint32_t INSERT_BUFFER_SIZE = 1024l * 1024 * 50;
 
-static doris::Status utf8_to_wstring(const std::string& str, std::u16string& 
out) {
-    std::wstring_convert<std::codecvt_utf8<char16_t>, char16_t> utf8_ucs2_cvt;
-    try {
-        out = utf8_ucs2_cvt.from_bytes(str);
-    } catch (std::range_error& e) {
-        return doris::Status::InternalError("UNICODE out of supported range");
-    }
-    return doris::Status::OK();
+static std::u16string utf8_to_u16string(const char* first, const char* last) { // Converts the UTF-8 bytes in [first, last) to a UTF-16 string.
+    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> 
utf8_utf16_cvt;
+    return utf8_utf16_cvt.from_bytes(first, last); // NOTE(review): from_bytes can throw std::range_error on input it cannot convert; unlike the removed utf8_to_wstring, nothing catches it here - confirm callers cope.
 }
 
 namespace doris {
@@ -133,8 +128,7 @@ Status ODBCConnector::query() {
                  "alloc statement");
 
     // Translate utf8 string to utf16 to use unicode encoding
-    std::u16string wquery;
-    RETURN_IF_ERROR(utf8_to_wstring(_sql_str, wquery));
+    auto wquery = utf8_to_u16string(_sql_str.c_str(), _sql_str.c_str() + 
_sql_str.length());
     ODBC_DISPOSE(_stmt, SQL_HANDLE_STMT,
                  SQLExecDirectW(_stmt, (SQLWCHAR*)(wquery.c_str()), SQL_NTS), 
"exec direct");
 
@@ -313,10 +307,8 @@ Status ODBCConnector::append(const std::string& 
table_name, RowBatch* batch,
             }
         }
         // Translate utf8 string to utf16 to use unicode encoding
-        RETURN_IF_ERROR(utf8_to_wstring(
-                std::string(_insert_stmt_buffer.data(),
-                            _insert_stmt_buffer.data() + 
_insert_stmt_buffer.size()),
-                insert_stmt));
+        insert_stmt = utf8_to_u16string(_insert_stmt_buffer.data(),
+                                        _insert_stmt_buffer.data() + 
_insert_stmt_buffer.size());
     }
 
     {
@@ -499,10 +491,8 @@ Status ODBCConnector::append(const std::string& 
table_name, vectorized::Block* b
             }
         }
         // Translate utf8 string to utf16 to use unicode encoding
-        RETURN_IF_ERROR(utf8_to_wstring(
-                std::string(_insert_stmt_buffer.data(),
-                            _insert_stmt_buffer.data() + 
_insert_stmt_buffer.size()),
-                insert_stmt));
+        insert_stmt = utf8_to_u16string(_insert_stmt_buffer.data(),
+                                        _insert_stmt_buffer.data() + 
_insert_stmt_buffer.size());
     }
 
     {
diff --git a/docs/en/docs/ecosystem/external-table/odbc-of-doris.md 
b/docs/en/docs/ecosystem/external-table/odbc-of-doris.md
index 2d850f8073..5c3c35595b 100644
--- a/docs/en/docs/ecosystem/external-table/odbc-of-doris.md
+++ b/docs/en/docs/ecosystem/external-table/odbc-of-doris.md
@@ -381,5 +381,8 @@ This is the compatibility problem between MySQL database 
ODBC driver and existin
 
     Connection to the database fails. The `Err:` part represents the error of 
different database connection failures. This is usually a configuration 
problem. You should check whether the IP address, port, or account password is 
mismatched.
 
-    
+ 11. Garbled characters appear when reading and writing emoji in a MySQL ODBC 
table
+
+    The default encoding used by Doris when connecting to ODBC tables is utf8. 
Because the default utf8 encoding in MySQL is utf8mb3, it cannot represent 
emoji, which require a 4-byte encoding. To fix this, set 
`charset`=`utf8mb4` when you create the MySQL ODBC table; emoji can then be 
read and written normally 😀.
+
 
diff --git a/docs/zh-CN/docs/ecosystem/external-table/odbc-of-doris.md 
b/docs/zh-CN/docs/ecosystem/external-table/odbc-of-doris.md
index 8d1df916d6..05cb3d81a1 100644
--- a/docs/zh-CN/docs/ecosystem/external-table/odbc-of-doris.md
+++ b/docs/zh-CN/docs/ecosystem/external-table/odbc-of-doris.md
@@ -371,3 +371,7 @@ sudo alien -i  
oracle-instantclient19.13-sqlplus-19.13.0.0.0-2.x86_64.rpm
 10. 报错`driver connect Err: xxx`
 
     通常是连接数据库失败,Err部分代表了不同的数据库连接失败的报错。这种情况通常是配置存在问题。可以检查是否错配了ip地址,端口或账号密码。
+    
+11. 读写mysql外表的emoji表情出现乱码
+
+    
Doris进行odbc外表连接时,默认采用的编码为utf8,由于mysql之中默认的utf8编码为utf8mb3,无法表示需要4字节编码的emoji表情。这里需要在建立mysql外表时设置`charset`=`utf8mb4`,便可以正常读写emoji表情😀。
diff --git a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java 
b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java
index 53ddeab48d..75208057c5 100755
--- a/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java
+++ b/fe/fe-core/src/main/java/org/apache/doris/catalog/Env.java
@@ -2954,6 +2954,7 @@ public class Env {
                 sb.append("\"password\" = \"").append(hidePassword ? "" : 
odbcTable.getPasswd()).append("\",\n");
                 sb.append("\"driver\" = 
\"").append(odbcTable.getOdbcDriver()).append("\",\n");
                 sb.append("\"odbc_type\" = 
\"").append(odbcTable.getOdbcTableTypeName()).append("\",\n");
+                sb.append("\"charset\" = 
\"").append(odbcTable.getCharset()).append("\",\n"); // key is "charset", the property name users set when creating the ODBC table
             } else {
                 sb.append("\"odbc_catalog_resource\" = 
\"").append(odbcTable.getOdbcCatalogResourceName())
                         .append("\",\n");


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to