This is an automated email from the ASF dual-hosted git repository.

eldenmoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new 195fb730d40 [improve](serde) support json string format with escaped 
charactors (#37176)
195fb730d40 is described below

commit 195fb730d409c522da7bf03705cf9adb633fb679
Author: amory <wangqian...@selectdb.com>
AuthorDate: Thu Jul 4 15:06:06 2024 +0800

    [improve](serde) support json string format with escaped charactors (#37176)
    
    before this pr
    if we use some escape from nested types like array, we can not make this
    string cast to json
---
 .../vec/data_types/serde/data_type_string_serde.h  | 46 ++++++++++++-
 be/src/vec/functions/function_cast.h               |  1 +
 regression-test/data/jsonb_p0/test_jsonb_cast.csv  |  4 ++
 regression-test/data/jsonb_p0/test_jsonb_cast.out  | 27 ++++++++
 .../suites/jsonb_p0/test_jsonb_cast.groovy         | 79 ++++++++++++++++++++++
 5 files changed, 154 insertions(+), 3 deletions(-)

diff --git a/be/src/vec/data_types/serde/data_type_string_serde.h 
b/be/src/vec/data_types/serde/data_type_string_serde.h
index b74b5857086..24f99a12e67 100644
--- a/be/src/vec/data_types/serde/data_type_string_serde.h
+++ b/be/src/vec/data_types/serde/data_type_string_serde.h
@@ -73,19 +73,59 @@ public:
         auto result = check_column_const_set_readability(column, row_num);
         ColumnPtr ptr = result.first;
         row_num = result.second;
+        const auto& value = assert_cast<const 
ColumnType&>(*ptr).get_data_at(row_num);
 
         if (_nesting_level > 1) {
             bw.write('"');
         }
-
-        const auto& value = assert_cast<const 
ColumnType&>(*ptr).get_data_at(row_num);
-        bw.write(value.data, value.size);
+        if constexpr (std::is_same_v<ColumnType, ColumnString>) {
+            if (options.escape_char != 0) {
+                // we should make deal with some special characters in json 
str if we have escape_char
+                StringRef str_ref = value;
+                write_with_escaped_char_to_json(str_ref, bw);
+            } else {
+                bw.write(value.data, value.size);
+            }
+        } else {
+            bw.write(value.data, value.size);
+        }
         if (_nesting_level > 1) {
             bw.write('"');
         }
+
         return Status::OK();
     }
 
+    inline void write_with_escaped_char_to_json(StringRef value, 
BufferWritable& bw) const {
+        for (char it : value) {
+            switch (it) {
+            case '\b':
+                bw.write("\\b", 2);
+                break;
+            case '\f':
+                bw.write("\\f", 2);
+                break;
+            case '\n':
+                bw.write("\\n", 2);
+                break;
+            case '\r':
+                bw.write("\\r", 2);
+                break;
+            case '\t':
+                bw.write("\\t", 2);
+                break;
+            case '\\':
+                bw.write("\\\\", 2);
+                break;
+            case '"':
+                bw.write("\\\"", 2);
+                break;
+            default:
+                bw.write(it);
+            }
+        }
+    }
+
     Status serialize_column_to_json(const IColumn& column, int start_idx, int 
end_idx,
                                     BufferWritable& bw, FormatOptions& 
options) const override {
         SERIALIZE_COLUMN_TO_JSON();
diff --git a/be/src/vec/functions/function_cast.h 
b/be/src/vec/functions/function_cast.h
index 17250e10fd7..d4b21aacc5c 100644
--- a/be/src/vec/functions/function_cast.h
+++ b/be/src/vec/functions/function_cast.h
@@ -766,6 +766,7 @@ struct ConvertImplGenericToJsonb {
 
         auto tmp_col = ColumnString::create();
         vectorized::DataTypeSerDe::FormatOptions options;
+        options.escape_char = '\\';
         for (size_t i = 0; i < input_rows_count; i++) {
             // convert to string
             tmp_col->clear();
diff --git a/regression-test/data/jsonb_p0/test_jsonb_cast.csv 
b/regression-test/data/jsonb_p0/test_jsonb_cast.csv
new file mode 100644
index 00000000000..08b694ddea8
--- /dev/null
+++ b/regression-test/data/jsonb_p0/test_jsonb_cast.csv
@@ -0,0 +1,4 @@
+1      \N
+2      ['{\'x\' : \'{"y" : 1}\', \'t\' : \'{"y" : 2}\'}', '{"x" : 1}']
+3      ['foo\'bar', 'foo"bar', 'foo\\'bar', 'foo\'\'bar']
+4      ['\/some\/cool\/url', '/some/cool/url', 
'a\\_\\c\\l\\i\\c\\k\\h\\o\\u\\s\\e']
\ No newline at end of file
diff --git a/regression-test/data/jsonb_p0/test_jsonb_cast.out 
b/regression-test/data/jsonb_p0/test_jsonb_cast.out
new file mode 100644
index 00000000000..2ab4174c746
--- /dev/null
+++ b/regression-test/data/jsonb_p0/test_jsonb_cast.out
@@ -0,0 +1,27 @@
+-- This file is automatically generated. You should know what you did if you 
want to edit this
+-- !select_1 --
+1      \N
+2      ["{\\'x\\' : \\'{"y" : 1}\\', \\'t\\' : \\'{"y" : 2}\\'}", "{"x" : 1}"]
+3      ["foo\\'bar', 'foo"bar', 'foo\\\\'bar', 'foo\\'\\'bar"]
+4      ["\\/some\\/cool\\/url", "/some/cool/url", 
"a\\\\_\\\\c\\\\l\\\\i\\\\c\\\\k\\\\h\\\\o\\\\u\\\\s\\\\e"]
+
+-- !select_2 --
+1      \N
+2      ["{\\'x\\' : \\'{"y" : 1}\\', \\'t\\' : \\'{"y" : 2}\\'}", "{"x" : 1}"]
+3      ["foo\\'bar', 'foo"bar', 'foo\\\\'bar', 'foo\\'\\'bar"]
+4      ["\\/some\\/cool\\/url", "/some/cool/url", 
"a\\\\_\\\\c\\\\l\\\\i\\\\c\\\\k\\\\h\\\\o\\\\u\\\\s\\\\e"]
+27     ["{"k1":"v1", "k2": 200}"]
+28     ["{"a.b.c":{"k1.a1":"v31", "k2": 300},"a":"niu"}"]
+29     ["\n\r", "\n\r"]
+30     ["f\r\n", "f\r\n""]
+
+-- !select_json --
+1      \N
+2      ["{\\\\'x\\\\' : \\\\'{\\"y\\" : 1}\\\\', \\\\'t\\\\' : \\\\'{\\"y\\" : 
2}\\\\'}","{\\"x\\" : 1}"]
+3      ["foo\\\\'bar', 'foo\\"bar', 'foo\\\\\\\\'bar', 'foo\\\\'\\\\'bar"]
+4      
["\\\\/some\\\\/cool\\\\/url","/some/cool/url","a\\\\\\\\_\\\\\\\\c\\\\\\\\l\\\\\\\\i\\\\\\\\c\\\\\\\\k\\\\\\\\h\\\\\\\\o\\\\\\\\u\\\\\\\\s\\\\\\\\e"]
+27     ["{\\"k1\\":\\"v1\\", \\"k2\\": 200}"]
+28     ["{\\"a.b.c\\":{\\"k1.a1\\":\\"v31\\", \\"k2\\": 
300},\\"a\\":\\"niu\\"}"]
+29     ["\\f\\n\\r","\\f\\n\\r"]
+30     ["f\\b\\r\\n","f\\b\\r\\n\\""]
+
diff --git a/regression-test/suites/jsonb_p0/test_jsonb_cast.groovy 
b/regression-test/suites/jsonb_p0/test_jsonb_cast.groovy
new file mode 100644
index 00000000000..4d1b2aa7181
--- /dev/null
+++ b/regression-test/suites/jsonb_p0/test_jsonb_cast.groovy
@@ -0,0 +1,79 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+import org.codehaus.groovy.runtime.IOGroovyMethods
+
+suite("test_jsonb_cast", "p0") {
+
+    // define a sql table with array<text> which has some Escape Character and 
should also to cast to json
+    def testTable = "tbl_test_array_text_cast_jsonb"
+    def dataFile = "test_jsonb_cast.csv"
+
+    sql """ set experimental_enable_nereids_planner = true """
+    sql """ set enable_fallback_to_original_planner = true """
+
+    sql "DROP TABLE IF EXISTS ${testTable}"
+
+    sql """
+        CREATE TABLE IF NOT EXISTS ${testTable} (
+            id INT,
+            a ARRAY<TEXT>,
+        )
+        DUPLICATE KEY(id)
+        DISTRIBUTED BY HASH(id) BUCKETS 3
+        PROPERTIES("replication_num" = "1");
+        """
+
+    // load the jsonb data from csv file
+    streamLoad {
+        table testTable
+        
+        file dataFile // import csv file
+        time 10000 // limit inflight 10s
+        set 'strict_mode', 'true'
+
+        // if declared a check callback, the default check condition will 
ignore.
+        // So you must check all condition
+        check { result, exception, startTime, endTime ->
+            if (exception != null) {
+                throw exception
+            }
+            log.info("Stream load result: ${result}".toString())
+            def json = parseJson(result)
+            assertEquals(4, json.NumberTotalRows)
+            assertEquals(4, json.NumberLoadedRows)
+            assertTrue(json.LoadBytes > 0)
+        }
+    }
+
+    sql """ sync; """
+
+    // check result
+    qt_select_1 "SELECT * FROM ${testTable} ORDER BY id"
+
+
+    // insert into valid json rows
+    sql """INSERT INTO ${testTable} VALUES(27, ['{"k1":"v1", "k2": 200}'])"""
+    sql """INSERT INTO ${testTable} VALUES(28, ['{"a.b.c":{"k1.a1":"v31", 
"k2": 300},"a":"niu"}'])"""
+    sql """INSERT INTO ${testTable} VALUES(29, ['\f\n\r', "\f\n\r"])"""
+    sql """INSERT INTO ${testTable} VALUES(30, ["\\f\\b\\r\\n", 
'\\f\\b\\r\\n"'])"""
+
+    // check result
+    qt_select_2 "SELECT * FROM ${testTable} ORDER BY id"
+    // check cast as json
+    qt_select_json "SELECT id, cast(a as JSON) FROM ${testTable} ORDER BY id"
+}
\ No newline at end of file


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to