This is an automated email from the ASF dual-hosted git repository.

airborne pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new fb17f204d7a [fix](http) fix http url with incorrect character notation 
(#38420) (#39535)
fb17f204d7a is described below

commit fb17f204d7ae5e41d410e252e3e401bc4619e0b9
Author: Sun Chenyang <csun5...@gmail.com>
AuthorDate: Mon Aug 19 15:03:19 2024 +0800

    [fix](http) fix http url with incorrect character notation (#38420) (#39535)
    
    ## Proposed changes
    
    pick from master #38420
---
 be/src/http/http_client.cpp                        | 60 ++++++++++++++++++-
 be/src/http/http_client.h                          |  9 +++
 be/src/olap/single_replica_compaction.cpp          | 15 +----
 be/src/olap/task/engine_clone_task.cpp             | 20 +------
 be/test/http/http_client_test.cpp                  | 42 ++++++++++++++
 .../suites/load_p2/test_single_replica_load.groovy | 67 ++++++++++++++++++++++
 6 files changed, 179 insertions(+), 34 deletions(-)

diff --git a/be/src/http/http_client.cpp b/be/src/http/http_client.cpp
index 218802878bd..e94614788f5 100644
--- a/be/src/http/http_client.cpp
+++ b/be/src/http/http_client.cpp
@@ -131,8 +131,11 @@ Status HttpClient::init(const std::string& url, bool 
set_fail_on_error) {
         LOG(WARNING) << "fail to set CURLOPT_WRITEDATA, msg=" << 
_to_errmsg(code);
         return Status::InternalError("fail to set CURLOPT_WRITEDATA");
     }
+
+    std::string escaped_url;
+    RETURN_IF_ERROR(_escape_url(url, &escaped_url));
     // set url
-    code = curl_easy_setopt(_curl, CURLOPT_URL, url.c_str());
+    code = curl_easy_setopt(_curl, CURLOPT_URL, escaped_url.c_str());
     if (code != CURLE_OK) {
         LOG(WARNING) << "failed to set CURLOPT_URL, errmsg=" << 
_to_errmsg(code);
         return Status::InternalError("fail to set CURLOPT_URL");
@@ -290,4 +293,59 @@ Status HttpClient::execute_with_retry(int retry_times, int 
sleep_time,
     return status;
 }
 
+// http://example.com/page?param1=value1&param2=value+with+spaces#section
+Status HttpClient::_escape_url(const std::string& url, std::string* 
escaped_url) {
+    size_t query_pos = url.find('?');
+    if (query_pos == std::string::npos) {
+        *escaped_url = url;
+        return Status::OK();
+    }
+    size_t fragment_pos = url.find('#');
+    std::string query;
+    std::string fragment;
+
+    if (fragment_pos == std::string::npos) {
+        query = url.substr(query_pos + 1, url.length() - query_pos - 1);
+    } else {
+        query = url.substr(query_pos + 1, fragment_pos - query_pos - 1);
+        fragment = url.substr(fragment_pos, url.length() - fragment_pos);
+    }
+
+    std::string encoded_query;
+    size_t ampersand_pos = query.find('&');
+    size_t equal_pos;
+
+    if (ampersand_pos == std::string::npos) {
+        ampersand_pos = query.length();
+    }
+
+    while (true) {
+        equal_pos = query.find('=');
+        if (equal_pos != std::string::npos) {
+            std::string key = query.substr(0, equal_pos);
+            std::string value = query.substr(equal_pos + 1, ampersand_pos - 
equal_pos - 1);
+
+            auto encoded_value = std::unique_ptr<char, decltype(&curl_free)>(
+                    curl_easy_escape(_curl, value.c_str(), value.length()), 
&curl_free);
+            if (encoded_value) {
+                encoded_query += key + "=" + std::string(encoded_value.get());
+            } else {
+                return Status::InternalError("escape url failed, url={}", url);
+            }
+        } else {
+            encoded_query += query.substr(0, ampersand_pos);
+        }
+
+        if (ampersand_pos == query.length() || ampersand_pos == 
std::string::npos) {
+            break;
+        }
+
+        encoded_query += "&";
+        query = query.substr(ampersand_pos + 1);
+        ampersand_pos = query.find('&');
+    }
+    *escaped_url = url.substr(0, query_pos + 1) + encoded_query + fragment;
+    return Status::OK();
+}
+
 } // namespace doris
diff --git a/be/src/http/http_client.h b/be/src/http/http_client.h
index 9659de13cfc..f6a1a17ec29 100644
--- a/be/src/http/http_client.h
+++ b/be/src/http/http_client.h
@@ -146,6 +146,15 @@ public:
 
     size_t on_response_data(const void* data, size_t length);
 
+    // The file name of the variant column with the inverted index contains %
+    // such as: 
020000000000003f624c4c322c568271060f9b5b274a4a95_0_10133@properties%2Emessage.idx
+    //  
{rowset_id}_{seg_num}_{index_id}_{variant_column_name}{%2E}{extracted_column_name}.idx
+    // We need to handle %, otherwise it will cause an HTTP 404 error.
+    // Because the percent ("%") character serves as the indicator for 
percent-encoded octets,
+    // it must be percent-encoded as "%25" for that octet to be used as data 
within a URI.
+    // https://datatracker.ietf.org/doc/html/rfc3986
+    Status _escape_url(const std::string& url, std::string* escaped_url);
+
 private:
     const char* _to_errmsg(CURLcode code);
 
diff --git a/be/src/olap/single_replica_compaction.cpp 
b/be/src/olap/single_replica_compaction.cpp
index 7381f5d3c69..393bfb99f7b 100644
--- a/be/src/olap/single_replica_compaction.cpp
+++ b/be/src/olap/single_replica_compaction.cpp
@@ -411,20 +411,7 @@ Status SingleReplicaCompaction::_download_files(DataDir* 
data_dir,
         return Status::InternalError("single compaction init curl failed");
     }
     for (auto& file_name : file_name_list) {
-        // The file name of the variant column with the inverted index 
contains %
-        // such as: 
020000000000003f624c4c322c568271060f9b5b274a4a95_0_10133@properties%2Emessage.idx
-        //  
{rowset_id}_{seg_num}_{index_id}_{variant_column_name}{%2E}{extracted_column_name}.idx
-        // We need to handle %, otherwise it will cause an HTTP 404 error.
-        // Because the percent ("%") character serves as the indicator for 
percent-encoded octets,
-        // it must be percent-encoded as "%25" for that octet to be used as 
data within a URI.
-        // https://datatracker.ietf.org/doc/html/rfc3986
-        auto output = std::unique_ptr<char, decltype(&curl_free)>(
-                curl_easy_escape(curl.get(), file_name.c_str(), 
file_name.length()), &curl_free);
-        if (!output) {
-            return Status::InternalError("escape file name failed, file 
name={}", file_name);
-        }
-        std::string encoded_filename(output.get());
-        auto remote_file_url = remote_url_prefix + encoded_filename;
+        auto remote_file_url = remote_url_prefix + file_name;
 
         // get file length
         uint64_t file_size = 0;
diff --git a/be/src/olap/task/engine_clone_task.cpp 
b/be/src/olap/task/engine_clone_task.cpp
index 300b65527c1..3a780c5bf38 100644
--- a/be/src/olap/task/engine_clone_task.cpp
+++ b/be/src/olap/task/engine_clone_task.cpp
@@ -525,26 +525,8 @@ Status EngineCloneTask::_download_files(DataDir* data_dir, 
const std::string& re
     uint64_t total_file_size = 0;
     MonotonicStopWatch watch;
     watch.start();
-    auto curl = std::unique_ptr<CURL, 
decltype(&curl_easy_cleanup)>(curl_easy_init(),
-                                                                    
&curl_easy_cleanup);
-    if (!curl) {
-        return Status::InternalError("engine clone task init curl failed");
-    }
     for (auto& file_name : file_name_list) {
-        // The file name of the variant column with the inverted index 
contains %
-        // such as: 
020000000000003f624c4c322c568271060f9b5b274a4a95_0_10133@properties%2Emessage.idx
-        //  
{rowset_id}_{seg_num}_{index_id}_{variant_column_name}{%2E}{extracted_column_name}.idx
-        // We need to handle %, otherwise it will cause an HTTP 404 error.
-        // Because the percent ("%") character serves as the indicator for 
percent-encoded octets,
-        // it must be percent-encoded as "%25" for that octet to be used as 
data within a URI.
-        // https://datatracker.ietf.org/doc/html/rfc3986
-        auto output = std::unique_ptr<char, decltype(&curl_free)>(
-                curl_easy_escape(curl.get(), file_name.c_str(), 
file_name.length()), &curl_free);
-        if (!output) {
-            return Status::InternalError("escape file name failed, file 
name={}", file_name);
-        }
-        std::string encoded_filename(output.get());
-        auto remote_file_url = remote_url_prefix + encoded_filename;
+        auto remote_file_url = remote_url_prefix + file_name;
 
         // get file length
         uint64_t file_size = 0;
diff --git a/be/test/http/http_client_test.cpp 
b/be/test/http/http_client_test.cpp
index c157f1a13c0..00b3288d2e9 100644
--- a/be/test/http/http_client_test.cpp
+++ b/be/test/http/http_client_test.cpp
@@ -299,4 +299,46 @@ TEST_F(HttpClientTest, download_file_md5) {
     close(fd);
 }
 
+TEST_F(HttpClientTest, escape_url) {
+    HttpClient client;
+    client._curl = curl_easy_init();
+    auto check_result = [&client](const auto& input_url, const auto& 
output_url) -> bool {
+        std::string escaped_url;
+        if (!client._escape_url(input_url, &escaped_url).ok()) {
+            return false;
+        }
+        if (escaped_url != output_url) {
+            return false;
+        }
+        return true;
+    };
+    std::string input_A = hostname + 
"/download_file?token=oxof&file_name=02x_0.dat";
+    std::string output_A = hostname + 
"/download_file?token=oxof&file_name=02x_0.dat";
+    ASSERT_TRUE(check_result(input_A, output_A));
+
+    std::string input_B = hostname + "/download_file?";
+    std::string output_B = hostname + "/download_file?";
+    ASSERT_TRUE(check_result(input_B, output_B));
+
+    std::string input_C = hostname + "/download_file";
+    std::string output_C = hostname + "/download_file";
+    ASSERT_TRUE(check_result(input_C, output_C));
+
+    std::string input_D = hostname + "/download_file?&";
+    std::string output_D = hostname + "/download_file?&";
+    ASSERT_TRUE(check_result(input_D, output_D));
+
+    std::string input_E = hostname + "/download_file?key=0x2E";
+    std::string output_E = hostname + "/download_file?key=0x2E";
+    ASSERT_TRUE(check_result(input_E, output_E));
+
+    std::string input_F = hostname + "/download_file?key=0x2E&key=%";
+    std::string output_F = hostname + "/download_file?key=0x2E&key=%25";
+    ASSERT_TRUE(check_result(input_F, output_F));
+
+    std::string input_G = hostname + "/download_file?key=0x2E&key=%2E#section";
+    std::string output_G = hostname + 
"/download_file?key=0x2E&key=%252E#section";
+    ASSERT_TRUE(check_result(input_G, output_G));
+}
+
 } // namespace doris
diff --git a/regression-test/suites/load_p2/test_single_replica_load.groovy 
b/regression-test/suites/load_p2/test_single_replica_load.groovy
new file mode 100644
index 00000000000..c8ea2c578eb
--- /dev/null
+++ b/regression-test/suites/load_p2/test_single_replica_load.groovy
@@ -0,0 +1,67 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// The cases is copied from https://github.com/trinodb/trino/tree/master
+// /testing/trino-product-tests/src/main/resources/sql-tests/testcases
+// and modified by Doris.
+
+suite("test_single_replica_load", "p2") {
+    
+    def load_json_data = {table_name, file_name ->
+        // load the json data
+        streamLoad {
+            table "${table_name}"
+
+            // set http request header params
+            set 'read_json_by_line', 'true' 
+            set 'format', 'json' 
+            set 'max_filter_ratio', '0.1'
+            file file_name // import json file
+            time 10000 // limit inflight 10s
+
+            // if declared a check callback, the default check condition will 
ignore.
+            // So you must check all condition
+
+            check { result, exception, startTime, endTime ->
+                if (exception != null) {
+                        throw exception
+                }
+                logger.info("Stream load ${file_name} result: 
${result}".toString())
+                def json = parseJson(result)
+                assertEquals("success", json.Status.toLowerCase())
+                assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0)
+            }
+        }
+    }
+
+    def tableName = "test_single_replica_load"
+
+    sql "DROP TABLE IF EXISTS ${tableName}"
+    sql """
+        CREATE TABLE IF NOT EXISTS ${tableName} (
+            k bigint,
+            v variant,
+            INDEX idx(v) USING INVERTED PROPERTIES("parser"="standard")  
COMMENT ''
+        )
+        DUPLICATE KEY(`k`)
+        DISTRIBUTED BY HASH(k) BUCKETS 1
+        properties("replication_num" = "2", "disable_auto_compaction" = 
"true", "inverted_index_storage_format" = "V1");
+    """
+    load_json_data.call(tableName, """${getS3Url() + 
'/regression/gharchive.m/2015-01-01-0.json'}""")
+    load_json_data.call(tableName, """${getS3Url() + 
'/regression/gharchive.m/2015-01-01-0.json'}""")
+    load_json_data.call(tableName, """${getS3Url() + 
'/regression/gharchive.m/2015-01-01-0.json'}""")
+}


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to