This is an automated email from the ASF dual-hosted git repository. airborne pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new fb17f204d7a [fix](http) fix http url with incorrect character notation (#38420) (#39535) fb17f204d7a is described below commit fb17f204d7ae5e41d410e252e3e401bc4619e0b9 Author: Sun Chenyang <csun5...@gmail.com> AuthorDate: Mon Aug 19 15:03:19 2024 +0800 [fix](http) fix http url with incorrect character notation (#38420) (#39535) ## Proposed changes pick from master #38420 --- be/src/http/http_client.cpp | 60 ++++++++++++++++++- be/src/http/http_client.h | 9 +++ be/src/olap/single_replica_compaction.cpp | 15 +---- be/src/olap/task/engine_clone_task.cpp | 20 +------ be/test/http/http_client_test.cpp | 42 ++++++++++++++ .../suites/load_p2/test_single_replica_load.groovy | 67 ++++++++++++++++++++++ 6 files changed, 179 insertions(+), 34 deletions(-) diff --git a/be/src/http/http_client.cpp b/be/src/http/http_client.cpp index 218802878bd..e94614788f5 100644 --- a/be/src/http/http_client.cpp +++ b/be/src/http/http_client.cpp @@ -131,8 +131,11 @@ Status HttpClient::init(const std::string& url, bool set_fail_on_error) { LOG(WARNING) << "fail to set CURLOPT_WRITEDATA, msg=" << _to_errmsg(code); return Status::InternalError("fail to set CURLOPT_WRITEDATA"); } + + std::string escaped_url; + RETURN_IF_ERROR(_escape_url(url, &escaped_url)); // set url - code = curl_easy_setopt(_curl, CURLOPT_URL, url.c_str()); + code = curl_easy_setopt(_curl, CURLOPT_URL, escaped_url.c_str()); if (code != CURLE_OK) { LOG(WARNING) << "failed to set CURLOPT_URL, errmsg=" << _to_errmsg(code); return Status::InternalError("fail to set CURLOPT_URL"); @@ -290,4 +293,59 @@ Status HttpClient::execute_with_retry(int retry_times, int sleep_time, return status; } +// http://example.com/page?param1=value1¶m2=value+with+spaces#section +Status HttpClient::_escape_url(const std::string& url, std::string* escaped_url) { + size_t query_pos = url.find('?'); + if (query_pos == std::string::npos) { + *escaped_url = url; + return Status::OK(); + } + size_t fragment_pos = url.find('#'); + std::string query; + std::string fragment; + + if (fragment_pos == std::string::npos) { + query = url.substr(query_pos + 1, url.length() - query_pos - 1); + } else { + query = url.substr(query_pos + 1, fragment_pos - query_pos - 1); + fragment = url.substr(fragment_pos, url.length() - fragment_pos); + } + + std::string encoded_query; + size_t ampersand_pos = query.find('&'); + size_t equal_pos; + + if (ampersand_pos == std::string::npos) { + ampersand_pos = query.length(); + } + + while (true) { + equal_pos = query.find('='); + if (equal_pos != std::string::npos) { + std::string key = query.substr(0, equal_pos); + std::string value = query.substr(equal_pos + 1, ampersand_pos - equal_pos - 1); + + auto encoded_value = std::unique_ptr<char, decltype(&curl_free)>( + curl_easy_escape(_curl, value.c_str(), value.length()), &curl_free); + if (encoded_value) { + encoded_query += key + "=" + std::string(encoded_value.get()); + } else { + return Status::InternalError("escape url failed, url={}", url); + } + } else { + encoded_query += query.substr(0, ampersand_pos); + } + + if (ampersand_pos == query.length() || ampersand_pos == std::string::npos) { + break; + } + + encoded_query += "&"; + query = query.substr(ampersand_pos + 1); + ampersand_pos = query.find('&'); + } + *escaped_url = url.substr(0, query_pos + 1) + encoded_query + fragment; + return Status::OK(); +} + } // namespace doris diff --git a/be/src/http/http_client.h b/be/src/http/http_client.h index 9659de13cfc..f6a1a17ec29 100644 --- a/be/src/http/http_client.h +++ b/be/src/http/http_client.h @@ -146,6 +146,15 @@ public: size_t on_response_data(const void* data, size_t length); + // The file name of the variant column with the inverted index contains % + // such as: 020000000000003f624c4c322c568271060f9b5b274a4a95_0_10133@properties%2Emessage.idx + // {rowset_id}_{seg_num}_{index_id}_{variant_column_name}{%2E}{extracted_column_name}.idx + // We need to handle %, otherwise it will cause an HTTP 404 error. + // Because the percent ("%") character serves as the indicator for percent-encoded octets, + // it must be percent-encoded as "%25" for that octet to be used as data within a URI. + // https://datatracker.ietf.org/doc/html/rfc3986 + Status _escape_url(const std::string& url, std::string* escaped_url); + private: const char* _to_errmsg(CURLcode code); diff --git a/be/src/olap/single_replica_compaction.cpp b/be/src/olap/single_replica_compaction.cpp index 7381f5d3c69..393bfb99f7b 100644 --- a/be/src/olap/single_replica_compaction.cpp +++ b/be/src/olap/single_replica_compaction.cpp @@ -411,20 +411,7 @@ Status SingleReplicaCompaction::_download_files(DataDir* data_dir, return Status::InternalError("single compaction init curl failed"); } for (auto& file_name : file_name_list) { - // The file name of the variant column with the inverted index contains % - // such as: 020000000000003f624c4c322c568271060f9b5b274a4a95_0_10133@properties%2Emessage.idx - // {rowset_id}_{seg_num}_{index_id}_{variant_column_name}{%2E}{extracted_column_name}.idx - // We need to handle %, otherwise it will cause an HTTP 404 error. - // Because the percent ("%") character serves as the indicator for percent-encoded octets, - // it must be percent-encoded as "%25" for that octet to be used as data within a URI. - // https://datatracker.ietf.org/doc/html/rfc3986 - auto output = std::unique_ptr<char, decltype(&curl_free)>( - curl_easy_escape(curl.get(), file_name.c_str(), file_name.length()), &curl_free); - if (!output) { - return Status::InternalError("escape file name failed, file name={}", file_name); - } - std::string encoded_filename(output.get()); - auto remote_file_url = remote_url_prefix + encoded_filename; + auto remote_file_url = remote_url_prefix + file_name; // get file length uint64_t file_size = 0; diff --git a/be/src/olap/task/engine_clone_task.cpp b/be/src/olap/task/engine_clone_task.cpp index 300b65527c1..3a780c5bf38 100644 --- a/be/src/olap/task/engine_clone_task.cpp +++ b/be/src/olap/task/engine_clone_task.cpp @@ -525,26 +525,8 @@ Status EngineCloneTask::_download_files(DataDir* data_dir, const std::string& re uint64_t total_file_size = 0; MonotonicStopWatch watch; watch.start(); - auto curl = std::unique_ptr<CURL, decltype(&curl_easy_cleanup)>(curl_easy_init(), - &curl_easy_cleanup); - if (!curl) { - return Status::InternalError("engine clone task init curl failed"); - } for (auto& file_name : file_name_list) { - // The file name of the variant column with the inverted index contains % - // such as: 020000000000003f624c4c322c568271060f9b5b274a4a95_0_10133@properties%2Emessage.idx - // {rowset_id}_{seg_num}_{index_id}_{variant_column_name}{%2E}{extracted_column_name}.idx - // We need to handle %, otherwise it will cause an HTTP 404 error. - // Because the percent ("%") character serves as the indicator for percent-encoded octets, - // it must be percent-encoded as "%25" for that octet to be used as data within a URI. - // https://datatracker.ietf.org/doc/html/rfc3986 - auto output = std::unique_ptr<char, decltype(&curl_free)>( - curl_easy_escape(curl.get(), file_name.c_str(), file_name.length()), &curl_free); - if (!output) { - return Status::InternalError("escape file name failed, file name={}", file_name); - } - std::string encoded_filename(output.get()); - auto remote_file_url = remote_url_prefix + encoded_filename; + auto remote_file_url = remote_url_prefix + file_name; // get file length uint64_t file_size = 0; diff --git a/be/test/http/http_client_test.cpp b/be/test/http/http_client_test.cpp index c157f1a13c0..00b3288d2e9 100644 --- a/be/test/http/http_client_test.cpp +++ b/be/test/http/http_client_test.cpp @@ -299,4 +299,46 @@ TEST_F(HttpClientTest, download_file_md5) { close(fd); } +TEST_F(HttpClientTest, escape_url) { + HttpClient client; + client._curl = curl_easy_init(); + auto check_result = [&client](const auto& input_url, const auto& output_url) -> bool { + std::string escaped_url; + if (!client._escape_url(input_url, &escaped_url).ok()) { + return false; + } + if (escaped_url != output_url) { + return false; + } + return true; + }; + std::string input_A = hostname + "/download_file?token=oxof&file_name=02x_0.dat"; + std::string output_A = hostname + "/download_file?token=oxof&file_name=02x_0.dat"; + ASSERT_TRUE(check_result(input_A, output_A)); + + std::string input_B = hostname + "/download_file?"; + std::string output_B = hostname + "/download_file?"; + ASSERT_TRUE(check_result(input_B, output_B)); + + std::string input_C = hostname + "/download_file"; + std::string output_C = hostname + "/download_file"; + ASSERT_TRUE(check_result(input_C, output_C)); + + std::string input_D = hostname + "/download_file?&"; + std::string output_D = hostname + "/download_file?&"; + ASSERT_TRUE(check_result(input_D, output_D)); + + std::string input_E = hostname + "/download_file?key=0x2E"; + std::string output_E = hostname + "/download_file?key=0x2E"; + ASSERT_TRUE(check_result(input_E, output_E)); + + std::string input_F = hostname + "/download_file?key=0x2E&key=%"; + std::string output_F = hostname + "/download_file?key=0x2E&key=%25"; + ASSERT_TRUE(check_result(input_F, output_F)); + + std::string input_G = hostname + "/download_file?key=0x2E&key=%2E#section"; + std::string output_G = hostname + "/download_file?key=0x2E&key=%252E#section"; + ASSERT_TRUE(check_result(input_G, output_G)); +} + } // namespace doris diff --git a/regression-test/suites/load_p2/test_single_replica_load.groovy b/regression-test/suites/load_p2/test_single_replica_load.groovy new file mode 100644 index 00000000000..c8ea2c578eb --- /dev/null +++ b/regression-test/suites/load_p2/test_single_replica_load.groovy @@ -0,0 +1,67 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// The cases is copied from https://github.com/trinodb/trino/tree/master +// /testing/trino-product-tests/src/main/resources/sql-tests/testcases +// and modified by Doris. + +suite("test_single_replica_load", "p2") { + + def load_json_data = {table_name, file_name -> + // load the json data + streamLoad { + table "${table_name}" + + // set http request header params + set 'read_json_by_line', 'true' + set 'format', 'json' + set 'max_filter_ratio', '0.1' + file file_name // import json file + time 10000 // limit inflight 10s + + // if declared a check callback, the default check condition will ignore. + // So you must check all condition + + check { result, exception, startTime, endTime -> + if (exception != null) { + throw exception + } + logger.info("Stream load ${file_name} result: ${result}".toString()) + def json = parseJson(result) + assertEquals("success", json.Status.toLowerCase()) + assertTrue(json.NumberLoadedRows > 0 && json.LoadBytes > 0) + } + } + } + + def tableName = "test_single_replica_load" + + sql "DROP TABLE IF EXISTS ${tableName}" + sql """ + CREATE TABLE IF NOT EXISTS ${tableName} ( + k bigint, + v variant, + INDEX idx(v) USING INVERTED PROPERTIES("parser"="standard") COMMENT '' + ) + DUPLICATE KEY(`k`) + DISTRIBUTED BY HASH(k) BUCKETS 1 + properties("replication_num" = "2", "disable_auto_compaction" = "true", "inverted_index_storage_format" = "V1"); + """ + load_json_data.call(tableName, """${getS3Url() + '/regression/gharchive.m/2015-01-01-0.json'}""") + load_json_data.call(tableName, """${getS3Url() + '/regression/gharchive.m/2015-01-01-0.json'}""") + load_json_data.call(tableName, """${getS3Url() + '/regression/gharchive.m/2015-01-01-0.json'}""") +} --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org