This is an automated email from the ASF dual-hosted git repository.

dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-3.0 by this push:
     new ac748c2fbcb branch-3.0: [Fix](Recycler) Fix recycler azure path fault 
#54291 (#54561)
ac748c2fbcb is described below

commit ac748c2fbcb2425e6b7654a1549b05e520b54865
Author: github-actions[bot] 
<41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Wed Aug 13 09:49:44 2025 +0800

    branch-3.0: [Fix](Recycler) Fix recycler azure path fault #54291 (#54561)
    
    Cherry-picked from #54291
    
    Co-authored-by: abmdocrt <[email protected]>
---
 be/src/util/s3_util.cpp            |   4 ++
 cloud/src/recycler/s3_accessor.cpp |   6 +-
 cloud/test/util_test.cpp           | 121 ++++++++++++++++++++++++++++++++++++-
 common/cpp/util.cpp                |  55 +++++++++++++++++
 common/cpp/util.h                  |  48 +++++++++++++++
 5 files changed, 232 insertions(+), 2 deletions(-)

diff --git a/be/src/util/s3_util.cpp b/be/src/util/s3_util.cpp
index d80e668c05c..174b6f8a5e6 100644
--- a/be/src/util/s3_util.cpp
+++ b/be/src/util/s3_util.cpp
@@ -48,6 +48,7 @@
 #include "cpp/aws_logger.h"
 #include "cpp/obj_retry_strategy.h"
 #include "cpp/sync_point.h"
+#include "cpp/util.h"
 #ifdef USE_AZURE
 #include "io/fs/azure_obj_storage_client.h"
 #endif
@@ -242,6 +243,9 @@ std::shared_ptr<io::ObjStorageClient> 
S3ClientFactory::_create_azure_client(
     options.Retry.MaxRetries = config::max_s3_client_retry;
     
options.PerRetryPolicies.emplace_back(std::make_unique<AzureRetryRecordPolicy>());
 
+    std::string normalized_uri = normalize_http_uri(uri);
+    VLOG_DEBUG << "uri:" << uri << ", normalized_uri:" << normalized_uri;
+
     auto containerClient = 
std::make_shared<Azure::Storage::Blobs::BlobContainerClient>(
             uri, cred, std::move(options));
     LOG_INFO("create one azure client with {}", s3_conf.to_string());
diff --git a/cloud/src/recycler/s3_accessor.cpp 
b/cloud/src/recycler/s3_accessor.cpp
index 464beb58e2e..3de8edfd048 100644
--- a/cloud/src/recycler/s3_accessor.cpp
+++ b/cloud/src/recycler/s3_accessor.cpp
@@ -28,6 +28,7 @@
 #include <gen_cpp/cloud.pb.h>
 
 #include <algorithm>
+
 #ifdef USE_AZURE
 #include <azure/core/diagnostics/logger.hpp>
 #include <azure/storage/blobs/blob_container_client.hpp>
@@ -47,6 +48,7 @@
 #include "cpp/obj_retry_strategy.h"
 #include "cpp/s3_rate_limiter.h"
 #include "cpp/sync_point.h"
+#include "cpp/util.h"
 #ifdef USE_AZURE
 #include "recycler/azure_obj_client.h"
 #endif
@@ -335,6 +337,7 @@ int S3Accessor::init() {
                 uri_ = "https://"; + uri_;
             }
         }
+        uri_ = normalize_http_uri(uri_);
         // In Azure's HTTP requests, all policies in the vector are called in 
a chained manner following the HTTP pipeline approach.
         // Within the RetryPolicy, the nextPolicy is called multiple times 
inside a loop.
         // All policies in the PerRetryPolicies are downstream of the 
RetryPolicy.
@@ -343,7 +346,7 @@ int S3Accessor::init() {
         auto container_client = 
std::make_shared<Azure::Storage::Blobs::BlobContainerClient>(
                 uri_, cred, std::move(options));
         // uri format for debug: 
${scheme}://${ak}.blob.core.windows.net/${bucket}/${prefix}
-        uri_ = uri_ + '/' + conf_.prefix;
+        uri_ = normalize_http_uri(uri_ + '/' + conf_.prefix);
         obj_client_ = 
std::make_shared<AzureObjClient>(std::move(container_client));
         return 0;
 #else
@@ -357,6 +360,7 @@ int S3Accessor::init() {
         } else {
             uri_ = conf_.endpoint + '/' + conf_.bucket + '/' + conf_.prefix;
         }
+        uri_ = normalize_http_uri(uri_);
 
         // S3Conf::S3
         Aws::Client::ClientConfiguration aws_config;
diff --git a/cloud/test/util_test.cpp b/cloud/test/util_test.cpp
index e0cd54acc8b..d8adfe6269b 100644
--- a/cloud/test/util_test.cpp
+++ b/cloud/test/util_test.cpp
@@ -15,7 +15,7 @@
 // specific language governing permissions and limitations
 // under the License.
 
-#include "recycler/util.h"
+#include "cpp/util.h"
 
 #include <chrono>
 #include <stdexcept>
@@ -331,4 +331,123 @@ TEST(UtilTest, test_split) {
     auto path = 
doris::get_valid_ca_cert_path(doris::cloud::split(config::ca_cert_file_paths, 
';'));
     LOG(INFO) << "config:" << config::ca_cert_file_paths << " path:" << path;
     ASSERT_FALSE(path.empty());
+}
+
+TEST(UtilTest, test_normalize_http_uri) {
+    // ===== Basic functionality with HTTPS protocol =====
+    EXPECT_EQ(doris::normalize_http_uri("https://example.com/path";), 
"https://example.com/path";);
+    EXPECT_EQ(doris::normalize_http_uri("https://example.com//path";), 
"https://example.com/path";);
+    EXPECT_EQ(doris::normalize_http_uri("https://example.com///path";), 
"https://example.com/path";);
+
+    // ===== Basic functionality with HTTP protocol =====
+    EXPECT_EQ(doris::normalize_http_uri("http://example.com/path";), 
"http://example.com/path";);
+    EXPECT_EQ(doris::normalize_http_uri("http://example.com//path";), 
"http://example.com/path";);
+    EXPECT_EQ(doris::normalize_http_uri("http://example.com///path";), 
"http://example.com/path";);
+
+    // ===== Multiple consecutive slashes in different positions =====
+    EXPECT_EQ(doris::normalize_http_uri("https://host.com//bucket//prefix";),
+              "https://host.com/bucket/prefix";);
+    
EXPECT_EQ(doris::normalize_http_uri("https://host.com///bucket///prefix///";),
+              "https://host.com/bucket/prefix/";);
+    
EXPECT_EQ(doris::normalize_http_uri("https://host.com////bucket////prefix////file";),
+              "https://host.com/bucket/prefix/file";);
+
+    // ===== Azure blob storage specific URLs =====
+    
EXPECT_EQ(doris::normalize_http_uri("https://account.blob.core.windows.net//container";),
+              "https://account.blob.core.windows.net/container";);
+    EXPECT_EQ(
+            
doris::normalize_http_uri("https://account.blob.core.windows.net///container//prefix";),
+            "https://account.blob.core.windows.net/container/prefix";);
+    EXPECT_EQ(doris::normalize_http_uri(
+                      
"https://account.blob.core.windows.net////container///prefix///file.txt";),
+              
"https://account.blob.core.windows.net/container/prefix/file.txt";);
+
+    // ===== URLs without protocol =====
+    EXPECT_EQ(doris::normalize_http_uri("example.com//path"), 
"example.com/path");
+    EXPECT_EQ(doris::normalize_http_uri("host.com///bucket//prefix"), 
"host.com/bucket/prefix");
+    EXPECT_EQ(doris::normalize_http_uri("//path//to//file"), "/path/to/file");
+
+    // ===== Edge cases =====
+    // Empty string
+    EXPECT_EQ(doris::normalize_http_uri(""), "");
+
+    // Only protocol
+    EXPECT_EQ(doris::normalize_http_uri("https://";), "https://";);
+    EXPECT_EQ(doris::normalize_http_uri("http://";), "http://";);
+
+    // Only slashes
+    EXPECT_EQ(doris::normalize_http_uri("//"), "/");
+    EXPECT_EQ(doris::normalize_http_uri("///"), "/");
+    EXPECT_EQ(doris::normalize_http_uri("////"), "/");
+
+    // Single character paths
+    EXPECT_EQ(doris::normalize_http_uri("https://a";), "https://a";);
+    EXPECT_EQ(doris::normalize_http_uri("https://a/";), "https://a/";);
+    EXPECT_EQ(doris::normalize_http_uri("https://a//";), "https://a/";);
+
+    // ===== Protocol preservation =====
+    // Ensure protocol :// is never modified
+    EXPECT_EQ(doris::normalize_http_uri("https://example.com";), 
"https://example.com";);
+    EXPECT_EQ(doris::normalize_http_uri("http://example.com";), 
"http://example.com";);
+
+    // Even with extra slashes after protocol
+    EXPECT_EQ(doris::normalize_http_uri("https:///example.com";), 
"https://example.com";);
+    EXPECT_EQ(doris::normalize_http_uri("http:///example.com";), 
"http://example.com";);
+
+    // Mixed case protocol (though unusual)
+    EXPECT_EQ(doris::normalize_http_uri("HTTP://example.com//path"), 
"HTTP://example.com/path");
+    EXPECT_EQ(doris::normalize_http_uri("HTTPS://example.com//path"), 
"HTTPS://example.com/path");
+
+    // ===== Trailing slashes =====
+    EXPECT_EQ(doris::normalize_http_uri("https://example.com/path/";), 
"https://example.com/path/";);
+    EXPECT_EQ(doris::normalize_http_uri("https://example.com/path//";), 
"https://example.com/path/";);
+    EXPECT_EQ(doris::normalize_http_uri("https://example.com/path///";),
+              "https://example.com/path/";);
+    EXPECT_EQ(doris::normalize_http_uri("https://example.com/path////";),
+              "https://example.com/path/";);
+
+    // ===== Complex real-world scenarios =====
+    // Simulating common configuration mistakes
+    
EXPECT_EQ(doris::normalize_http_uri("https://endpoint.com///bucket//prefix//file.txt";),
+              "https://endpoint.com/bucket/prefix/file.txt";);
+
+    // User configured endpoint with trailing slash + bucket with leading slash
+    EXPECT_EQ(doris::normalize_http_uri("https://endpoint.com///bucket";),
+              "https://endpoint.com/bucket";);
+
+    // Multiple slashes everywhere
+    EXPECT_EQ(
+            
doris::normalize_http_uri("https://host.com////bucket////prefix////subfolder////file";),
+            "https://host.com/bucket/prefix/subfolder/file";);
+
+    // ===== Special characters in path =====
+    EXPECT_EQ(
+            
doris::normalize_http_uri("https://example.com//path-with-dash//file_with_underscore";),
+            "https://example.com/path-with-dash/file_with_underscore";);
+    
EXPECT_EQ(doris::normalize_http_uri("https://example.com//path.with.dots//file@special";),
+              "https://example.com/path.with.dots/file@special";);
+    
EXPECT_EQ(doris::normalize_http_uri("https://example.com//bucket123//prefix456//file789";),
+              "https://example.com/bucket123/prefix456/file789";);
+
+    // ===== URLs with query parameters and fragments =====
+    
EXPECT_EQ(doris::normalize_http_uri("https://example.com//path?query=value";),
+              "https://example.com/path?query=value";);
+    EXPECT_EQ(doris::normalize_http_uri("https://example.com//path#fragment";),
+              "https://example.com/path#fragment";);
+    
EXPECT_EQ(doris::normalize_http_uri("https://example.com//path?query=value#fragment";),
+              "https://example.com/path?query=value#fragment";);
+}
+
+TEST(UtilTest, test_long_normalize_http_uri) {
+    std::string longPath = "https://example.com";;
+    for (int i = 0; i < 100; i++) {
+        longPath += "//segment" + std::to_string(i);
+    }
+
+    std::string expected = "https://example.com";;
+    for (int i = 0; i < 100; i++) {
+        expected += "/segment" + std::to_string(i);
+    }
+
+    EXPECT_EQ(doris::normalize_http_uri(longPath), expected);
 }
\ No newline at end of file
diff --git a/common/cpp/util.cpp b/common/cpp/util.cpp
new file mode 100644
index 00000000000..5dae6ae70ce
--- /dev/null
+++ b/common/cpp/util.cpp
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// NOTE(review): the attribution "Most code of this file is copied from rocksdb SyncPoint
+// (https://github.com/facebook/rocksdb)" looks stale here: this file only defines normalize_http_uri.
+
+#include <string>
+
namespace doris {

namespace {

// Returns the length of a leading "<scheme>://" prefix of `uri`, or 0 when `uri` does not
// start with one.
//
// "://" only counts as the scheme separator when no '/', '?' or '#' appears before it: a
// scheme cannot contain those characters (RFC 3986, section 3.1), so a later "://" belongs to
// the path, query or fragment (e.g. an object key such as "dir//weird://name") and must not
// shield the text in front of it from normalization.
size_t scheme_prefix_length(const std::string& uri) {
    static constexpr char kSeparator[] = "://";

    const size_t separator_pos = uri.find(kSeparator);
    if (separator_pos == std::string::npos) {
        return 0; // No protocol at all, normalize from the beginning
    }
    // The first '/' of the separator sits at separator_pos + 1; finding a delimiter any
    // earlier means the separator is not at the head of the URI.
    if (uri.find_first_of("/?#") != separator_pos + 1) {
        return 0;
    }
    // Example: for "https://example.com" this is 8, the position right after "://"
    return separator_pos + (sizeof(kSeparator) - 1);
}

} // namespace

// Normalizes an HTTP URI by collapsing every run of consecutive '/' into a single '/'.
//
// A leading protocol part such as "http://" or "https://" is copied unchanged; everything
// after it (or the whole string when there is no protocol) is rewritten.
// Example: "https://host.com///bucket//prefix/" becomes "https://host.com/bucket/prefix/".
//
// @param uri The URI to normalize; may be empty.
// @return The normalized URI; an empty input yields an empty string.
std::string normalize_http_uri(const std::string& uri) {
    const size_t prefix_len = scheme_prefix_length(uri);

    std::string result;
    result.reserve(uri.size()); // The output is never longer than the input

    // Keep protocol part (e.g., "https://") exactly as given
    result.append(uri, 0, prefix_len);

    // Process the rest of URI to remove duplicate slashes
    // Example: "//path//to///file" becomes "/path/to/file"
    for (size_t i = prefix_len; i < uri.size(); ++i) {
        const char current = uri[i];

        // Drop a slash that directly follows another slash. Because the protocol part ends
        // with '/', extra slashes right after it ("https:///host") are dropped as well.
        if (current == '/' && !result.empty() && result.back() == '/') {
            continue;
        }
        result.push_back(current);
    }
    return result;
}
} // namespace doris
diff --git a/common/cpp/util.h b/common/cpp/util.h
new file mode 100644
index 00000000000..2dc301bb0c4
--- /dev/null
+++ b/common/cpp/util.h
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+
namespace doris {

/// Collapses every run of consecutive '/' characters in an HTTP URI into a single '/'.
///
/// The protocol part (e.g. "http://" or "https://") is kept as is; only the text that
/// follows it is rewritten. Extra slashes directly after the protocol are dropped too, and
/// an input without a protocol is processed from its first character.
///
/// @param uri The URI to normalize; may be empty.
/// @return A normalized copy of `uri`; an empty input yields an empty string.
///
/// Examples:
///   normalize_http_uri("https://example.com//path//to///file")
///       returns "https://example.com/path/to/file"
///   normalize_http_uri("http://host.com///bucket//prefix/")
///       returns "http://host.com/bucket/prefix/"
///   normalize_http_uri("endpoint.com//bucket///prefix")
///       returns "endpoint.com/bucket/prefix"
///   normalize_http_uri("https://account.blob.core.windows.net////container")
///       returns "https://account.blob.core.windows.net/container"
std::string normalize_http_uri(const std::string& uri);

} // namespace doris


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to