This is an automated email from the ASF dual-hosted git repository.
dataroaring pushed a commit to branch branch-3.0
in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-3.0 by this push:
new ac748c2fbcb branch-3.0: [Fix](Recycler) Fix recycler azure path fault #54291 (#54561)
ac748c2fbcb is described below
commit ac748c2fbcb2425e6b7654a1549b05e520b54865
Author: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
AuthorDate: Wed Aug 13 09:49:44 2025 +0800
branch-3.0: [Fix](Recycler) Fix recycler azure path fault #54291 (#54561)
Cherry-picked from #54291
Co-authored-by: abmdocrt <[email protected]>
---
be/src/util/s3_util.cpp | 4 ++
cloud/src/recycler/s3_accessor.cpp | 6 +-
cloud/test/util_test.cpp | 121 ++++++++++++++++++++++++++++++++++++-
common/cpp/util.cpp | 55 +++++++++++++++++
common/cpp/util.h | 48 +++++++++++++++
5 files changed, 232 insertions(+), 2 deletions(-)
diff --git a/be/src/util/s3_util.cpp b/be/src/util/s3_util.cpp
index d80e668c05c..174b6f8a5e6 100644
--- a/be/src/util/s3_util.cpp
+++ b/be/src/util/s3_util.cpp
@@ -48,6 +48,7 @@
#include "cpp/aws_logger.h"
#include "cpp/obj_retry_strategy.h"
#include "cpp/sync_point.h"
+#include "cpp/util.h"
#ifdef USE_AZURE
#include "io/fs/azure_obj_storage_client.h"
#endif
@@ -242,6 +243,9 @@ std::shared_ptr<io::ObjStorageClient> S3ClientFactory::_create_azure_client(
options.Retry.MaxRetries = config::max_s3_client_retry;
options.PerRetryPolicies.emplace_back(std::make_unique<AzureRetryRecordPolicy>());
+ std::string normalized_uri = normalize_http_uri(uri);
+ VLOG_DEBUG << "uri:" << uri << ", normalized_uri:" << normalized_uri;
+
auto containerClient =
std::make_shared<Azure::Storage::Blobs::BlobContainerClient>(
uri, cred, std::move(options));
LOG_INFO("create one azure client with {}", s3_conf.to_string());
diff --git a/cloud/src/recycler/s3_accessor.cpp b/cloud/src/recycler/s3_accessor.cpp
index 464beb58e2e..3de8edfd048 100644
--- a/cloud/src/recycler/s3_accessor.cpp
+++ b/cloud/src/recycler/s3_accessor.cpp
@@ -28,6 +28,7 @@
#include <gen_cpp/cloud.pb.h>
#include <algorithm>
+
#ifdef USE_AZURE
#include <azure/core/diagnostics/logger.hpp>
#include <azure/storage/blobs/blob_container_client.hpp>
@@ -47,6 +48,7 @@
#include "cpp/obj_retry_strategy.h"
#include "cpp/s3_rate_limiter.h"
#include "cpp/sync_point.h"
+#include "cpp/util.h"
#ifdef USE_AZURE
#include "recycler/azure_obj_client.h"
#endif
@@ -335,6 +337,7 @@ int S3Accessor::init() {
uri_ = "https://" + uri_;
}
}
+ uri_ = normalize_http_uri(uri_);
// In Azure's HTTP requests, all policies in the vector are called in a chained manner following the HTTP pipeline approach.
// Within the RetryPolicy, the nextPolicy is called multiple times inside a loop.
// All policies in the PerRetryPolicies are downstream of the RetryPolicy.
@@ -343,7 +346,7 @@ int S3Accessor::init() {
auto container_client =
std::make_shared<Azure::Storage::Blobs::BlobContainerClient>(
uri_, cred, std::move(options));
// uri format for debug: ${scheme}://${ak}.blob.core.windows.net/${bucket}/${prefix}
- uri_ = uri_ + '/' + conf_.prefix;
+ uri_ = normalize_http_uri(uri_ + '/' + conf_.prefix);
obj_client_ =
std::make_shared<AzureObjClient>(std::move(container_client));
return 0;
#else
@@ -357,6 +360,7 @@ int S3Accessor::init() {
} else {
uri_ = conf_.endpoint + '/' + conf_.bucket + '/' + conf_.prefix;
}
+ uri_ = normalize_http_uri(uri_);
// S3Conf::S3
Aws::Client::ClientConfiguration aws_config;
diff --git a/cloud/test/util_test.cpp b/cloud/test/util_test.cpp
index e0cd54acc8b..d8adfe6269b 100644
--- a/cloud/test/util_test.cpp
+++ b/cloud/test/util_test.cpp
@@ -15,7 +15,7 @@
// specific language governing permissions and limitations
// under the License.
-#include "recycler/util.h"
+#include "cpp/util.h"
#include <chrono>
#include <stdexcept>
@@ -331,4 +331,123 @@ TEST(UtilTest, test_split) {
auto path =
doris::get_valid_ca_cert_path(doris::cloud::split(config::ca_cert_file_paths, ';'));
LOG(INFO) << "config:" << config::ca_cert_file_paths << " path:" << path;
ASSERT_FALSE(path.empty());
+}
+
+TEST(UtilTest, test_normalize_http_uri) {
+ // ===== Basic functionality with HTTPS protocol =====
+ EXPECT_EQ(doris::normalize_http_uri("https://example.com/path"), "https://example.com/path");
+ EXPECT_EQ(doris::normalize_http_uri("https://example.com//path"), "https://example.com/path");
+ EXPECT_EQ(doris::normalize_http_uri("https://example.com///path"), "https://example.com/path");
+
+ // ===== Basic functionality with HTTP protocol =====
+ EXPECT_EQ(doris::normalize_http_uri("http://example.com/path"), "http://example.com/path");
+ EXPECT_EQ(doris::normalize_http_uri("http://example.com//path"), "http://example.com/path");
+ EXPECT_EQ(doris::normalize_http_uri("http://example.com///path"), "http://example.com/path");
+
+ // ===== Multiple consecutive slashes in different positions =====
+ EXPECT_EQ(doris::normalize_http_uri("https://host.com//bucket//prefix"),
+ "https://host.com/bucket/prefix");
+ EXPECT_EQ(doris::normalize_http_uri("https://host.com///bucket///prefix///"),
+ "https://host.com/bucket/prefix/");
+ EXPECT_EQ(doris::normalize_http_uri("https://host.com////bucket////prefix////file"),
+ "https://host.com/bucket/prefix/file");
+
+ // ===== Azure blob storage specific URLs =====
+ EXPECT_EQ(doris::normalize_http_uri("https://account.blob.core.windows.net//container"),
+ "https://account.blob.core.windows.net/container");
+ EXPECT_EQ(
+ doris::normalize_http_uri("https://account.blob.core.windows.net///container//prefix"),
+ "https://account.blob.core.windows.net/container/prefix");
+ EXPECT_EQ(doris::normalize_http_uri(
+ "https://account.blob.core.windows.net////container///prefix///file.txt"),
+ "https://account.blob.core.windows.net/container/prefix/file.txt");
+
+ // ===== URLs without protocol =====
+ EXPECT_EQ(doris::normalize_http_uri("example.com//path"), "example.com/path");
+ EXPECT_EQ(doris::normalize_http_uri("host.com///bucket//prefix"), "host.com/bucket/prefix");
+ EXPECT_EQ(doris::normalize_http_uri("//path//to//file"), "/path/to/file");
+
+ // ===== Edge cases =====
+ // Empty string
+ EXPECT_EQ(doris::normalize_http_uri(""), "");
+
+ // Only protocol
+ EXPECT_EQ(doris::normalize_http_uri("https://"), "https://");
+ EXPECT_EQ(doris::normalize_http_uri("http://"), "http://");
+
+ // Only slashes
+ EXPECT_EQ(doris::normalize_http_uri("//"), "/");
+ EXPECT_EQ(doris::normalize_http_uri("///"), "/");
+ EXPECT_EQ(doris::normalize_http_uri("////"), "/");
+
+ // Single character paths
+ EXPECT_EQ(doris::normalize_http_uri("https://a"), "https://a");
+ EXPECT_EQ(doris::normalize_http_uri("https://a/"), "https://a/");
+ EXPECT_EQ(doris::normalize_http_uri("https://a//"), "https://a/");
+
+ // ===== Protocol preservation =====
+ // Ensure protocol :// is never modified
+ EXPECT_EQ(doris::normalize_http_uri("https://example.com"), "https://example.com");
+ EXPECT_EQ(doris::normalize_http_uri("http://example.com"), "http://example.com");
+
+ // Even with extra slashes after protocol
+ EXPECT_EQ(doris::normalize_http_uri("https:///example.com"), "https://example.com");
+ EXPECT_EQ(doris::normalize_http_uri("http:///example.com"), "http://example.com");
+
+ // Mixed case protocol (though unusual)
+ EXPECT_EQ(doris::normalize_http_uri("HTTP://example.com//path"), "HTTP://example.com/path");
+ EXPECT_EQ(doris::normalize_http_uri("HTTPS://example.com//path"), "HTTPS://example.com/path");
+
+ // ===== Trailing slashes =====
+ EXPECT_EQ(doris::normalize_http_uri("https://example.com/path/"), "https://example.com/path/");
+ EXPECT_EQ(doris::normalize_http_uri("https://example.com/path//"), "https://example.com/path/");
+ EXPECT_EQ(doris::normalize_http_uri("https://example.com/path///"),
+ "https://example.com/path/");
+ EXPECT_EQ(doris::normalize_http_uri("https://example.com/path////"),
+ "https://example.com/path/");
+
+ // ===== Complex real-world scenarios =====
+ // Simulating common configuration mistakes
+ EXPECT_EQ(doris::normalize_http_uri("https://endpoint.com///bucket//prefix//file.txt"),
+ "https://endpoint.com/bucket/prefix/file.txt");
+
+ // User configured endpoint with trailing slash + bucket with leading slash
+ EXPECT_EQ(doris::normalize_http_uri("https://endpoint.com///bucket"),
+ "https://endpoint.com/bucket");
+
+ // Multiple slashes everywhere
+ EXPECT_EQ(
+ doris::normalize_http_uri("https://host.com////bucket////prefix////subfolder////file"),
+ "https://host.com/bucket/prefix/subfolder/file");
+
+ // ===== Special characters in path =====
+ EXPECT_EQ(
+ doris::normalize_http_uri("https://example.com//path-with-dash//file_with_underscore"),
+ "https://example.com/path-with-dash/file_with_underscore");
+ EXPECT_EQ(doris::normalize_http_uri("https://example.com//path.with.dots//file@special"),
+ "https://example.com/path.with.dots/file@special");
+ EXPECT_EQ(doris::normalize_http_uri("https://example.com//bucket123//prefix456//file789"),
+ "https://example.com/bucket123/prefix456/file789");
+
+ // ===== URLs with query parameters and fragments =====
+ EXPECT_EQ(doris::normalize_http_uri("https://example.com//path?query=value"),
+ "https://example.com/path?query=value");
+ EXPECT_EQ(doris::normalize_http_uri("https://example.com//path#fragment"),
+ "https://example.com/path#fragment");
+ EXPECT_EQ(doris::normalize_http_uri("https://example.com//path?query=value#fragment"),
+ "https://example.com/path?query=value#fragment");
+}
+
+TEST(UtilTest, test_long_normalize_http_uri) {
+ std::string longPath = "https://example.com";
+ for (int i = 0; i < 100; i++) {
+ longPath += "//segment" + std::to_string(i);
+ }
+
+ std::string expected = "https://example.com";
+ for (int i = 0; i < 100; i++) {
+ expected += "/segment" + std::to_string(i);
+ }
+
+ EXPECT_EQ(doris::normalize_http_uri(longPath), expected);
}
\ No newline at end of file
diff --git a/common/cpp/util.cpp b/common/cpp/util.cpp
new file mode 100644
index 00000000000..5dae6ae70ce
--- /dev/null
+++ b/common/cpp/util.cpp
@@ -0,0 +1,55 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+// Most code of this file is copied from rocksdb SyncPoint.
+// https://github.com/facebook/rocksdb
+
+#include <string>
+
+namespace doris {
+
+std::string normalize_http_uri(const std::string& uri) {
+ if (uri.empty()) {
+ return uri;
+ }
+
+ // Find the end of protocol part (http:// or https://)
+ // Example: in "https://example.com", protocol_end will be 8 (position after "://")
+ size_t protocol_end = uri.find("://");
+ if (protocol_end == std::string::npos) {
+ protocol_end = 0; // No protocol found, start from beginning
+ } else {
+ protocol_end += 3; // Skip past "://"
+ }
+
+ // Keep protocol part (e.g., "https://")
+ std::string result = uri.substr(0, protocol_end);
+
+ // Process the rest of URI to remove duplicate slashes
+ // Example: "//path//to///file" becomes "/path/to/file"
+ for (size_t i = protocol_end; i < uri.length(); i++) {
+ char current = uri[i];
+
+ // Add current character if it's not a slash, or if it's the first slash in sequence
+ // This prevents consecutive slashes like "//" or "///" from being added
+ if (current != '/' || result.empty() || result.back() != '/') {
+ result += current;
+ }
+ }
+ return result;
+}
+} // namespace doris
diff --git a/common/cpp/util.h b/common/cpp/util.h
new file mode 100644
index 00000000000..2dc301bb0c4
--- /dev/null
+++ b/common/cpp/util.h
@@ -0,0 +1,48 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <string>
+
+namespace doris {
+ /**
+ * Normalizes HTTP URI by removing duplicate slashes while preserving the protocol part.
+ *
+ * This function removes consecutive forward slashes from URIs while keeping the protocol
+ * section (http:// or https://) intact. It processes everything after the protocol to
+ * ensure clean URI formatting.
+ *
+ * @param uri The input URI string to be normalized
+ * @return A normalized URI string with duplicate slashes removed, or the original
+ * string if it's empty
+ *
+ * @example
+ * normalize_http_uri("https://example.com//path//to///file")
+ * returns "https://example.com/path/to/file"
+ *
+ * normalize_http_uri("http://host.com///bucket//prefix/")
+ * returns "http://host.com/bucket/prefix/"
+ *
+ * normalize_http_uri("endpoint.com//bucket///prefix")
+ * returns "endpoint.com/bucket/prefix"
+ *
+ * normalize_http_uri("https://account.blob.core.windows.net////container")
+ * returns "https://account.blob.core.windows.net/container"
+ */
+ std::string normalize_http_uri(const std::string& uri);
+} // namespace doris
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]