This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new 6080ca6d04 GH-46214: [C++] Improve S3 client initialization (#46723)
6080ca6d04 is described below
commit 6080ca6d04be6f65ffa8ceefa1a9e39a0663b7b9
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Jun 25 04:33:38 2025 +0200
GH-46214: [C++] Improve S3 client initialization (#46723)
### Rationale for this change
The default constructor of the `S3ClientConfiguration` class in the AWS SDK
issues spurious EC2 metadata requests, even though we later set up the
configuration values ourselves.
### What changes are included in this PR?
1. Avoid spurious EC2 metadata calls by disabling "IMDS" in the
`S3ClientConfiguration` constructor
2. Change the smart defaults from "legacy" to "standard" (see
https://docs.aws.amazon.com/sdkref/latest/guide/feature-smart-config-defaults.html)
3. Let the user configure the smart defaults in `S3Options`
Benchmarks on my local work machine:
* on git main:
```python
>>> %time fs.S3FileSystem(anonymous=True)
CPU times: user 316 μs, sys: 1.01 ms, total: 1.33 ms
Wall time: 2 s
<pyarrow._s3fs.S3FileSystem at 0x7d2182772fb0>
>>> %time fs.S3FileSystem(access_key='key', secret_key='secret')
CPU times: user 1.43 ms, sys: 0 ns, total: 1.43 ms
Wall time: 2 s
<pyarrow._s3fs.S3FileSystem at 0x7d22602337b0>
>>> %time fs.S3FileSystem()
CPU times: user 6.72 ms, sys: 3.27 ms, total: 9.99 ms
Wall time: 12 s
<pyarrow._s3fs.S3FileSystem at 0x7d226003bc30>
```
* on this PR:
```python
>>> %time fs.S3FileSystem(anonymous=True)
CPU times: user 199 μs, sys: 0 ns, total: 199 μs
Wall time: 203 μs
<pyarrow._s3fs.S3FileSystem at 0x7d6c401c01b0>
>>> %time fs.S3FileSystem(access_key='key', secret_key='secret')
CPU times: user 198 μs, sys: 10 μs, total: 208 μs
Wall time: 212 μs
<pyarrow._s3fs.S3FileSystem at 0x7d6c4b2e2c30>
>>> %time fs.S3FileSystem()
CPU times: user 13.5 ms, sys: 3.52 ms, total: 17.1 ms
Wall time: 14 s
<pyarrow._s3fs.S3FileSystem at 0x7d6c40158df0>
```
### Are these changes tested?
By existing CI tests and configurations.
### Are there any user-facing changes?
The default S3 settings are potentially changed. Hopefully this will not
trigger any regression in behavior.
* GitHub Issue: #46214
Authored-by: Antoine Pitrou <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
cpp/src/arrow/filesystem/s3fs.cc | 15 +++++++++++++--
cpp/src/arrow/filesystem/s3fs.h | 6 ++++++
cpp/src/arrow/filesystem/s3fs_test.cc | 7 +++++++
3 files changed, 26 insertions(+), 2 deletions(-)
diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc
index 62daab249c..0477417150 100644
--- a/cpp/src/arrow/filesystem/s3fs.cc
+++ b/cpp/src/arrow/filesystem/s3fs.cc
@@ -398,6 +398,8 @@ Result<S3Options> S3Options::FromUri(const Uri& uri,
std::string* out_path) {
} else if (kv.first == "tls_verify_certificates") {
ARROW_ASSIGN_OR_RAISE(options.tls_verify_certificates,
::arrow::internal::ParseBoolean(kv.second));
+ } else if (kv.first == "smart_defaults") {
+ options.smart_defaults = kv.second;
} else {
return Status::Invalid("Unexpected query parameter in S3 URI: '",
kv.first, "'");
}
@@ -424,7 +426,8 @@ bool S3Options::Equals(const S3Options& other) const {
default_metadata_size
? (other.default_metadata &&
other.default_metadata->Equals(*default_metadata))
: (!other.default_metadata || other.default_metadata->size() == 0);
- return (region == other.region && connect_timeout == other.connect_timeout &&
+ return (smart_defaults == other.smart_defaults && region == other.region &&
+ connect_timeout == other.connect_timeout &&
request_timeout == other.request_timeout &&
endpoint_override == other.endpoint_override && scheme ==
other.scheme &&
role_arn == other.role_arn && session_name == other.session_name &&
@@ -1069,7 +1072,15 @@ class EndpointProviderCache {
class ClientBuilder {
public:
- explicit ClientBuilder(S3Options options) : options_(std::move(options)) {}
+ // Make sure the default S3ClientConfiguration constructor is never invoked
(see below)
+ ClientBuilder() = delete;
+
+ explicit ClientBuilder(S3Options options)
+ : options_(std::move(options)),
+ // The S3ClientConfiguration constructor always does EC2 metadata
lookups,
+ // unless IMDS is disabled (GH-46214).
+ client_config_(/*useSmartDefaults=*/true,
options_.smart_defaults.c_str(),
+ /*shouldDisableIMDS=*/true) {}
const Aws::Client::ClientConfiguration& config() const { return
client_config_; }
diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h
index 05451b2312..158d70a93f 100644
--- a/cpp/src/arrow/filesystem/s3fs.h
+++ b/cpp/src/arrow/filesystem/s3fs.h
@@ -96,6 +96,12 @@ class ARROW_EXPORT S3RetryStrategy {
/// Options for the S3FileSystem implementation.
struct ARROW_EXPORT S3Options {
+ /// \brief Smart defaults for option values
+ ///
+ /// The possible values for this setting are explained in the AWS docs:
+ ///
https://docs.aws.amazon.com/sdkref/latest/guide/feature-smart-config-defaults.html
+ std::string smart_defaults = "standard";
+
/// \brief AWS region to connect to.
///
/// If unset, the AWS SDK will choose a default value. The exact algorithm
diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc
b/cpp/src/arrow/filesystem/s3fs_test.cc
index 370f3b2685..f0a5d0e2e4 100644
--- a/cpp/src/arrow/filesystem/s3fs_test.cc
+++ b/cpp/src/arrow/filesystem/s3fs_test.cc
@@ -307,6 +307,7 @@ TEST_F(S3OptionsTest, FromUri) {
ASSERT_EQ(options.region, "");
ASSERT_EQ(options.scheme, "https");
ASSERT_EQ(options.endpoint_override, "");
+ ASSERT_EQ(options.smart_defaults, "standard");
ASSERT_EQ(path, "");
ASSERT_OK_AND_ASSIGN(options, S3Options::FromUri("s3:", &path));
@@ -330,6 +331,12 @@ TEST_F(S3OptionsTest, FromUri) {
ASSERT_EQ(options.endpoint_override, "");
ASSERT_EQ(path, "mybucket/foo/bar");
+ ASSERT_OK_AND_ASSIGN(
+ options, S3Options::FromUri(
+ "s3://?allow_bucket_creation=true&smart_defaults=legacy",
&path));
+ ASSERT_TRUE(options.allow_bucket_creation);
+ ASSERT_EQ(options.smart_defaults, "legacy");
+
// Region resolution with a well-known bucket
ASSERT_OK_AND_ASSIGN(
options, S3Options::FromUri("s3://aws-earth-mo-atmospheric-ukv-prd/",
&path));