This is an automated email from the ASF dual-hosted git repository.

kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git


The following commit(s) were added to refs/heads/main by this push:
     new 6080ca6d04 GH-46214: [C++] Improve S3 client initialization (#46723)
6080ca6d04 is described below

commit 6080ca6d04be6f65ffa8ceefa1a9e39a0663b7b9
Author: Antoine Pitrou <[email protected]>
AuthorDate: Wed Jun 25 04:33:38 2025 +0200

    GH-46214: [C++] Improve S3 client initialization (#46723)
    
    ### Rationale for this change
    
    The default constructor of the `S3ClientConfiguration` class in the AWS SDK 
issues spurious EC2 metadata requests, even though we later set up the 
configuration values ourselves.
    
    ### What changes are included in this PR?
    
    1. Avoid spurious EC2 metadata calls by disabling "IMDS" in the 
`S3ClientConfiguration` constructor
    2. Change the smart defaults from "legacy" to "standard" (see 
https://docs.aws.amazon.com/sdkref/latest/guide/feature-smart-config-defaults.html)
    3. Let the user configure the smart defaults in `S3Options`
    
    Benchmarks on my local work machine:
    * on git main:
    ```python
    >>> %time fs.S3FileSystem(anonymous=True)
    CPU times: user 316 μs, sys: 1.01 ms, total: 1.33 ms
    Wall time: 2 s
    <pyarrow._s3fs.S3FileSystem at 0x7d2182772fb0>
    >>> %time fs.S3FileSystem(access_key='key', secret_key='secret')
    CPU times: user 1.43 ms, sys: 0 ns, total: 1.43 ms
    Wall time: 2 s
    <pyarrow._s3fs.S3FileSystem at 0x7d22602337b0>
    >>> %time fs.S3FileSystem()
    CPU times: user 6.72 ms, sys: 3.27 ms, total: 9.99 ms
    Wall time: 12 s
    <pyarrow._s3fs.S3FileSystem at 0x7d226003bc30>
    ```
    * on this PR:
    ```python
    >>> %time fs.S3FileSystem(anonymous=True)
    CPU times: user 199 μs, sys: 0 ns, total: 199 μs
    Wall time: 203 μs
    <pyarrow._s3fs.S3FileSystem at 0x7d6c401c01b0>
    >>> %time fs.S3FileSystem(access_key='key', secret_key='secret')
    CPU times: user 198 μs, sys: 10 μs, total: 208 μs
    Wall time: 212 μs
    <pyarrow._s3fs.S3FileSystem at 0x7d6c4b2e2c30>
    >>> %time fs.S3FileSystem()
    CPU times: user 13.5 ms, sys: 3.52 ms, total: 17.1 ms
    Wall time: 14 s
    <pyarrow._s3fs.S3FileSystem at 0x7d6c40158df0>
    ```
    ### Are these changes tested?
    
    By existing CI tests and configurations.
    
    ### Are there any user-facing changes?
    
    The default S3 settings are potentially changed. Hopefully this will not 
trigger any regression in behavior.
    
    * GitHub Issue: #46214
    
    Authored-by: Antoine Pitrou <[email protected]>
    Signed-off-by: Sutou Kouhei <[email protected]>
---
 cpp/src/arrow/filesystem/s3fs.cc      | 15 +++++++++++++--
 cpp/src/arrow/filesystem/s3fs.h       |  6 ++++++
 cpp/src/arrow/filesystem/s3fs_test.cc |  7 +++++++
 3 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/cpp/src/arrow/filesystem/s3fs.cc b/cpp/src/arrow/filesystem/s3fs.cc
index 62daab249c..0477417150 100644
--- a/cpp/src/arrow/filesystem/s3fs.cc
+++ b/cpp/src/arrow/filesystem/s3fs.cc
@@ -398,6 +398,8 @@ Result<S3Options> S3Options::FromUri(const Uri& uri, 
std::string* out_path) {
     } else if (kv.first == "tls_verify_certificates") {
       ARROW_ASSIGN_OR_RAISE(options.tls_verify_certificates,
                             ::arrow::internal::ParseBoolean(kv.second));
+    } else if (kv.first == "smart_defaults") {
+      options.smart_defaults = kv.second;
     } else {
       return Status::Invalid("Unexpected query parameter in S3 URI: '", 
kv.first, "'");
     }
@@ -424,7 +426,8 @@ bool S3Options::Equals(const S3Options& other) const {
       default_metadata_size
           ? (other.default_metadata && 
other.default_metadata->Equals(*default_metadata))
           : (!other.default_metadata || other.default_metadata->size() == 0);
-  return (region == other.region && connect_timeout == other.connect_timeout &&
+  return (smart_defaults == other.smart_defaults && region == other.region &&
+          connect_timeout == other.connect_timeout &&
           request_timeout == other.request_timeout &&
           endpoint_override == other.endpoint_override && scheme == 
other.scheme &&
           role_arn == other.role_arn && session_name == other.session_name &&
@@ -1069,7 +1072,15 @@ class EndpointProviderCache {
 
 class ClientBuilder {
  public:
-  explicit ClientBuilder(S3Options options) : options_(std::move(options)) {}
+  // Make sure the default S3ClientConfiguration constructor is never invoked 
(see below)
+  ClientBuilder() = delete;
+
+  explicit ClientBuilder(S3Options options)
+      : options_(std::move(options)),
+        // The S3ClientConfiguration constructor always does EC2 metadata 
lookups,
+        // unless IMDS is disabled (GH-46214).
+        client_config_(/*useSmartDefaults=*/true, 
options_.smart_defaults.c_str(),
+                       /*shouldDisableIMDS=*/true) {}
 
   const Aws::Client::ClientConfiguration& config() const { return 
client_config_; }
 
diff --git a/cpp/src/arrow/filesystem/s3fs.h b/cpp/src/arrow/filesystem/s3fs.h
index 05451b2312..158d70a93f 100644
--- a/cpp/src/arrow/filesystem/s3fs.h
+++ b/cpp/src/arrow/filesystem/s3fs.h
@@ -96,6 +96,12 @@ class ARROW_EXPORT S3RetryStrategy {
 
 /// Options for the S3FileSystem implementation.
 struct ARROW_EXPORT S3Options {
+  /// \brief Smart defaults for option values
+  ///
+  /// The possible values for this setting are explained in the AWS docs:
+  /// 
https://docs.aws.amazon.com/sdkref/latest/guide/feature-smart-config-defaults.html
+  std::string smart_defaults = "standard";
+
   /// \brief AWS region to connect to.
   ///
   /// If unset, the AWS SDK will choose a default value.  The exact algorithm
diff --git a/cpp/src/arrow/filesystem/s3fs_test.cc 
b/cpp/src/arrow/filesystem/s3fs_test.cc
index 370f3b2685..f0a5d0e2e4 100644
--- a/cpp/src/arrow/filesystem/s3fs_test.cc
+++ b/cpp/src/arrow/filesystem/s3fs_test.cc
@@ -307,6 +307,7 @@ TEST_F(S3OptionsTest, FromUri) {
   ASSERT_EQ(options.region, "");
   ASSERT_EQ(options.scheme, "https");
   ASSERT_EQ(options.endpoint_override, "");
+  ASSERT_EQ(options.smart_defaults, "standard");
   ASSERT_EQ(path, "");
 
   ASSERT_OK_AND_ASSIGN(options, S3Options::FromUri("s3:", &path));
@@ -330,6 +331,12 @@ TEST_F(S3OptionsTest, FromUri) {
   ASSERT_EQ(options.endpoint_override, "");
   ASSERT_EQ(path, "mybucket/foo/bar");
 
+  ASSERT_OK_AND_ASSIGN(
+      options, S3Options::FromUri(
+                   "s3://?allow_bucket_creation=true&smart_defaults=legacy", 
&path));
+  ASSERT_TRUE(options.allow_bucket_creation);
+  ASSERT_EQ(options.smart_defaults, "legacy");
+
   // Region resolution with a well-known bucket
   ASSERT_OK_AND_ASSIGN(
       options, S3Options::FromUri("s3://aws-earth-mo-atmospheric-ukv-prd/", 
&path));

Reply via email to