This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/spark.git
The following commit(s) were added to refs/heads/master by this push:
new d5c8dfc4666 [SPARK-45404][CORE] Support `AWS_ENDPOINT_URL` env variable
d5c8dfc4666 is described below
commit d5c8dfc466665f9794df9d6eba368720c0acffb2
Author: Dongjoon Hyun <[email protected]>
AuthorDate: Wed Oct 4 09:40:44 2023 -0700
[SPARK-45404][CORE] Support `AWS_ENDPOINT_URL` env variable
### What changes were proposed in this pull request?
This PR aims to support `AWS_ENDPOINT_URL` in addition to the existing
`AWS_*` environment variables.
### Why are the changes needed?
To improve usability. This is useful when using S3-compatible object stores.
- https://docs.aws.amazon.com/sdkref/latest/guide/settings-reference.html#EVarSettings
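For illustration, here is a minimal sketch of how the propagated endpoint can be observed from a driver once `AWS_ENDPOINT_URL`, `AWS_ACCESS_KEY_ID`, and `AWS_SECRET_ACCESS_KEY` are exported in the launcher environment. The object and application name are hypothetical; `fs.s3a.endpoint` is the property this patch sets, and the APIs used are the standard Spark core ones.

```scala
// Sketch only: with AWS_ENDPOINT_URL, AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY
// exported before spark-submit, the endpoint should surface as fs.s3a.endpoint in
// the Hadoop configuration Spark builds via SparkHadoopUtil.
import org.apache.spark.{SparkConf, SparkContext}

object EndpointCheck {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("EndpointCheck").setIfMissing("spark.master", "local[*]")
    val sc = new SparkContext(conf)
    // Expected to print the AWS_ENDPOINT_URL value when the key variables are also set.
    println(sc.hadoopConfiguration.get("fs.s3a.endpoint"))
    sc.stop()
  }
}
```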
### Does this PR introduce _any_ user-facing change?
This is a new feature. If a user has a misconfigured `AWS_ENDPOINT_URL`, requests will hang or, more likely, be denied during authentication.
### How was this patch tested?
Pass the CIs with the newly updated test case.
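For reference, a self-contained sketch of the behaviour the updated suite exercises (the object name and the endpoint value are placeholders; the sketch sits in the `org.apache.spark.deploy` package because the helper is `private[deploy]`):

```scala
package org.apache.spark.deploy

import org.apache.hadoop.conf.Configuration

object EndpointPropagationSketch {
  def main(args: Array[String]): Unit = {
    // Endpoint plus full credentials: fs.s3a.endpoint is expected to be populated.
    val withKeys = new Configuration(false)
    SparkHadoopUtil.appendS3CredentialsFromEnvironment(
      withKeys, "http://example-endpoint:9000", "access-key", "secret-key", "session-token")
    println(withKeys.get("fs.s3a.endpoint")) // http://example-endpoint:9000

    // Endpoint without access/secret keys: nothing is propagated, as the new test asserts.
    val withoutKeys = new Configuration(false)
    SparkHadoopUtil.appendS3CredentialsFromEnvironment(
      withoutKeys, "http://example-endpoint:9000", null, null, null)
    println(withoutKeys.get("fs.s3a.endpoint")) // null
  }
}
```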
### Was this patch authored or co-authored using generative AI tooling?
No.
Closes #43205 from dongjoon-hyun/SPARK-45404.
Authored-by: Dongjoon Hyun <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
---
.../scala/org/apache/spark/deploy/SparkHadoopUtil.scala | 11 +++++++++++
.../org/apache/spark/deploy/SparkHadoopUtilSuite.scala | 13 +++++++++++--
docs/cloud-integration.md | 2 +-
hadoop-cloud/README.md | 2 +-
4 files changed, 24 insertions(+), 4 deletions(-)
diff --git a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
index 78434f35a63..0bb3dab62ed 100644
--- a/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
+++ b/core/src/main/scala/org/apache/spark/deploy/SparkHadoopUtil.scala
@@ -387,6 +387,11 @@ private[spark] object SparkHadoopUtil extends Logging {
* `EnvironmentVariableCredentialsProvider`; those are not propagated.
*/
+ /**
+ * AWS Endpoint URL.
+ */
+ private[deploy] val ENV_VAR_AWS_ENDPOINT_URL = "AWS_ENDPOINT_URL"
+
/**
* AWS Access key.
*/
@@ -436,6 +441,7 @@ private[spark] object SparkHadoopUtil extends Logging {
// the behavior of the old implementation of this code, for backwards compatibility.
if (conf != null) {
appendS3CredentialsFromEnvironment(hadoopConf,
+ System.getenv(ENV_VAR_AWS_ENDPOINT_URL),
System.getenv(ENV_VAR_AWS_ACCESS_KEY),
System.getenv(ENV_VAR_AWS_SECRET_KEY),
System.getenv(ENV_VAR_AWS_SESSION_TOKEN))
@@ -463,6 +469,7 @@ private[spark] object SparkHadoopUtil extends Logging {
// Exposed for testing
private[deploy] def appendS3CredentialsFromEnvironment(
hadoopConf: Configuration,
+ endpointUrl: String,
keyId: String,
accessKey: String,
sessionToken: String): Unit = {
@@ -476,6 +483,10 @@ private[spark] object SparkHadoopUtil extends Logging {
hadoopConf.set("fs.s3n.awsSecretAccessKey", accessKey, source +
ENV_VAR_AWS_SECRET_KEY)
hadoopConf.set("fs.s3a.secret.key", accessKey, source +
ENV_VAR_AWS_SECRET_KEY)
+ if (endpointUrl != null) {
+ hadoopConf.set("fs.s3a.endpoint", endpointUrl, source +
ENV_VAR_AWS_ENDPOINT_URL)
+ }
+
// look for session token if the other variables were set
if (sessionToken != null) {
hadoopConf.set("fs.s3a.session.token", sessionToken,
diff --git a/core/src/test/scala/org/apache/spark/deploy/SparkHadoopUtilSuite.scala b/core/src/test/scala/org/apache/spark/deploy/SparkHadoopUtilSuite.scala
index 6250b7d0ed2..2326d10d416 100644
--- a/core/src/test/scala/org/apache/spark/deploy/SparkHadoopUtilSuite.scala
+++ b/core/src/test/scala/org/apache/spark/deploy/SparkHadoopUtilSuite.scala
@@ -110,8 +110,9 @@ class SparkHadoopUtilSuite extends SparkFunSuite {
test("SPARK-40640: aws credentials from environment variables") {
val hadoopConf = new Configuration(false)
SparkHadoopUtil.appendS3CredentialsFromEnvironment(hadoopConf,
- "access-key", "secret-key", "session-token")
+ "endpoint", "access-key", "secret-key", "session-token")
val source = "Set by Spark on " + InetAddress.getLocalHost + " from "
+ assertConfigMatches(hadoopConf, "fs.s3a.endpoint", "endpoint", source)
assertConfigMatches(hadoopConf, "fs.s3a.access.key", "access-key", source)
assertConfigMatches(hadoopConf, "fs.s3a.secret.key", "secret-key", source)
assertConfigMatches(hadoopConf, "fs.s3a.session.token", "session-token",
source)
@@ -119,10 +120,18 @@ class SparkHadoopUtilSuite extends SparkFunSuite {
test("SPARK-19739: S3 session token propagation requires access and secret
keys") {
val hadoopConf = new Configuration(false)
- SparkHadoopUtil.appendS3CredentialsFromEnvironment(hadoopConf, null, null, "session-token")
+ SparkHadoopUtil.appendS3CredentialsFromEnvironment(
+ hadoopConf, null, null, null, "session-token")
assertConfigValue(hadoopConf, "fs.s3a.session.token", null)
}
+ test("SPARK-45404: aws endpoint propagation requires access and secret
keys") {
+ val hadoopConf = new Configuration(false)
+ SparkHadoopUtil.appendS3CredentialsFromEnvironment(
+ hadoopConf, "endpoint", null, null, null)
+ assertConfigValue(hadoopConf, "fs.s3a.endpoint", null)
+ }
+
test("substituteHadoopVariables") {
val hadoopConf = new Configuration(false)
hadoopConf.set("xxx", "yyy")
diff --git a/docs/cloud-integration.md b/docs/cloud-integration.md
index 06342645e6d..6b376c2f95c 100644
--- a/docs/cloud-integration.md
+++ b/docs/cloud-integration.md
@@ -121,7 +121,7 @@ for talking to cloud infrastructures, in which case this module may not be neede
Spark jobs must authenticate with the object stores to access data within them.
1. When Spark is running in a cloud infrastructure, the credentials are usually automatically set up.
-1. `spark-submit` reads the `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`
+1. `spark-submit` is able to read the `AWS_ENDPOINT_URL`, `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`
and `AWS_SESSION_TOKEN` environment variables and sets the associated authentication options
for the `s3n` and `s3a` connectors to Amazon S3.
1. In a Hadoop cluster, settings may be set in the `core-site.xml` file.
diff --git a/hadoop-cloud/README.md b/hadoop-cloud/README.md
index 0be167e6ef8..840ff1576f5 100644
--- a/hadoop-cloud/README.md
+++ b/hadoop-cloud/README.md
@@ -17,4 +17,4 @@ run the test against. Those configs are passed as environment variables and the
variables must be checked by the test.
Like for `AwsS3AbortableStreamBasedCheckpointFileManagerSuite` the S3 bucket used for testing
is passed in the `S3A_PATH` and the credentials to access AWS S3 are AWS_ACCESS_KEY_ID and
-AWS_SECRET_ACCESS_KEY (in addition you can define an optional AWS_SESSION_TOKEN too).
+AWS_SECRET_ACCESS_KEY (in addition you can define optional AWS_SESSION_TOKEN and AWS_ENDPOINT_URL too).
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]