This is an automated email from the ASF dual-hosted git repository.
pdallig pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/zeppelin.git
The following commit(s) were added to refs/heads/master by this push:
new b6e40d4224 [ZEPPELIN-6157] Download artifacts from CDN if available
b6e40d4224 is described below
commit b6e40d4224a274d96834e75453724451f9e557fe
Author: Doroszlai, Attila <[email protected]>
AuthorDate: Thu Dec 19 10:19:39 2024 +0100
[ZEPPELIN-6157] Download artifacts from CDN if available
## What changes were proposed in this pull request?
Current artifacts available in CDN (`dlcdn.apache.org`) may get removed
without notice when new releases appear. To avoid broken links, build scripts
contain permanent addresses from `archive.apache.org`. But download from
`archive.apache.org` may be slow:
```
Thu, 05 Dec 2024 08:39:53 GMT [INFO] --- download:1.6.0:wget
(download-sparkr-files) @ r ---
Thu, 05 Dec 2024 08:39:54 GMT Warning: No signatures were supplied,
skipping file validation
Thu, 05 Dec 2024 08:39:54 GMT [INFO] Read Timeout is set to 60000
milliseconds (apprx 1 minutes)
Thu, 05 Dec 2024 08:45:46 GMT [INFO] Expanding:
/home/runner/work/zeppelin/zeppelin/rlang/target/spark-3.5.3-bin-without-hadoop.tgz
into /home/runner/work/zeppelin/zeppelin/rlang/target
```
Apache Infra's [`closer.lua`
script](https://infra.apache.org/release-download-pages.html#closer) can
redirect to CDN or archive, depending on artifact availability.
This change replaces `archive.apache.org` URLs, and one instance of
`dist.apache.org`, with their `closer.lua` equivalents. Unfortunately, the
output filename has to be specified explicitly for `wget`.
https://issues.apache.org/jira/browse/ZEPPELIN-6157
## How was this patch tested?
Tried some of the URLs locally, both from CLI (`curl -L --head`) and
regular build (`mvn -DskipTests clean package`).
Full CI:
- quick: https://github.com/adoroszlai/zeppelin/actions/runs/12319072153
- frontend: https://github.com/adoroszlai/zeppelin/actions/runs/12319072142
- core: https://github.com/adoroszlai/zeppelin/actions/runs/12319072156
Closes #4901 from adoroszlai/ZEPPELIN-6157.
Signed-off-by: Philipp Dallig <[email protected]>
---
docs/quickstart/kubernetes.md | 2 +-
docs/setup/deployment/flink_and_spark_cluster.md | 4 ++--
flink/flink-scala-2.12/pom.xml | 3 ++-
rlang/pom.xml | 3 ++-
scripts/docker/interpreter/Dockerfile | 2 +-
.../docker/spark-cluster-managers/spark_yarn_cluster/Dockerfile | 4 ++--
scripts/docker/zeppelin/bin/Dockerfile | 2 +-
spark/interpreter/pom.xml | 6 ++++--
spark/pom.xml | 4 ++--
testing/downloadLivy.sh | 6 ++++--
testing/downloadSpark.sh | 8 +++++---
11 files changed, 26 insertions(+), 18 deletions(-)
diff --git a/docs/quickstart/kubernetes.md b/docs/quickstart/kubernetes.md
index 470614f2f0..f60003f40a 100644
--- a/docs/quickstart/kubernetes.md
+++ b/docs/quickstart/kubernetes.md
@@ -179,7 +179,7 @@ $ mv zeppelin-distribution/target/zeppelin-*-bin.tgz
scripts/docker/zeppelin/bin
# Find following section and comment out
#RUN echo "$LOG_TAG Download Zeppelin binary" && \
-# wget -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz
http://archive.apache.org/dist/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz
&& \
+# wget -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz
"https://www.apache.org/dyn/closer.lua/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz?action=download"
&& \
# tar -zxvf /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \
# rm -rf /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \
# mv /zeppelin-${Z_VERSION}-bin-all ${ZEPPELIN_HOME}
diff --git a/docs/setup/deployment/flink_and_spark_cluster.md
b/docs/setup/deployment/flink_and_spark_cluster.md
index df5df80d9a..070b2af0f5 100644
--- a/docs/setup/deployment/flink_and_spark_cluster.md
+++ b/docs/setup/deployment/flink_and_spark_cluster.md
@@ -215,7 +215,7 @@ Building from source is recommended where possible, for
simplicity in this tuto
To download the Flink Binary use `wget`
```bash
-wget
"https://archive.apache.org/dist/flink/flink-1.17.1/flink-1.17.1-bin-scala_2.12.tgz"
+wget -O flink-1.17.1-bin-scala_2.12.tgz
"https://www.apache.org/dyn/closer.lua/flink/flink-1.17.1/flink-1.17.1-bin-scala_2.12.tgz?action=download"
tar -xzvf flink-1.17.1-bin-scala_2.12.tgz
```
@@ -285,7 +285,7 @@ Using binaries is also
To download the Spark Binary use `wget`
```bash
-wget
"https://archive.apache.org/dist/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz"
+wget -O spark-3.5.2-bin-hadoop3.tgz
"https://www.apache.org/dyn/closer.lua/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz?action=download"
tar -xzvf spark-3.5.2-bin-hadoop3.tgz
mv spark-3.5.2-bin-hadoop3 spark
```
diff --git a/flink/flink-scala-2.12/pom.xml b/flink/flink-scala-2.12/pom.xml
index f1939861c1..e624f0d3fb 100644
--- a/flink/flink-scala-2.12/pom.xml
+++ b/flink/flink-scala-2.12/pom.xml
@@ -42,7 +42,7 @@
<derby.version>10.14.2.0</derby.version>
<hiverunner.version>5.3.0</hiverunner.version>
-
<flink.bin.download.url>https://archive.apache.org/dist/flink/flink-${flink.version}/flink-${flink.version}-bin-scala_${flink.scala.binary.version}.tgz</flink.bin.download.url>
+
<flink.bin.download.url>https://www.apache.org/dyn/closer.lua/flink/flink-${flink.version}/flink-${flink.version}-bin-scala_${flink.scala.binary.version}.tgz?action=download</flink.bin.download.url>
</properties>
<dependencies>
@@ -1056,6 +1056,7 @@
<url>${flink.bin.download.url}</url>
<unpack>true</unpack>
<outputDirectory>${project.build.directory}</outputDirectory>
+
<outputFileName>flink-${flink.version}-bin-scala_${flink.scala.binary.version}.tgz</outputFileName>
</configuration>
</execution>
</executions>
diff --git a/rlang/pom.xml b/rlang/pom.xml
index f70af86c8f..38852e39b3 100644
--- a/rlang/pom.xml
+++ b/rlang/pom.xml
@@ -38,7 +38,7 @@
<spark.archive>spark-${spark.version}</spark.archive>
<spark.bin.download.url>
-
https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz
+
https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz?action=download
</spark.bin.download.url>
<interpreter.jar.name>zeppelin-interpreter-r</interpreter.jar.name>
</properties>
@@ -154,6 +154,7 @@
<url>${spark.bin.download.url}</url>
<unpack>true</unpack>
<outputDirectory>${project.build.directory}</outputDirectory>
+
<outputFileName>${spark.archive}-bin-without-hadoop.tgz</outputFileName>
</configuration>
</execution>
</executions>
diff --git a/scripts/docker/interpreter/Dockerfile
b/scripts/docker/interpreter/Dockerfile
index ab7f9668e1..2de94c88c7 100644
--- a/scripts/docker/interpreter/Dockerfile
+++ b/scripts/docker/interpreter/Dockerfile
@@ -30,7 +30,7 @@ RUN apt-get update && apt-get install -y curl unzip wget grep
sed vim tzdata &&
RUN rm -rf /opt/zeppelin
RUN rm -rf /spark
-RUN wget
https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz
+RUN wget -O spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz
"https://www.apache.org/dyn/closer.lua/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz?action=download"
RUN tar zxvf spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz
RUN mv spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME} /opt/spark
RUN rm spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz
diff --git
a/scripts/docker/spark-cluster-managers/spark_yarn_cluster/Dockerfile
b/scripts/docker/spark-cluster-managers/spark_yarn_cluster/Dockerfile
index da3df1c10d..01b15308fd 100644
--- a/scripts/docker/spark-cluster-managers/spark_yarn_cluster/Dockerfile
+++ b/scripts/docker/spark-cluster-managers/spark_yarn_cluster/Dockerfile
@@ -42,7 +42,7 @@ ENV PATH $PATH:$JAVA_HOME/bin
RUN yum install -y curl which tar sudo openssh-server openssh-clients rsync
# hadoop
-RUN curl -s
https://archive.apache.org/dist/hadoop/core/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz
| tar -xz -C /usr/local/
+RUN curl -s
"https://www.apache.org/dyn/closer.lua/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz?action=download"
| tar -xz -C /usr/local/
RUN cd /usr/local && ln -s ./hadoop-$HADOOP_VERSION hadoop
ENV HADOOP_PREFIX /usr/local/hadoop
@@ -72,7 +72,7 @@ RUN rm /usr/local/hadoop/lib/native/*
RUN curl -Ls
http://dl.bintray.com/sequenceiq/sequenceiq-bin/hadoop-native-64-$HADOOP_VERSION.tar|tar
-x -C /usr/local/hadoop/lib/native/
# install spark
-RUN curl -s
http://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_PROFILE.tgz
| tar -xz -C /usr/local/
+RUN curl -s
"https://www.apache.org/dyn/closer.lua/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_PROFILE.tgz?action=download"
| tar -xz -C /usr/local/
RUN cd /usr/local && ln -s spark-$SPARK_VERSION-bin-hadoop$HADOOP_PROFILE spark
ENV SPARK_HOME /usr/local/spark
diff --git a/scripts/docker/zeppelin/bin/Dockerfile
b/scripts/docker/zeppelin/bin/Dockerfile
index a04c077a08..40a3026711 100644
--- a/scripts/docker/zeppelin/bin/Dockerfile
+++ b/scripts/docker/zeppelin/bin/Dockerfile
@@ -65,7 +65,7 @@ ENV PATH
/opt/conda/envs/python_3_with_R/bin:/opt/conda/bin:$PATH
RUN echo "$LOG_TAG Download Zeppelin binary" && \
mkdir -p ${ZEPPELIN_HOME} && \
- wget -nv -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz
https://archive.apache.org/dist/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz
&& \
+ wget -nv -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz
"https://www.apache.org/dyn/closer.lua/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz?action=download"
&& \
tar --strip-components=1 -zxvf /tmp/zeppelin-${Z_VERSION}-bin-all.tgz -C
${ZEPPELIN_HOME} && \
rm -f /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \
chown -R root:root ${ZEPPELIN_HOME} && \
diff --git a/spark/interpreter/pom.xml b/spark/interpreter/pom.xml
index 2fbfc042b7..f77ca36017 100644
--- a/spark/interpreter/pom.xml
+++ b/spark/interpreter/pom.xml
@@ -48,10 +48,10 @@
<spark.archive>spark-${spark.version}</spark.archive>
<spark.src.download.url>
-
https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}.tgz
+
https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}.tgz?action=download
</spark.src.download.url>
<spark.bin.download.url>
-
https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz
+
https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz?action=download
</spark.bin.download.url>
<scala.compile.version>${spark.scala.version}</scala.compile.version>
@@ -280,6 +280,7 @@
<unpack>true</unpack>
<url>${spark.src.download.url}</url>
<outputDirectory>${project.build.directory}</outputDirectory>
+ <outputFileName>${spark.archive}.tgz</outputFileName>
</configuration>
</execution>
<!-- include sparkr by default -->
@@ -295,6 +296,7 @@
<url>${spark.bin.download.url}</url>
<unpack>true</unpack>
<outputDirectory>${project.build.directory}</outputDirectory>
+
<outputFileName>${spark.archive}-bin-without-hadoop.tgz</outputFileName>
</configuration>
</execution>
</executions>
diff --git a/spark/pom.xml b/spark/pom.xml
index 9e5c973811..5f122432d8 100644
--- a/spark/pom.xml
+++ b/spark/pom.xml
@@ -45,10 +45,10 @@
<spark.archive>spark-${spark.version}</spark.archive>
<spark.src.download.url>
-
https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}.tgz
+
https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}.tgz?action=download
</spark.src.download.url>
<spark.bin.download.url>
-
https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz
+
https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz?action=download
</spark.bin.download.url>
</properties>
diff --git a/testing/downloadLivy.sh b/testing/downloadLivy.sh
index f09837a757..fadd9973ee 100755
--- a/testing/downloadLivy.sh
+++ b/testing/downloadLivy.sh
@@ -45,12 +45,14 @@ ZEPPELIN_HOME="$(cd "${FWDIR}/.."; pwd)"
# None
# Arguments:
# url - source URL
+# file - output filename
# Returns:
# None
#######################################
download_with_retry() {
local url="$1"
- wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3
"${url}"
+ local file="${2:-$(basename $url)}"
+ wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3
--output-document "${file}" "${url}"
if [[ "$?" -ne 0 ]]; then
echo "3 download attempts for ${url} failed"
fi
@@ -72,7 +74,7 @@ if [[ ! -d "${LIVY_HOME}" ]]; then
# download livy from archive if not cached
echo "${LIVY_VERSION} being downloaded from archives"
STARTTIME=`date +%s`
- download_with_retry
"https://dist.apache.org/repos/dist/release/incubator/livy/${LIVY_VERSION}/${LIVY_ARCHIVE}.zip"
+ download_with_retry
"https://www.apache.org/dyn/closer.lua/incubator/livy/${LIVY_VERSION}/${LIVY_ARCHIVE}.zip?action=download"
"${LIVY_ARCHIVE}.zip"
ENDTIME=`date +%s`
DOWNLOADTIME="$((ENDTIME-STARTTIME))"
fi
diff --git a/testing/downloadSpark.sh b/testing/downloadSpark.sh
index 9c19e82bbc..118097b000 100755
--- a/testing/downloadSpark.sh
+++ b/testing/downloadSpark.sh
@@ -38,12 +38,14 @@ ZEPPELIN_HOME="$(cd "${FWDIR}/.."; pwd)"
# None
# Arguments:
# url - source URL
+# file - output filename
# Returns:
# None
#######################################
download_with_retry() {
local url="$1"
- wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3
"${url}"
+ local file="${2:-$(basename $url)}"
+ wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3
--output-document "${file}" "${url}"
if [[ "$?" -ne 0 ]]; then
echo "3 download attempts for ${url} failed"
fi
@@ -65,8 +67,8 @@ if [[ ! -d "${SPARK_HOME}" ]]; then
# download spark from archive if not cached
echo "${SPARK_VERSION} being downloaded from archives"
STARTTIME=`date +%s`
- #timeout -s KILL "${MAX_DOWNLOAD_TIME_SEC}" wget
"http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz"
- download_with_retry
"http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz"
+ #timeout -s KILL "${MAX_DOWNLOAD_TIME_SEC}" wget -O
"${SPARK_ARCHIVE}.tgz"
"https://www.apache.org/dyn/closer.lua/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz?action=download"
+ download_with_retry
"https://www.apache.org/dyn/closer.lua/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz?action=download"
"${SPARK_ARCHIVE}.tgz"
ENDTIME=`date +%s`
DOWNLOADTIME="$((ENDTIME-STARTTIME))"
fi