This is an automated email from the ASF dual-hosted git repository.

pdallig pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/zeppelin.git


The following commit(s) were added to refs/heads/master by this push:
     new b6e40d4224 [ZEPPELIN-6157] Download artifacts from CDN if available
b6e40d4224 is described below

commit b6e40d4224a274d96834e75453724451f9e557fe
Author: Doroszlai, Attila <6454655+adorosz...@users.noreply.github.com>
AuthorDate: Thu Dec 19 10:19:39 2024 +0100

    [ZEPPELIN-6157] Download artifacts from CDN if available
    
    ## What changes were proposed in this pull request?
    
    Artifacts currently available on the CDN (`dlcdn.apache.org`) may be
    removed without notice when new releases appear.  To avoid broken
    links, the build scripts use permanent addresses from
    `archive.apache.org`.  But downloads from `archive.apache.org` can be
    slow:
    
    ```
    Thu, 05 Dec 2024 08:39:53 GMT [INFO] --- download:1.6.0:wget (download-sparkr-files) @ r ---
    Thu, 05 Dec 2024 08:39:54 GMT Warning:  No signatures were supplied, skipping file validation
    Thu, 05 Dec 2024 08:39:54 GMT [INFO] Read Timeout is set to 60000 milliseconds (apprx 1 minutes)
    Thu, 05 Dec 2024 08:45:46 GMT [INFO] Expanding: /home/runner/work/zeppelin/zeppelin/rlang/target/spark-3.5.3-bin-without-hadoop.tgz into /home/runner/work/zeppelin/zeppelin/rlang/target
    ```
    
    Apache Infra's [`closer.lua` script](https://infra.apache.org/release-download-pages.html#closer)
    can redirect to the CDN or to the archive, depending on artifact
    availability.
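
    A quick way to see this in action (illustrative command; the
    `${Z_VERSION}` placeholder stands in for a concrete release and is
    not part of this patch):

    ```bash
    # closer.lua answers with a redirect to the CDN or to archive.apache.org,
    # depending on where the artifact currently lives; -L follows the redirect.
    curl -L --head "https://www.apache.org/dyn/closer.lua/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz?action=download"
    ```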
    
    This change replaces the `archive.apache.org` URLs, and one instance
    of `dist.apache.org`, with their `closer.lua` equivalents.
    Unfortunately, the output filename has to be specified explicitly for
    `wget`; see the sketch below.
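
    The reason: without `-O`, `wget` derives the local filename from the
    URL, query string included.  A minimal sketch of the pattern applied
    throughout this change (the Spark version here is only an example):

    ```bash
    # Default naming would produce "spark-3.5.2-bin-hadoop3.tgz?action=download",
    # so the output filename is given explicitly:
    wget -O spark-3.5.2-bin-hadoop3.tgz \
      "https://www.apache.org/dyn/closer.lua/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz?action=download"
    ```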
    
    https://issues.apache.org/jira/browse/ZEPPELIN-6157
    
    ## How was this patch tested?
    
    Tried some of the URLs locally, both from the CLI (`curl -L --head`)
    and with a regular build (`mvn -DskipTests clean package`).
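
    The updated `download_with_retry` helper in `testing/downloadLivy.sh`
    and `testing/downloadSpark.sh` (see the diff below) takes the output
    filename as an optional second argument, falling back to the basename
    of the URL.  Hypothetical invocations:

    ```bash
    # Explicit filename, needed for closer.lua URLs:
    download_with_retry "https://www.apache.org/dyn/closer.lua/spark/spark-3.5.3/spark-3.5.3-bin-without-hadoop.tgz?action=download" "spark-3.5.3-bin-without-hadoop.tgz"
    # Plain archive URL; the filename falls back to the basename:
    download_with_retry "https://archive.apache.org/dist/spark/spark-3.5.3/spark-3.5.3-bin-without-hadoop.tgz"
    ```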
    
    Full CI:
    - quick: https://github.com/adoroszlai/zeppelin/actions/runs/12319072153
    - frontend: https://github.com/adoroszlai/zeppelin/actions/runs/12319072142
    - core: https://github.com/adoroszlai/zeppelin/actions/runs/12319072156
    
    Closes #4901 from adoroszlai/ZEPPELIN-6157.
    
    Signed-off-by: Philipp Dallig <philipp.dal...@gmail.com>
---
 docs/quickstart/kubernetes.md                                     | 2 +-
 docs/setup/deployment/flink_and_spark_cluster.md                  | 4 ++--
 flink/flink-scala-2.12/pom.xml                                    | 3 ++-
 rlang/pom.xml                                                     | 3 ++-
 scripts/docker/interpreter/Dockerfile                             | 2 +-
 .../docker/spark-cluster-managers/spark_yarn_cluster/Dockerfile   | 4 ++--
 scripts/docker/zeppelin/bin/Dockerfile                            | 2 +-
 spark/interpreter/pom.xml                                         | 6 ++++--
 spark/pom.xml                                                     | 4 ++--
 testing/downloadLivy.sh                                           | 6 ++++--
 testing/downloadSpark.sh                                          | 8 +++++---
 11 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/docs/quickstart/kubernetes.md b/docs/quickstart/kubernetes.md
index 470614f2f0..f60003f40a 100644
--- a/docs/quickstart/kubernetes.md
+++ b/docs/quickstart/kubernetes.md
@@ -179,7 +179,7 @@ $ mv zeppelin-distribution/target/zeppelin-*-bin.tgz scripts/docker/zeppelin/bin
 
 # Find following section and comment out
 #RUN echo "$LOG_TAG Download Zeppelin binary" && \
-#    wget -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz http://archive.apache.org/dist/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz && \
+#    wget -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz "https://www.apache.org/dyn/closer.lua/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz?action=download" && \
 #    tar -zxvf /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \
 #    rm -rf /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \
 #    mv /zeppelin-${Z_VERSION}-bin-all ${ZEPPELIN_HOME}
diff --git a/docs/setup/deployment/flink_and_spark_cluster.md b/docs/setup/deployment/flink_and_spark_cluster.md
index df5df80d9a..070b2af0f5 100644
--- a/docs/setup/deployment/flink_and_spark_cluster.md
+++ b/docs/setup/deployment/flink_and_spark_cluster.md
@@ -215,7 +215,7 @@ Building from source is recommended  where possible, for simplicity in this tuto
 To download the Flink Binary use `wget`
 
 ```bash
-wget "https://archive.apache.org/dist/flink/flink-1.17.1/flink-1.17.1-bin-scala_2.12.tgz"
+wget -O flink-1.17.1-bin-scala_2.12.tgz "https://www.apache.org/dyn/closer.lua/flink/flink-1.17.1/flink-1.17.1-bin-scala_2.12.tgz?action=download"
 tar -xzvf flink-1.17.1-bin-scala_2.12.tgz
 ```
 
@@ -285,7 +285,7 @@ Using binaries is also
 To download the Spark Binary use `wget`
 
 ```bash
-wget "https://archive.apache.org/dist/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz"
+wget -O spark-3.5.2-bin-hadoop3.tgz "https://www.apache.org/dyn/closer.lua/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz?action=download"
 tar -xzvf spark-3.5.2-bin-hadoop3.tgz
 mv spark-3.5.2-bin-hadoop3 spark
 ```
diff --git a/flink/flink-scala-2.12/pom.xml b/flink/flink-scala-2.12/pom.xml
index f1939861c1..e624f0d3fb 100644
--- a/flink/flink-scala-2.12/pom.xml
+++ b/flink/flink-scala-2.12/pom.xml
@@ -42,7 +42,7 @@
     <derby.version>10.14.2.0</derby.version>
     <hiverunner.version>5.3.0</hiverunner.version>
 
-    <flink.bin.download.url>https://archive.apache.org/dist/flink/flink-${flink.version}/flink-${flink.version}-bin-scala_${flink.scala.binary.version}.tgz</flink.bin.download.url>
+    <flink.bin.download.url>https://www.apache.org/dyn/closer.lua/flink/flink-${flink.version}/flink-${flink.version}-bin-scala_${flink.scala.binary.version}.tgz?action=download</flink.bin.download.url>
   </properties>
   
   <dependencies>
@@ -1056,6 +1056,7 @@
               <url>${flink.bin.download.url}</url>
               <unpack>true</unpack>
               <outputDirectory>${project.build.directory}</outputDirectory>
+              <outputFileName>flink-${flink.version}-bin-scala_${flink.scala.binary.version}.tgz</outputFileName>
             </configuration>
           </execution>
         </executions>
diff --git a/rlang/pom.xml b/rlang/pom.xml
index f70af86c8f..38852e39b3 100644
--- a/rlang/pom.xml
+++ b/rlang/pom.xml
@@ -38,7 +38,7 @@
 
         <spark.archive>spark-${spark.version}</spark.archive>
         <spark.bin.download.url>
-            https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz
+            https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz?action=download
         </spark.bin.download.url>
         <interpreter.jar.name>zeppelin-interpreter-r</interpreter.jar.name>
     </properties>
@@ -154,6 +154,7 @@
                             <url>${spark.bin.download.url}</url>
                            <unpack>true</unpack>
                            <outputDirectory>${project.build.directory}</outputDirectory>
+                            <outputFileName>${spark.archive}-bin-without-hadoop.tgz</outputFileName>
                         </configuration>
                     </execution>
                 </executions>
diff --git a/scripts/docker/interpreter/Dockerfile b/scripts/docker/interpreter/Dockerfile
index ab7f9668e1..2de94c88c7 100644
--- a/scripts/docker/interpreter/Dockerfile
+++ b/scripts/docker/interpreter/Dockerfile
@@ -30,7 +30,7 @@ RUN apt-get update && apt-get install -y curl unzip wget grep sed vim tzdata &&
 RUN rm -rf /opt/zeppelin
 
 RUN rm -rf /spark
-RUN wget https://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz
+RUN wget -O spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz "https://www.apache.org/dyn/closer.lua/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz?action=download"
 RUN tar zxvf spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz
 RUN mv spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME} /opt/spark
 RUN rm spark-${SPARK_VERSION}-bin-${SPARK_BIN_NAME}.tgz
diff --git a/scripts/docker/spark-cluster-managers/spark_yarn_cluster/Dockerfile b/scripts/docker/spark-cluster-managers/spark_yarn_cluster/Dockerfile
index da3df1c10d..01b15308fd 100644
--- a/scripts/docker/spark-cluster-managers/spark_yarn_cluster/Dockerfile
+++ b/scripts/docker/spark-cluster-managers/spark_yarn_cluster/Dockerfile
@@ -42,7 +42,7 @@ ENV PATH $PATH:$JAVA_HOME/bin
 RUN yum install -y curl which tar sudo openssh-server openssh-clients rsync
 
 # hadoop
-RUN curl -s https://archive.apache.org/dist/hadoop/core/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz | tar -xz -C /usr/local/
+RUN curl -s "https://www.apache.org/dyn/closer.lua/hadoop/common/hadoop-$HADOOP_VERSION/hadoop-$HADOOP_VERSION.tar.gz?action=download" | tar -xz -C /usr/local/
 RUN cd /usr/local && ln -s ./hadoop-$HADOOP_VERSION hadoop
 
 ENV HADOOP_PREFIX /usr/local/hadoop
@@ -72,7 +72,7 @@ RUN rm  /usr/local/hadoop/lib/native/*
 RUN curl -Ls http://dl.bintray.com/sequenceiq/sequenceiq-bin/hadoop-native-64-$HADOOP_VERSION.tar|tar -x -C /usr/local/hadoop/lib/native/
 
 # install spark
-RUN curl -s http://archive.apache.org/dist/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_PROFILE.tgz | tar -xz -C /usr/local/
+RUN curl -s "https://www.apache.org/dyn/closer.lua/spark/spark-$SPARK_VERSION/spark-$SPARK_VERSION-bin-hadoop$HADOOP_PROFILE.tgz?action=download" | tar -xz -C /usr/local/
 RUN cd /usr/local && ln -s spark-$SPARK_VERSION-bin-hadoop$HADOOP_PROFILE spark
 ENV SPARK_HOME /usr/local/spark
 
diff --git a/scripts/docker/zeppelin/bin/Dockerfile b/scripts/docker/zeppelin/bin/Dockerfile
index a04c077a08..40a3026711 100644
--- a/scripts/docker/zeppelin/bin/Dockerfile
+++ b/scripts/docker/zeppelin/bin/Dockerfile
@@ -65,7 +65,7 @@ ENV PATH /opt/conda/envs/python_3_with_R/bin:/opt/conda/bin:$PATH
 
 RUN echo "$LOG_TAG Download Zeppelin binary" && \
     mkdir -p ${ZEPPELIN_HOME} && \
-    wget -nv -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz https://archive.apache.org/dist/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz && \
+    wget -nv -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz "https://www.apache.org/dyn/closer.lua/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz?action=download" && \
    tar --strip-components=1 -zxvf  /tmp/zeppelin-${Z_VERSION}-bin-all.tgz -C ${ZEPPELIN_HOME} && \
     rm -f /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \
     chown -R root:root ${ZEPPELIN_HOME} && \
diff --git a/spark/interpreter/pom.xml b/spark/interpreter/pom.xml
index 2fbfc042b7..f77ca36017 100644
--- a/spark/interpreter/pom.xml
+++ b/spark/interpreter/pom.xml
@@ -48,10 +48,10 @@
 
     <spark.archive>spark-${spark.version}</spark.archive>
     <spark.src.download.url>
-      https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}.tgz
+      https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}.tgz?action=download
     </spark.src.download.url>
     <spark.bin.download.url>
-      https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz
+      https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz?action=download
     </spark.bin.download.url>
 
     <scala.compile.version>${spark.scala.version}</scala.compile.version>
@@ -280,6 +280,7 @@
               <unpack>true</unpack>
               <url>${spark.src.download.url}</url>
               <outputDirectory>${project.build.directory}</outputDirectory>
+              <outputFileName>${spark.archive}.tgz</outputFileName>
             </configuration>
           </execution>
           <!-- include sparkr by default -->
@@ -295,6 +296,7 @@
               <url>${spark.bin.download.url}</url>
               <unpack>true</unpack>
               <outputDirectory>${project.build.directory}</outputDirectory>
+              <outputFileName>${spark.archive}-bin-without-hadoop.tgz</outputFileName>
             </configuration>
           </execution>
         </executions>
diff --git a/spark/pom.xml b/spark/pom.xml
index 9e5c973811..5f122432d8 100644
--- a/spark/pom.xml
+++ b/spark/pom.xml
@@ -45,10 +45,10 @@
 
         <spark.archive>spark-${spark.version}</spark.archive>
         <spark.src.download.url>
-            https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}.tgz
+            https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}.tgz?action=download
         </spark.src.download.url>
         <spark.bin.download.url>
-            https://archive.apache.org/dist/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz
+            https://www.apache.org/dyn/closer.lua/spark/${spark.archive}/${spark.archive}-bin-without-hadoop.tgz?action=download
         </spark.bin.download.url>
     </properties>
 
diff --git a/testing/downloadLivy.sh b/testing/downloadLivy.sh
index f09837a757..fadd9973ee 100755
--- a/testing/downloadLivy.sh
+++ b/testing/downloadLivy.sh
@@ -45,12 +45,14 @@ ZEPPELIN_HOME="$(cd "${FWDIR}/.."; pwd)"
 #   None
 # Arguments:
 #   url - source URL
+#   file - output filename
 # Returns:
 #   None
 #######################################
 download_with_retry() {
     local url="$1"
-    wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3 "${url}"
+    local file="${2:-$(basename $url)}"
+    wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3 --output-document "${file}" "${url}"
     if [[ "$?" -ne 0 ]]; then
         echo "3 download attempts for ${url} failed"
     fi
@@ -72,7 +74,7 @@ if [[ ! -d "${LIVY_HOME}" ]]; then
         # download livy from archive if not cached
         echo "${LIVY_VERSION} being downloaded from archives"
         STARTTIME=`date +%s`
-        download_with_retry "https://dist.apache.org/repos/dist/release/incubator/livy/${LIVY_VERSION}/${LIVY_ARCHIVE}.zip"
+        download_with_retry "https://www.apache.org/dyn/closer.lua/incubator/livy/${LIVY_VERSION}/${LIVY_ARCHIVE}.zip?action=download" "${LIVY_ARCHIVE}.zip"
         ENDTIME=`date +%s`
         DOWNLOADTIME="$((ENDTIME-STARTTIME))"
     fi
diff --git a/testing/downloadSpark.sh b/testing/downloadSpark.sh
index 9c19e82bbc..118097b000 100755
--- a/testing/downloadSpark.sh
+++ b/testing/downloadSpark.sh
@@ -38,12 +38,14 @@ ZEPPELIN_HOME="$(cd "${FWDIR}/.."; pwd)"
 #   None
 # Arguments:
 #   url - source URL
+#   file - output filename
 # Returns:
 #   None
 #######################################
 download_with_retry() {
     local url="$1"
-    wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3 "${url}"
+    local file="${2:-$(basename $url)}"
+    wget --retry-connrefused --waitretry=1 --read-timeout=20 --timeout=15 -t 3 --output-document "${file}" "${url}"
     if [[ "$?" -ne 0 ]]; then
         echo "3 download attempts for ${url} failed"
     fi
@@ -65,8 +67,8 @@ if [[ ! -d "${SPARK_HOME}" ]]; then
         # download spark from archive if not cached
         echo "${SPARK_VERSION} being downloaded from archives"
         STARTTIME=`date +%s`
-        #timeout -s KILL "${MAX_DOWNLOAD_TIME_SEC}" wget "http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz"
-        download_with_retry "http://archive.apache.org/dist/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz"
+        #timeout -s KILL "${MAX_DOWNLOAD_TIME_SEC}" wget -O "${SPARK_ARCHIVE}.tgz" "https://www.apache.org/dyn/closer.lua/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz?action=download"
+        download_with_retry "https://www.apache.org/dyn/closer.lua/spark/spark-${SPARK_VERSION}/${SPARK_ARCHIVE}.tgz?action=download" "${SPARK_ARCHIVE}.tgz"
         ENDTIME=`date +%s`
         DOWNLOADTIME="$((ENDTIME-STARTTIME))"
     fi
