This is an automated email from the ASF dual-hosted git repository.

hellostephen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/master by this push:
     new d4648afb544 [opt](hive docker)Parallel put hive data (#46571)
d4648afb544 is described below

commit d4648afb5445360bdcb6148255eac41cd267429a
Author: Thearas <gaozif...@selectdb.com>
AuthorDate: Wed Jan 8 22:18:37 2025 +0800

    [opt](hive docker)Parallel put hive data (#46571)
    
    ### What problem does this PR solve?
    
    Problem Summary:
    Parallel put `tpch1.db`, `paimon1` and `tvf_data` hive data. Reduce the
    time cost from 22m to 16m on 16C machine.
---
 .../docker-compose/hive/scripts/hive-metastore.sh  | 49 ++++++++++++++--------
 docker/thirdparties/run-thirdparties-docker.sh     |  2 +-
 2 files changed, 33 insertions(+), 18 deletions(-)

diff --git a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
index 7ff6bc4c62f..6681a513066 100755
--- a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
+++ b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
@@ -18,6 +18,8 @@
 
 set -e -x
 
+parallel=$(getconf _NPROCESSORS_ONLN)
+
 nohup /opt/hive/bin/hive --service metastore &
 
 # wait metastore start
@@ -37,7 +39,7 @@ done
 touch "${lockfile1}"
 
 DATA_DIR="/mnt/scripts/data/"
-find "${DATA_DIR}" -type f -name "run.sh" -print0 | xargs -0 -n 1 -P 10 -I {} sh -c '
+find "${DATA_DIR}" -type f -name "run.sh" -print0 | xargs -0 -n 1 -P "${parallel}" -I {} sh -c '
     START_TIME=$(date +%s)
     chmod +x "{}" && "{}"
     END_TIME=$(date +%s)
@@ -92,45 +94,58 @@ fi
 rm -f "${lockfile2}"
 
 # put data file
+hadoop_put_pids=()
+hadoop fs -mkdir -p /user/doris/
+
+
 ## put tpch1
 if [[ -z "$(ls /mnt/scripts/tpch1.db)" ]]; then
     echo "tpch1.db does not exist"
     exit 1
 fi
-hadoop fs -mkdir -p /user/doris/
-hadoop fs -put /mnt/scripts/tpch1.db /user/doris/
-if [[ -z "$(hadoop fs -ls /user/doris/tpch1.db)" ]]; then
-    echo "tpch1.db put failed"
-    exit 1
-fi
+hadoop fs -copyFromLocal -f /mnt/scripts/tpch1.db /user/doris/ &
+hadoop_put_pids+=($!)
 
 ## put paimon1
 if [[ -z "$(ls /mnt/scripts/paimon1)" ]]; then
     echo "paimon1 does not exist"
     exit 1
 fi
-hadoop fs -put /mnt/scripts/paimon1 /user/doris/
-if [[ -z "$(hadoop fs -ls /user/doris/paimon1)" ]]; then
-    echo "paimon1 put failed"
-    exit 1
-fi
+hadoop fs -copyFromLocal -f /mnt/scripts/paimon1 /user/doris/ &
+hadoop_put_pids+=($!)
 
 ## put tvf_data
 if [[ -z "$(ls /mnt/scripts/tvf_data)" ]]; then
     echo "tvf_data does not exist"
     exit 1
 fi
-hadoop fs -put /mnt/scripts/tvf_data /user/doris/
+hadoop fs -copyFromLocal -f /mnt/scripts/tvf_data /user/doris/ &
+hadoop_put_pids+=($!)
+
+## put other preinstalled data
+hadoop fs -copyFromLocal -f /mnt/scripts/preinstalled_data /user/doris/ &
+hadoop_put_pids+=($!)
+
+
+# wait put finish
+set +e
+wait "${hadoop_put_pids[@]}"
+set -e
+if [[ -z "$(hadoop fs -ls /user/doris/paimon1)" ]]; then
+    echo "paimon1 put failed"
+    exit 1
+fi
+if [[ -z "$(hadoop fs -ls /user/doris/tpch1.db)" ]]; then
+    echo "tpch1.db put failed"
+    exit 1
+fi
 if [[ -z "$(hadoop fs -ls /user/doris/tvf_data)" ]]; then
     echo "tvf_data put failed"
     exit 1
 fi
 
-## put other preinstalled data
-hadoop fs -put /mnt/scripts/preinstalled_data /user/doris/
-
 # create tables
-ls /mnt/scripts/create_preinstalled_scripts/*.hql | xargs -n 1 -P 10 -I {} bash -c '
+ls /mnt/scripts/create_preinstalled_scripts/*.hql | xargs -n 1 -P "${parallel}" -I {} bash -c '
     START_TIME=$(date +%s)
     hive -f {}
     END_TIME=$(date +%s)
diff --git a/docker/thirdparties/run-thirdparties-docker.sh b/docker/thirdparties/run-thirdparties-docker.sh
index d00fdcea3aa..16c8b9e7d6b 100755
--- a/docker/thirdparties/run-thirdparties-docker.sh
+++ b/docker/thirdparties/run-thirdparties-docker.sh
@@ -614,7 +614,7 @@ start_minio() {
     fi
 }
 
-echo "starting dockers in parrallel"
+echo "starting dockers in parallel"
 
 declare -A pids
 


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to