This is an automated email from the ASF dual-hosted git repository.

hellostephen pushed a commit to branch branch-2.1
in repository https://gitbox.apache.org/repos/asf/doris.git


The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new eddea8b309d [opt](hive docker)Parallel put hive data (#46571) (#46682)
eddea8b309d is described below

commit eddea8b309d706f66f802e9e171fd5d685a7f22b
Author: Thearas <gaozif...@selectdb.com>
AuthorDate: Thu Jan 9 14:08:35 2025 +0800

    [opt](hive docker)Parallel put hive data (#46571) (#46682)
    
    Problem Summary:
    Put the `tpch1.db`, `paimon1` and `tvf_data` hive data in parallel,
    reducing the time cost from 22m to 16m on a 16-core machine.
    
    Change-Id: Ib75c57d397ce1f96d5108d4b570bcb215f31d421
---
 .../docker-compose/hive/scripts/hive-metastore.sh  | 49 ++++++++++++++--------
 1 file changed, 32 insertions(+), 17 deletions(-)

diff --git a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
index 7ff6bc4c62f..6681a513066 100755
--- a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
+++ b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
@@ -18,6 +18,8 @@
 
 set -e -x
 
+parallel=$(getconf _NPROCESSORS_ONLN)
+
 nohup /opt/hive/bin/hive --service metastore &
 
 # wait metastore start
@@ -37,7 +39,7 @@ done
 touch "${lockfile1}"
 
 DATA_DIR="/mnt/scripts/data/"
-find "${DATA_DIR}" -type f -name "run.sh" -print0 | xargs -0 -n 1 -P 10 -I {} sh -c '
+find "${DATA_DIR}" -type f -name "run.sh" -print0 | xargs -0 -n 1 -P "${parallel}" -I {} sh -c '
     START_TIME=$(date +%s)
     chmod +x "{}" && "{}"
     END_TIME=$(date +%s)
@@ -92,45 +94,58 @@ fi
 rm -f "${lockfile2}"
 
 # put data file
+hadoop_put_pids=()
+hadoop fs -mkdir -p /user/doris/
+
+
 ## put tpch1
 if [[ -z "$(ls /mnt/scripts/tpch1.db)" ]]; then
     echo "tpch1.db does not exist"
     exit 1
 fi
-hadoop fs -mkdir -p /user/doris/
-hadoop fs -put /mnt/scripts/tpch1.db /user/doris/
-if [[ -z "$(hadoop fs -ls /user/doris/tpch1.db)" ]]; then
-    echo "tpch1.db put failed"
-    exit 1
-fi
+hadoop fs -copyFromLocal -f /mnt/scripts/tpch1.db /user/doris/ &
+hadoop_put_pids+=($!)
 
 ## put paimon1
 if [[ -z "$(ls /mnt/scripts/paimon1)" ]]; then
     echo "paimon1 does not exist"
     exit 1
 fi
-hadoop fs -put /mnt/scripts/paimon1 /user/doris/
-if [[ -z "$(hadoop fs -ls /user/doris/paimon1)" ]]; then
-    echo "paimon1 put failed"
-    exit 1
-fi
+hadoop fs -copyFromLocal -f /mnt/scripts/paimon1 /user/doris/ &
+hadoop_put_pids+=($!)
 
 ## put tvf_data
 if [[ -z "$(ls /mnt/scripts/tvf_data)" ]]; then
     echo "tvf_data does not exist"
     exit 1
 fi
-hadoop fs -put /mnt/scripts/tvf_data /user/doris/
+hadoop fs -copyFromLocal -f /mnt/scripts/tvf_data /user/doris/ &
+hadoop_put_pids+=($!)
+
+## put other preinstalled data
+hadoop fs -copyFromLocal -f /mnt/scripts/preinstalled_data /user/doris/ &
+hadoop_put_pids+=($!)
+
+
+# wait put finish
+set +e
+wait "${hadoop_put_pids[@]}"
+set -e
+if [[ -z "$(hadoop fs -ls /user/doris/paimon1)" ]]; then
+    echo "paimon1 put failed"
+    exit 1
+fi
+if [[ -z "$(hadoop fs -ls /user/doris/tpch1.db)" ]]; then
+    echo "tpch1.db put failed"
+    exit 1
+fi
 if [[ -z "$(hadoop fs -ls /user/doris/tvf_data)" ]]; then
     echo "tvf_data put failed"
     exit 1
 fi
 
-## put other preinstalled data
-hadoop fs -put /mnt/scripts/preinstalled_data /user/doris/
-
 # create tables
-ls /mnt/scripts/create_preinstalled_scripts/*.hql | xargs -n 1 -P 10 -I {} bash -c '
+ls /mnt/scripts/create_preinstalled_scripts/*.hql | xargs -n 1 -P "${parallel}" -I {} bash -c '
     START_TIME=$(date +%s)
     hive -f {}
     END_TIME=$(date +%s)


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org

Reply via email to