This is an automated email from the ASF dual-hosted git repository. hellostephen pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push:
     new eddea8b309d [opt](hive docker)Parallel put hive data (#46571) (#46682)
eddea8b309d is described below

commit eddea8b309d706f66f802e9e171fd5d685a7f22b
Author: Thearas <gaozif...@selectdb.com>
AuthorDate: Thu Jan 9 14:08:35 2025 +0800

    [opt](hive docker)Parallel put hive data (#46571) (#46682)

    Problem Summary:

    Parallel put `tpch1.db`, `paimon1` and `tvf_data` hive data. Reduce the time cost from 22m to 16m on 16C machine.

    Change-Id: Ib75c57d397ce1f96d5108d4b570bcb215f31d421
---
 .../docker-compose/hive/scripts/hive-metastore.sh | 49 ++++++++++++++--------
 1 file changed, 32 insertions(+), 17 deletions(-)

diff --git a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
index 7ff6bc4c62f..6681a513066 100755
--- a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
+++ b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh
@@ -18,6 +18,8 @@
 
 set -e -x
 
+parallel=$(getconf _NPROCESSORS_ONLN)
+
 nohup /opt/hive/bin/hive --service metastore &
 
 # wait metastore start
@@ -37,7 +39,7 @@ done
 touch "${lockfile1}"
 
 DATA_DIR="/mnt/scripts/data/"
-find "${DATA_DIR}" -type f -name "run.sh" -print0 | xargs -0 -n 1 -P 10 -I {} sh -c '
+find "${DATA_DIR}" -type f -name "run.sh" -print0 | xargs -0 -n 1 -P "${parallel}" -I {} sh -c '
     START_TIME=$(date +%s)
     chmod +x "{}" && "{}"
     END_TIME=$(date +%s)
@@ -92,45 +94,58 @@ fi
 rm -f "${lockfile2}"
 
 # put data file
+hadoop_put_pids=()
+hadoop fs -mkdir -p /user/doris/
+
+
 ## put tpch1
 if [[ -z "$(ls /mnt/scripts/tpch1.db)" ]]; then
     echo "tpch1.db does not exist"
     exit 1
 fi
-hadoop fs -mkdir -p /user/doris/
-hadoop fs -put /mnt/scripts/tpch1.db /user/doris/
-if [[ -z "$(hadoop fs -ls /user/doris/tpch1.db)" ]]; then
-    echo "tpch1.db put failed"
-    exit 1
-fi
+hadoop fs -copyFromLocal -f /mnt/scripts/tpch1.db /user/doris/ &
+hadoop_put_pids+=($!)
 
 ## put paimon1
 if [[ -z "$(ls /mnt/scripts/paimon1)" ]]; then
     echo "paimon1 does not exist"
     exit 1
 fi
-hadoop fs -put /mnt/scripts/paimon1 /user/doris/
-if [[ -z "$(hadoop fs -ls /user/doris/paimon1)" ]]; then
-    echo "paimon1 put failed"
-    exit 1
-fi
+hadoop fs -copyFromLocal -f /mnt/scripts/paimon1 /user/doris/ &
+hadoop_put_pids+=($!)
 
 ## put tvf_data
 if [[ -z "$(ls /mnt/scripts/tvf_data)" ]]; then
     echo "tvf_data does not exist"
     exit 1
 fi
-hadoop fs -put /mnt/scripts/tvf_data /user/doris/
+hadoop fs -copyFromLocal -f /mnt/scripts/tvf_data /user/doris/ &
+hadoop_put_pids+=($!)
+
+## put other preinstalled data
+hadoop fs -copyFromLocal -f /mnt/scripts/preinstalled_data /user/doris/ &
+hadoop_put_pids+=($!)
+
+
+# wait put finish
+set +e
+wait "${hadoop_put_pids[@]}"
+set -e
+if [[ -z "$(hadoop fs -ls /user/doris/paimon1)" ]]; then
+    echo "paimon1 put failed"
+    exit 1
+fi
+if [[ -z "$(hadoop fs -ls /user/doris/tpch1.db)" ]]; then
+    echo "tpch1.db put failed"
+    exit 1
+fi
 if [[ -z "$(hadoop fs -ls /user/doris/tvf_data)" ]]; then
     echo "tvf_data put failed"
     exit 1
 fi
 
-## put other preinstalled data
-hadoop fs -put /mnt/scripts/preinstalled_data /user/doris/
-
 # create tables
-ls /mnt/scripts/create_preinstalled_scripts/*.hql | xargs -n 1 -P 10 -I {} bash -c '
+ls /mnt/scripts/create_preinstalled_scripts/*.hql | xargs -n 1 -P "${parallel}" -I {} bash -c '
     START_TIME=$(date +%s)
     hive -f {}
     END_TIME=$(date +%s)

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org
For additional commands, e-mail: commits-h...@doris.apache.org