This is an automated email from the ASF dual-hosted git repository. hellostephen pushed a commit to branch master in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/master by this push: new d4648afb544 [opt](hive docker)Parallel put hive data (#46571) d4648afb544 is described below commit d4648afb5445360bdcb6148255eac41cd267429a Author: Thearas <gaozif...@selectdb.com> AuthorDate: Wed Jan 8 22:18:37 2025 +0800 [opt](hive docker)Parallel put hive data (#46571) ### What problem does this PR solve? Problem Summary: Parallel put `tpch1.db`, `paimon1` and `tvf_data` hive data. Reduce the time cost from 22m to 16m on 16C machine. --- .../docker-compose/hive/scripts/hive-metastore.sh | 49 ++++++++++++++-------- docker/thirdparties/run-thirdparties-docker.sh | 2 +- 2 files changed, 33 insertions(+), 18 deletions(-) diff --git a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh index 7ff6bc4c62f..6681a513066 100755 --- a/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh +++ b/docker/thirdparties/docker-compose/hive/scripts/hive-metastore.sh @@ -18,6 +18,8 @@ set -e -x +parallel=$(getconf _NPROCESSORS_ONLN) + nohup /opt/hive/bin/hive --service metastore & # wait metastore start @@ -37,7 +39,7 @@ done touch "${lockfile1}" DATA_DIR="/mnt/scripts/data/" -find "${DATA_DIR}" -type f -name "run.sh" -print0 | xargs -0 -n 1 -P 10 -I {} sh -c ' +find "${DATA_DIR}" -type f -name "run.sh" -print0 | xargs -0 -n 1 -P "${parallel}" -I {} sh -c ' START_TIME=$(date +%s) chmod +x "{}" && "{}" END_TIME=$(date +%s) @@ -92,45 +94,58 @@ fi rm -f "${lockfile2}" # put data file +hadoop_put_pids=() +hadoop fs -mkdir -p /user/doris/ + + ## put tpch1 if [[ -z "$(ls /mnt/scripts/tpch1.db)" ]]; then echo "tpch1.db does not exist" exit 1 fi -hadoop fs -mkdir -p /user/doris/ -hadoop fs -put /mnt/scripts/tpch1.db /user/doris/ -if [[ -z "$(hadoop fs -ls /user/doris/tpch1.db)" ]]; then - echo "tpch1.db put failed" - exit 1 -fi +hadoop fs -copyFromLocal -f /mnt/scripts/tpch1.db /user/doris/ & +hadoop_put_pids+=($!) 
## put paimon1 if [[ -z "$(ls /mnt/scripts/paimon1)" ]]; then echo "paimon1 does not exist" exit 1 fi -hadoop fs -put /mnt/scripts/paimon1 /user/doris/ -if [[ -z "$(hadoop fs -ls /user/doris/paimon1)" ]]; then - echo "paimon1 put failed" - exit 1 -fi +hadoop fs -copyFromLocal -f /mnt/scripts/paimon1 /user/doris/ & +hadoop_put_pids+=($!) ## put tvf_data if [[ -z "$(ls /mnt/scripts/tvf_data)" ]]; then echo "tvf_data does not exist" exit 1 fi -hadoop fs -put /mnt/scripts/tvf_data /user/doris/ +hadoop fs -copyFromLocal -f /mnt/scripts/tvf_data /user/doris/ & +hadoop_put_pids+=($!) + +## put other preinstalled data +hadoop fs -copyFromLocal -f /mnt/scripts/preinstalled_data /user/doris/ & +hadoop_put_pids+=($!) + + +# wait put finish +set +e +wait "${hadoop_put_pids[@]}" +set -e +if [[ -z "$(hadoop fs -ls /user/doris/paimon1)" ]]; then + echo "paimon1 put failed" + exit 1 +fi +if [[ -z "$(hadoop fs -ls /user/doris/tpch1.db)" ]]; then + echo "tpch1.db put failed" + exit 1 +fi if [[ -z "$(hadoop fs -ls /user/doris/tvf_data)" ]]; then echo "tvf_data put failed" exit 1 fi -## put other preinstalled data -hadoop fs -put /mnt/scripts/preinstalled_data /user/doris/ - # create tables -ls /mnt/scripts/create_preinstalled_scripts/*.hql | xargs -n 1 -P 10 -I {} bash -c ' +ls /mnt/scripts/create_preinstalled_scripts/*.hql | xargs -n 1 -P "${parallel}" -I {} bash -c ' START_TIME=$(date +%s) hive -f {} END_TIME=$(date +%s) diff --git a/docker/thirdparties/run-thirdparties-docker.sh b/docker/thirdparties/run-thirdparties-docker.sh index d00fdcea3aa..16c8b9e7d6b 100755 --- a/docker/thirdparties/run-thirdparties-docker.sh +++ b/docker/thirdparties/run-thirdparties-docker.sh @@ -614,7 +614,7 @@ start_minio() { fi } -echo "starting dockers in parrallel" +echo "starting dockers in parallel" declare -A pids --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: 
commits-help@doris.apache.org