From 71cd53a3be7fc101049f2ca91bd78dd5793769d8 Mon Sep 17 00:00:00 2001
From: hysunflower
Date: Wed, 12 Jan 2022 03:09:37 +0000
Subject: [PATCH 1/3] add gan-fomm, test=document_fix

---
 OtherFrame/gan/PyTorch/fomm/README.md         | 19 ++++++---------
 OtherFrame/gan/PyTorch/fomm/run_PyTorch.sh    | 24 ++++++++++---------
 .../gan/PyTorch/fomm/scripts/PrepareEnv.sh    |  9 ++-----
 .../gan/PyTorch/fomm/scripts/analysis_log.py  |  2 +-
 .../gan/PyTorch/fomm/scripts/run_benchmark.sh |  9 +++----
 OtherFrame/scripts/auto_run.sh                |  9 ++++++-
 6 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/OtherFrame/gan/PyTorch/fomm/README.md b/OtherFrame/gan/PyTorch/fomm/README.md
index 4a004361b4..6f689d208c 100644
--- a/OtherFrame/gan/PyTorch/fomm/README.md
+++ b/OtherFrame/gan/PyTorch/fomm/README.md
@@ -40,20 +40,15 @@ bash run_PyTorch.sh;     # create the container and test the model in this standard environment
 ImageName="registry.baidubce.com/paddlepaddle/paddle:2.1.2-gpu-cuda10.2-cudnn7";
 docker pull ${ImageName}
 
-#<<<<<<< gan_benchmark
-#run_cmd="cd /workspace;
-#        cp /workspace/scripts/PrepareEnv.sh ./;
-#        bash PrepareEnv.sh;
-#        cd /workspace/first-order-model/;
 run_cmd="cp /workspace/scripts/PrepareEnv.sh ./;
         bash PrepareEnv.sh;
         cd /workspace/models/fomm;
         cp /workspace/scripts/run_benchmark.sh ./;
         cp /workspace/scripts/analysis_log.py ./;
-        CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh fomm_sp_bs8 sp fp32 8 300 4;
-        CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh fomm_sp_bs16 sp fp32 16 300 4;
-        CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh fomm_mp_bs32 mp fp32 8 300 4;
-        CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh fomm_mp_bs64 mp fp32 16 300 4;
+        CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh FOMM_sp_bs8 sp fp32 8 300 4;
+        CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh FOMM_sp_bs16 sp fp32 16 300 4;
+        CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh FOMM_mp_bs32 mp fp32 8 300 4;
+        CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh FOMM_mp_bs64 mp fp32 16 300 4;
         "
 
 nvidia-docker run --name test_torch_gan -i \
@@ -67,12 +62,12 @@ nvidia-docker rm test_torch_gan
 
 ## Output
 
-After the run finishes, files with the model's training performance data, e.g. `fomm_sp_bs8_fp32_1_speed`, are produced in the current directory; their contents look like the following.
+After the run finishes, files with the model's training performance data, e.g. `FOMM_sp_bs8_fp32_1_speed`, are produced in the current directory; their contents look like the following.
 
 ```bash
 {
-"log_file": "/workspace/models/fomm/fomm_sp_bs8_fp32_1", \    # log directory; see PrepareEnv.sh for the naming convention
-"model_name": "fomm_sp_bs8", \    # model case name; convention: repoName_modelName_bs${bs_item}_${fp_item}
+"log_file": "/workspace/models/fomm/FOMM_sp_bs8_fp32_1", \    # log directory; see PrepareEnv.sh for the naming convention
+"model_name": "FOMM_sp_bs8", \    # model case name; convention: repoName_modelName_bs${bs_item}_${fp_item}
 "mission_name": "图像生成", \    # task name of this model case; see scripts/config.ini
 "direction_id": 0, \    # direction id of this model case, 0:CV|1:NLP|2:Rec; see benchmark/scripts/config.ini
 "run_mode": "sp", \    # single GPU: sp | multi GPU: mp
diff --git a/OtherFrame/gan/PyTorch/fomm/run_PyTorch.sh b/OtherFrame/gan/PyTorch/fomm/run_PyTorch.sh
index 3a23ccd126..f6c23bf01b 100644
--- a/OtherFrame/gan/PyTorch/fomm/run_PyTorch.sh
+++ b/OtherFrame/gan/PyTorch/fomm/run_PyTorch.sh
@@ -3,27 +3,29 @@
 ImageName="registry.baidubce.com/paddlepaddle/paddle:2.1.2-gpu-cuda10.2-cudnn7";
 docker pull ${ImageName}
 
-#<<<<<<< gan_benchmark
-#run_cmd="cd /workspace/;
-#        cp /workspace/scripts/PrepareEnv.sh ./;
-#        bash PrepareEnv.sh;
-#        cd /workspace/first-order-model;
-
 run_cmd="cp /workspace/scripts/PrepareEnv.sh ./;
         bash PrepareEnv.sh;
         cd /workspace/models/fomm;
         cp /workspace/scripts/run_benchmark.sh ./;
         cp /workspace/scripts/analysis_log.py ./;
-        CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh fomm sp fp32 8 300 4;
-        CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh fomm sp fp32 16 300 4;
-        CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh fomm mp fp32 8 300 4;
-        CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh fomm mp fp32 16 300 4;
+        sed -i '/set\ -xe/d' benchmark/run_benchmark.sh
+        CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh FOMM sp fp32 8 300 4;
+        CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh FOMM sp fp32 16 300 4;
+        CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh FOMM mp fp32 8 300 4;
+        CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh FOMM mp fp32 16 300 4;
         "
 
-nvidia-docker run --name test_torch_gan -it \
+nvidia-docker run --name test_torch_gan -i \
     --net=host \
     --shm-size=128g \
     -v $PWD:/workspace \
+    -v /ssd3:/ssd3 \
+    -v /ssd2:/ssd2 \
+    -e "ALL_PATH=${all_path}" \
+    -e "BENCHMARK_ROOT=/workspace" \
+    -e "http_proxy=${http_proxy}" \
+    -e "https_proxy=${http_proxy}" \
+    -e "no_proxy=bcebos.com" \
     ${ImageName} /bin/bash -c "${run_cmd}"
 
 nvidia-docker stop test_torch_gan
diff --git a/OtherFrame/gan/PyTorch/fomm/scripts/PrepareEnv.sh b/OtherFrame/gan/PyTorch/fomm/scripts/PrepareEnv.sh
index 7bbd89be4e..c7afb4cb27 100644
--- a/OtherFrame/gan/PyTorch/fomm/scripts/PrepareEnv.sh
+++ b/OtherFrame/gan/PyTorch/fomm/scripts/PrepareEnv.sh
@@ -5,10 +5,7 @@ echo "*******prepare benchmark***********"
 
 ################################# create the log directories, e.g.:
 export BENCHMARK_ROOT=/workspace
-log_date=`date "+%Y.%m%d.%H%M%S"`
-frame=pytorch1.0.0
-cuda_version=10.2
-save_log_dir=${BENCHMARK_ROOT}/logs/${frame}_${log_date}_${cuda_version}/
+save_log_dir=${BENCHMARK_ROOT}/logs/
 
 if [[ -d ${save_log_dir} ]]; then
 rm -rf ${save_log_dir}
@@ -31,9 +28,7 @@ export PATH=/workspace/run_env:${PATH}
 pip install -U pip
 echo `pip --version`
 
-git clone https://github.com/lzzyzlbb/first-order-model
-git checkout add_log
-cd first-order-model
+cd /workspace/models/fomm
 pip install -r requirements.txt
 
 imageio_download_bin ffmpeg
diff --git a/OtherFrame/gan/PyTorch/fomm/scripts/analysis_log.py b/OtherFrame/gan/PyTorch/fomm/scripts/analysis_log.py
index 8c807a5ed9..b5bf31c620 100644
--- a/OtherFrame/gan/PyTorch/fomm/scripts/analysis_log.py
+++ b/OtherFrame/gan/PyTorch/fomm/scripts/analysis_log.py
@@ -28,7 +28,7 @@ def analyze(model_name, log_file, res_log_file):
     total_time = 0
     for i in range(skip_num, len(time_res)):
         total_time += float(time_res[i])
-    ips = total_time / (len(time_res) - skip_num)
+    ips = round(total_time / (len(time_res) - skip_num), 3)
     info = {"log_file": log_file, "model_name": model_name,
             "mission_name": "图像生成", "direction_id": 0,
             "run_mode": run_mode, "index": 1, "gpu_num": gpu_num,
diff --git a/OtherFrame/gan/PyTorch/fomm/scripts/run_benchmark.sh b/OtherFrame/gan/PyTorch/fomm/scripts/run_benchmark.sh
index 2111805539..b99a8ced3a 100644
--- a/OtherFrame/gan/PyTorch/fomm/scripts/run_benchmark.sh
+++ b/OtherFrame/gan/PyTorch/fomm/scripts/run_benchmark.sh
@@ -3,10 +3,10 @@ set -xe
 
 # Test training benchmark for a model.
-# Usage: CUDA_VISIBLE_DEVICES=xxx bash run_benchmark.sh ${model_name} ${run_mode} ${fp_item} ${bs_item} ${max_iter} ${num_workers}
+# Usage: CUDA_VISIBLE_DEVICES=xxx bash run_benchmark.sh ${model_item} ${run_mode} ${fp_item} ${bs_item} ${max_iter} ${num_workers}
 
 function _set_params(){
-    model_name=${1:-"model_name"}
+    model_item=${1:-"model_item"}
     run_mode=${2:-"sp"}         # sp or mp
     fp_item=${3:-"fp32"}        # fp32 or fp16
     batch_size=${4:-"2"}
@@ -17,8 +17,9 @@ function _set_params(){
     device=${CUDA_VISIBLE_DEVICES//,/ }
     arr=(${device})
     num_gpu_devices=${#arr[*]}
-    log_file=${run_log_path}/${model_name}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}
-    res_log_file=${run_log_path}/${model_name}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}_speed
+    log_file=${run_log_path}/${model_item}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}
+    res_log_file=${run_log_path}/${model_item}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}_speed
+    model_name=${model_item}_bs${batch_size}_${fp_item}
 }
 
 function _analysis_log(){
diff --git a/OtherFrame/scripts/auto_run.sh b/OtherFrame/scripts/auto_run.sh
index 1f1ab0f235..e171171736 100644
--- a/OtherFrame/scripts/auto_run.sh
+++ b/OtherFrame/scripts/auto_run.sh
@@ -57,7 +57,7 @@ function set_env(){
 
 
 
-cur_torch_list=(clas_model_torch seg_model_torch speech_model_torch detec_torch_jde-fairmot detec_torch_fast)
+cur_torch_list=(clas_model_torch seg_model_torch speech_model_torch detec_torch_jde-fairmot detec_torch_fast gan_torch_fomm)
 cur_mxnet_list=()
 cur_tensorflow_list=()
 
@@ -113,6 +113,13 @@ detec_torch_fast(){
     cp models/SOLO/*fp32_8 ${TRAIN_LOG_DIR}
 }
 
+gan_torch_fomm(){
+    cur_model_path=${ROOT_DIR}/gan/PyTorch/fomm
+    cd ${cur_model_path}
+    bash run_PyTorch.sh
+    cp ${cur_model_path}/logs/train_log/* ${TRAIN_LOG_DIR}
+    cp ${cur_model_path}/*speed ${LOG_PATH_INDEX_DIR}
+}
 set_env
 for model_name in ${cur_torch_list[@]}
 do

From 100aad15e908479cda79884d835e2046a93553a8 Mon Sep 17 00:00:00 2001
From: hysunflower
Date: Wed, 12 Jan 2022 04:25:15 +0000
Subject: [PATCH 2/3] add gan-fomm, test=document_fix

---
 OtherFrame/gan/PyTorch/fomm/scripts/PrepareEnv.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/OtherFrame/gan/PyTorch/fomm/scripts/PrepareEnv.sh b/OtherFrame/gan/PyTorch/fomm/scripts/PrepareEnv.sh
index c7afb4cb27..7c936ea1d9 100644
--- a/OtherFrame/gan/PyTorch/fomm/scripts/PrepareEnv.sh
+++ b/OtherFrame/gan/PyTorch/fomm/scripts/PrepareEnv.sh
@@ -29,6 +29,7 @@ pip install -U pip
 echo `pip --version`
 
 cd /workspace/models/fomm
+git checkout add_log
 pip install -r requirements.txt
 
 imageio_download_bin ffmpeg

From a561f6e42af01959813bbc945e31bc847e76fe00 Mon Sep 17 00:00:00 2001
From: hysunflower
Date: Wed, 12 Jan 2022 06:53:27 +0000
Subject: [PATCH 3/3] add edvr,basicvsr,esrgan, test=document_fix

---
 .../gan/PyTorch/mmedting/run_PyTorch.sh       | 33 +++++++++++--------
 .../PyTorch/mmedting/scripts/PrepareEnv.sh    |  5 +--
 .../PyTorch/mmedting/scripts/run_benchmark.sh | 22 +++++++------
 OtherFrame/scripts/auto_run.sh                | 13 ++++++--
 4 files changed, 44 insertions(+), 29 deletions(-)

diff --git a/OtherFrame/gan/PyTorch/mmedting/run_PyTorch.sh b/OtherFrame/gan/PyTorch/mmedting/run_PyTorch.sh
index 41b5b4c1f2..6b935d2c69 100644
--- a/OtherFrame/gan/PyTorch/mmedting/run_PyTorch.sh
+++ b/OtherFrame/gan/PyTorch/mmedting/run_PyTorch.sh
@@ -9,25 +9,32 @@ run_cmd="cp /workspace/scripts/PrepareEnv.sh ./;
         cp -r /workspace/mmedi_benchmark_configs ./;
         cp /workspace/scripts/run_benchmark.sh ./;
         cp /workspace/scripts/analysis_log.py ./;
-        PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh esrgan_sp_bs32 sp fp32 32 300 4;
-        PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh esrgan_sp_bs64 sp fp32 64 300 4;
-        PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh esrgan_mp_bs32 mp fp32 32 300 4;
-        PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh esrgan_mp_bs64 mp fp32 64 300 4;
-        PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh edvr_sp_bs4 sp fp32 4 300 3;
-        PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh edvr_sp_bs64 sp fp32 64 300 3;
-        PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh edvr_mp_bs4 mp fp32 4 300 3;
-        PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh edvr_mp_bs64 mp fp32 64 300 3;
-        PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh basicvsr_sp_bs2 sp fp32 2 300 4;
-        PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh basicvsr_sp_bs4 sp fp32 4 300 4;
-        PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh basicvsr_mp_bs2 mp fp32 2 300 4;
-        PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh basicvsr_mp_bs4 mp fp32 4 300 4;
+        sed -i '/set\ -xe/d' run_benchmark.sh
+        PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh esrgan_bs32_fp32 sp fp32 32 300 4;
+        PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh esrgan_bs32_fp32 mp fp32 32 300 4;
+        PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh edvr_bs4_fp32 sp fp32 4 300 3;
+        PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh edvr_bs4_fp32 mp fp32 4 300 3;
+        PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh basicvsr_bs2_fp32 sp fp32 2 300 4;
+        PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh basicvsr_bs4_fp32 sp fp32 4 300 4;
+        PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh basicvsr_bs2_fp32 mp fp32 2 300 4;
+        PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh basicvsr_bs4_fp32 mp fp32 4 300 4;
         "
+        #PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh esrgan_bs64_fp32 sp fp32 64 300 4;
+        #PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh esrgan_bs64_fp32 mp fp32 64 300 4;
+        #PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh edvr_bs64_fp32 sp fp32 64 300 3;
+        #PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh edvr_bs64_fp32 mp fp32 64 300 3;
 
 nvidia-docker run --name test_torch_gan -i \
     --net=host \
     --shm-size=128g \
     -v $PWD:/workspace \
+    -v /ssd2:/ssd2 \
+    -e "ALL_PATH=${all_path}" \
+    -e "BENCHMARK_ROOT=/workspace" \
+    -e "http_proxy=${http_proxy}" \
+    -e "https_proxy=${http_proxy}" \
+    -e "no_proxy=bcebos.com" \
     ${ImageName} /bin/bash -c "${run_cmd}"
 
 nvidia-docker stop test_torch_gan
-nvidia-docker rm test_torch_gan
\ No newline at end of file
+nvidia-docker rm test_torch_gan
diff --git a/OtherFrame/gan/PyTorch/mmedting/scripts/PrepareEnv.sh b/OtherFrame/gan/PyTorch/mmedting/scripts/PrepareEnv.sh
index 3603b1f58b..6e0ab5dffb 100644
--- a/OtherFrame/gan/PyTorch/mmedting/scripts/PrepareEnv.sh
+++ b/OtherFrame/gan/PyTorch/mmedting/scripts/PrepareEnv.sh
@@ -5,10 +5,7 @@ echo "*******prepare benchmark***********"
 
 ################################# create the log directories, e.g.:
 export BENCHMARK_ROOT=/workspace
-log_date=`date "+%Y.%m%d.%H%M%S"`
-frame=pytorch1.9.0
-cuda_version=10.2
-save_log_dir=${BENCHMARK_ROOT}/logs/${frame}_${log_date}_${cuda_version}/
+save_log_dir=${BENCHMARK_ROOT}/logs/
 
 if [[ -d ${save_log_dir} ]]; then
 rm -rf ${save_log_dir}
diff --git a/OtherFrame/gan/PyTorch/mmedting/scripts/run_benchmark.sh b/OtherFrame/gan/PyTorch/mmedting/scripts/run_benchmark.sh
index a14e3c788b..17532cb301 100644
--- a/OtherFrame/gan/PyTorch/mmedting/scripts/run_benchmark.sh
+++ b/OtherFrame/gan/PyTorch/mmedting/scripts/run_benchmark.sh
@@ -17,8 +17,8 @@ function _set_params(){
     device=${CUDA_VISIBLE_DEVICES//,/ }
     arr=(${device})
     num_gpu_devices=${#arr[*]}
-    log_file=${run_log_path}/${model_name}_${fp_item}_${num_gpu_devices}
-    res_log_file=${run_log_path}/${model_name}_${fp_item}_${num_gpu_devices}_speed
+    log_file=${run_log_path}/${model_name}_${num_gpu_devices}_${run_mode}
+    res_log_file=${run_log_path}/${model_name}_${num_gpu_devices}_${run_mode}_speed
 }
 
 function _analysis_log(){
@@ -30,18 +30,20 @@ function _analysis_log(){
 function _train(){
     echo "Train ${model_name} on ${num_gpu_devices} GPUs"
     echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"
-
-    train_config="mmedi_benchmark_configs/${model_name}.py"
+
+
+    train_config="mmedi_benchmark_configs/${model_name%%_*}_${run_mode}_bs${batch_size}.py"
     train_options="--no-validate "
 
     case ${run_mode} in
     sp) train_cmd="./tools/dist_train.sh ${train_config} 1 ${train_options}" ;;
     mp)
-        case ${model_name} in
-            basicvsr_mp_bs2|basicvsr_mp_bs4) train_cmd="./tools/dist_train.sh ${train_config} 4 ${train_options}" ;;
-            *) train_cmd="./tools/dist_train.sh ${train_config} 8 ${train_options}"
-        esac
-        ;;
+        if [ ${model_name} = "basicvsr_bs2_fp32" ] || [ ${model_name} = "basicvsr_bs4_fp32" ]; then
+            train_cmd="./tools/dist_train.sh ${train_config} 4 ${train_options}"
+        else
+            train_cmd="./tools/dist_train.sh ${train_config} 8 ${train_options}"
+        fi
+        ;;
     *) echo "choose run_mode(sp or mp)"; exit 1;
     esac
@@ -64,4 +66,4 @@ function _train(){
 }
 
 _set_params $@
-_train
\ No newline at end of file
+_train
diff --git a/OtherFrame/scripts/auto_run.sh b/OtherFrame/scripts/auto_run.sh
index e171171736..2fa380b025 100644
--- a/OtherFrame/scripts/auto_run.sh
+++ b/OtherFrame/scripts/auto_run.sh
@@ -57,7 +57,7 @@ function set_env(){
 
 
 
-cur_torch_list=(clas_model_torch seg_model_torch speech_model_torch detec_torch_jde-fairmot detec_torch_fast gan_torch_fomm)
+cur_torch_list=(clas_model_torch seg_model_torch speech_model_torch detec_torch_jde-fairmot detec_torch_fast gan_torch_models)
 cur_mxnet_list=()
 cur_tensorflow_list=()
 
@@ -113,12 +113,21 @@ detec_torch_fast(){
     cp models/SOLO/*fp32_8 ${TRAIN_LOG_DIR}
 }
 
-gan_torch_fomm(){
+gan_torch_models(){
+    # FOMM
     cur_model_path=${ROOT_DIR}/gan/PyTorch/fomm
     cd ${cur_model_path}
    bash run_PyTorch.sh
     cp ${cur_model_path}/logs/train_log/* ${TRAIN_LOG_DIR}
     cp ${cur_model_path}/*speed ${LOG_PATH_INDEX_DIR}
+
+    # edvr basicvsr esrgan
+    cur_model_path=${ROOT_DIR}/gan/PyTorch/mmedting
+    cd ${cur_model_path}
+    bash run_PyTorch.sh
+    cp ${cur_model_path}/*speed ${LOG_PATH_INDEX_DIR}
+    cp ${cur_model_path}/*sp ${TRAIN_LOG_DIR}
+    cp ${cur_model_path}/*mp ${TRAIN_LOG_DIR}
 }
 set_env
 for model_name in ${cur_torch_list[@]}