【hold 】Test gan 0105 #1254

Open · wants to merge 3 commits into master

19 changes: 7 additions & 12 deletions OtherFrame/gan/PyTorch/fomm/README.md
@@ -40,20 +40,15 @@ bash run_PyTorch.sh; # create the container and test the model in this standard environment
ImageName="registry.baidubce.com/paddlepaddle/paddle:2.1.2-gpu-cuda10.2-cudnn7";
docker pull ${ImageName}

#<<<<<<< gan_benchmark
#run_cmd="cd /workspace;
# cp /workspace/scripts/PrepareEnv.sh ./;
# bash PrepareEnv.sh;
# cd /workspace/first-order-model/;
run_cmd="cp /workspace/scripts/PrepareEnv.sh ./;
bash PrepareEnv.sh;
cd /workspace/models/fomm;
cp /workspace/scripts/run_benchmark.sh ./;
cp /workspace/scripts/analysis_log.py ./;
CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh fomm_sp_bs8 sp fp32 8 300 4;
CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh fomm_sp_bs16 sp fp32 16 300 4;
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh fomm_mp_bs32 mp fp32 8 300 4;
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh fomm_mp_bs64 mp fp32 16 300 4;
CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh FOMM_sp_bs8 sp fp32 8 300 4;
CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh FOMM_sp_bs16 sp fp32 16 300 4;
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh FOMM_mp_bs32 mp fp32 8 300 4;
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh FOMM_mp_bs64 mp fp32 16 300 4;
"

nvidia-docker run --name test_torch_gan -i \
@@ -67,12 +62,12 @@ nvidia-docker rm test_torch_gan

## Output

After the run finishes, files containing the model's training performance data, such as `fomm_sp_bs8_fp32_1_speed`, are produced in the current directory; their contents look like the following.
After the run finishes, files containing the model's training performance data, such as `FOMM_sp_bs8_fp32_1_speed`, are produced in the current directory; their contents look like the following.

```bash
{
"log_file": "/workspace/models/fomm/fomm_sp_bs8_fp32_1", \ # log 目录,创建规范见PrepareEnv.sh
"model_name": "fomm_sp_bs8", \ # 模型case名,创建规范:repoName_模型名_bs${bs_item}_${fp_item}
"log_file": "/workspace/models/fomm/FOMM_sp_bs8_fp32_1", \ # log 目录,创建规范见PrepareEnv.sh
"model_name": "FOMM_sp_bs8", \ # 模型case名,创建规范:repoName_模型名_bs${bs_item}_${fp_item}
"mission_name": "图像生成", \ # 模型case所属任务名称,具体可参考scripts/config.ini
"direction_id": 0, \ # 模型case所属方向id,0:CV|1:NLP|2:Rec 具体可参考benchmark/scripts/config.ini
"run_mode": "sp", \ # 单卡:sp|多卡:mp
24 changes: 13 additions & 11 deletions OtherFrame/gan/PyTorch/fomm/run_PyTorch.sh
@@ -3,27 +3,29 @@
ImageName="registry.baidubce.com/paddlepaddle/paddle:2.1.2-gpu-cuda10.2-cudnn7";
docker pull ${ImageName}

#<<<<<<< gan_benchmark
#run_cmd="cd /workspace/;
# cp /workspace/scripts/PrepareEnv.sh ./;
# bash PrepareEnv.sh;
# cd /workspace/first-order-model;

run_cmd="cp /workspace/scripts/PrepareEnv.sh ./;
bash PrepareEnv.sh;
cd /workspace/models/fomm;
cp /workspace/scripts/run_benchmark.sh ./;
cp /workspace/scripts/analysis_log.py ./;
CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh fomm sp fp32 8 300 4;
CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh fomm sp fp32 16 300 4;
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh fomm mp fp32 8 300 4;
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh fomm mp fp32 16 300 4;
sed -i '/set\ -xe/d' benchmark/run_benchmark.sh
CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh FOMM sp fp32 8 300 4;
CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh FOMM sp fp32 16 300 4;
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh FOMM mp fp32 8 300 4;
CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh FOMM mp fp32 16 300 4;
"

nvidia-docker run --name test_torch_gan -it \
nvidia-docker run --name test_torch_gan -i \
--net=host \
--shm-size=128g \
-v $PWD:/workspace \
-v /ssd3:/ssd3 \
-v /ssd2:/ssd2 \
-e "ALL_PATH=${all_path}" \
-v "BENCHMARK_ROOT=/workspace" \
-e "http_proxy=${http_proxy}" \
-e "https_proxy=${http_proxy}" \
-e "no_proxy=bcebos.com" \
${ImageName} /bin/bash -c "${run_cmd}"

nvidia-docker stop test_torch_gan
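
For reference, the `sed -i '/set\ -xe/d'` line added above just deletes the `set -xe` line from the targeted benchmark script, so one failing step inside it no longer aborts the remaining cases. A minimal, self-contained illustration (the file name is made up):

```bash
# Illustration only: the sed pattern used above removes the `set -xe` line.
cat > demo_run_benchmark.sh <<'EOF'
#!/bin/bash
set -xe
echo "benchmark case"
EOF
sed -i '/set\ -xe/d' demo_run_benchmark.sh
cat demo_run_benchmark.sh   # `set -xe` is gone; errors no longer abort the script
```
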
8 changes: 2 additions & 6 deletions OtherFrame/gan/PyTorch/fomm/scripts/PrepareEnv.sh
@@ -5,10 +5,7 @@ echo "*******prepare benchmark***********"

################################# create some log directories, e.g.:
export BENCHMARK_ROOT=/workspace
log_date=`date "+%Y.%m%d.%H%M%S"`
frame=pytorch1.0.0
cuda_version=10.2
save_log_dir=${BENCHMARK_ROOT}/logs/${frame}_${log_date}_${cuda_version}/
save_log_dir=${BENCHMARK_ROOT}/logs/

if [[ -d ${save_log_dir} ]]; then
rm -rf ${save_log_dir}
@@ -31,9 +28,8 @@ export PATH=/workspace/run_env:${PATH}
pip install -U pip
echo `pip --version`

git clone https://github.com/lzzyzlbb/first-order-model
cd /workspace/models/fomm
git checkout add_log
cd first-order-model
pip install -r requirements.txt
imageio_download_bin ffmpeg

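
The PrepareEnv.sh change above drops the timestamped log directory in favour of a fixed `${BENCHMARK_ROOT}/logs/` path; with the guard shown in the hunk, logs from a previous run are discarded. A minimal sketch of the assumed reset pattern (the `train_log` subdirectory is an assumption, based on auto_run.sh later copying from `logs/train_log`):

```bash
# Sketch under stated assumptions; paths are illustrative.
export BENCHMARK_ROOT=/workspace
save_log_dir=${BENCHMARK_ROOT}/logs/

if [[ -d ${save_log_dir} ]]; then
    rm -rf ${save_log_dir}          # fixed path: previous logs are wiped on each run
fi
mkdir -p ${save_log_dir}/train_log  # assumed: auto_run.sh copies logs/train_log/* afterwards
```
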
2 changes: 1 addition & 1 deletion OtherFrame/gan/PyTorch/fomm/scripts/analysis_log.py
@@ -28,7 +28,7 @@ def analyze(model_name, log_file, res_log_file):
total_time = 0
for i in range(skip_num, len(time_res)):
total_time += float(time_res[i])
ips = total_time / (len(time_res) - skip_num)
ips = round(total_time / (len(time_res) - skip_num), 3)

info = {"log_file": log_file, "model_name": model_name, "mission_name": "图像生成",
"direction_id": 0, "run_mode": run_mode, "index": 1, "gpu_num": gpu_num,
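
The analysis_log.py change above only rounds the averaged per-iteration time to three decimals before it is written into the speed file. A rough bash/awk equivalent of that skip-and-average step, with made-up numbers:

```bash
# Skip the first `skip_num` timings, average the rest, round to 3 decimals.
skip_num=2
time_res="0.98 1.02 0.50 0.51 0.49 0.50"   # illustrative per-iteration times
echo ${time_res} | tr ' ' '\n' | \
    awk -v skip=${skip_num} 'NR > skip { sum += $1; n++ } END { printf "%.3f\n", sum / n }'
```
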
9 changes: 5 additions & 4 deletions OtherFrame/gan/PyTorch/fomm/scripts/run_benchmark.sh
@@ -3,10 +3,10 @@ set -xe

# Test training benchmark for a model.

# Usage: CUDA_VISIBLE_DEVICES=xxx bash run_benchmark.sh ${model_name} ${run_mode} ${fp_item} ${bs_item} ${max_iter} ${num_workers}
# Usage: CUDA_VISIBLE_DEVICES=xxx bash run_benchmark.sh ${model_item} ${run_mode} ${fp_item} ${bs_item} ${max_iter} ${num_workers}

function _set_params(){
model_name=${1:-"model_name"}
model_item=${1:-"model_item"}
run_mode=${2:-"sp"} # sp or mp
fp_item=${3:-"fp32"} # fp32 or fp16
batch_size=${4:-"2"}
@@ -17,8 +17,9 @@ function _set_params(){
device=${CUDA_VISIBLE_DEVICES//,/ }
arr=(${device})
num_gpu_devices=${#arr[*]}
log_file=${run_log_path}/${model_name}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}
res_log_file=${run_log_path}/${model_name}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}_speed
log_file=${run_log_path}/${model_item}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}
res_log_file=${run_log_path}/${model_item}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}_speed
model_name=${model_item}_bs${batch_size}_${fp_item}
}

function _analysis_log(){
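
The renamed parameter above separates `model_item` (what the caller passes, e.g. FOMM) from the derived `model_name` that ends up in the speed file. A minimal sketch of how the names in `_set_params` compose, with illustrative values:

```bash
# Illustrative values; mirrors the assignments shown in the hunk above.
model_item=FOMM
run_mode=sp
fp_item=fp32
batch_size=8
run_log_path=.
CUDA_VISIBLE_DEVICES=0

device=${CUDA_VISIBLE_DEVICES//,/ }   # "0,1" -> "0 1"
arr=(${device})
num_gpu_devices=${#arr[*]}            # number of visible GPUs

log_file=${run_log_path}/${model_item}_${run_mode}_bs${batch_size}_${fp_item}_${num_gpu_devices}
res_log_file=${log_file}_speed
model_name=${model_item}_bs${batch_size}_${fp_item}

echo ${log_file}      # ./FOMM_sp_bs8_fp32_1
echo ${res_log_file}  # ./FOMM_sp_bs8_fp32_1_speed
echo ${model_name}    # FOMM_bs8_fp32
```
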
33 changes: 20 additions & 13 deletions OtherFrame/gan/PyTorch/mmedting/run_PyTorch.sh
@@ -9,25 +9,32 @@ run_cmd="cp /workspace/scripts/PrepareEnv.sh ./;
cp -r /workspace/mmedi_benchmark_configs ./;
cp /workspace/scripts/run_benchmark.sh ./;
cp /workspace/scripts/analysis_log.py ./;
PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh esrgan_sp_bs32 sp fp32 32 300 4;
PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh esrgan_sp_bs64 sp fp32 64 300 4;
PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh esrgan_mp_bs32 mp fp32 32 300 4;
PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh esrgan_mp_bs64 mp fp32 64 300 4;
PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh edvr_sp_bs4 sp fp32 4 300 3;
PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh edvr_sp_bs64 sp fp32 64 300 3;
PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh edvr_mp_bs4 mp fp32 4 300 3;
PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh edvr_mp_bs64 mp fp32 64 300 3;
PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh basicvsr_sp_bs2 sp fp32 2 300 4;
PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh basicvsr_sp_bs4 sp fp32 4 300 4;
PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh basicvsr_mp_bs2 mp fp32 2 300 4;
PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh basicvsr_mp_bs4 mp fp32 4 300 4;
sed -i '/set\ -xe/d' run_benchmark.sh
PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh esrgan_bs32_fp32 sp fp32 32 300 4;
PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh esrgan_bs32_fp32 mp fp32 32 300 4;
PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh edvr_bs4_fp32 sp fp32 4 300 3;
PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh edvr_bs4_fp32 mp fp32 4 300 3;
PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh basicvsr_bs2_fp32 sp fp32 2 300 4;
PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh basicvsr_bs4_fp32 sp fp32 4 300 4;
PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh basicvsr_bs2_fp32 mp fp32 2 300 4;
PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3 bash run_benchmark.sh basicvsr_bs4_fp32 mp fp32 4 300 4;
"
#PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh esrgan_bs64_fp32 sp fp32 64 300 4;
#PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh esrgan_bs64_fp32 mp fp32 64 300 4;
#PORT=23335 CUDA_VISIBLE_DEVICES=0 bash run_benchmark.sh edvr_bs64_fp32 sp fp32 64 300 3;
#PORT=23335 CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash run_benchmark.sh edvr_bs64_fp32 mp fp32 64 300 3;

nvidia-docker run --name test_torch_gan -i \
--net=host \
--shm-size=128g \
-v $PWD:/workspace \
-v /ssd2:/ssd2 \
-e "ALL_PATH=${all_path}" \
-v "BENCHMARK_ROOT=/workspace" \
-e "http_proxy=${http_proxy}" \
-e "https_proxy=${http_proxy}" \
-e "no_proxy=bcebos.com" \
${ImageName} /bin/bash -c "${run_cmd}"

nvidia-docker stop test_torch_gan
nvidia-docker rm test_torch_gan
nvidia-docker rm test_torch_gan
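
One note on the container flags used in both run_PyTorch.sh scripts: `-e` injects an environment variable into the container, while `-v` mounts a host path, so `BENCHMARK_ROOT=/workspace` belongs behind `-e`. A minimal reference sketch, unrelated to the PR's images:

```bash
# Reference only: -e sets an environment variable inside the container,
# -v mounts a host directory. Image and paths are illustrative.
docker run --rm \
    -e "BENCHMARK_ROOT=/workspace" \
    -v "$PWD:/workspace" \
    ubuntu:20.04 /bin/bash -c 'echo $BENCHMARK_ROOT; ls /workspace'
```
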
5 changes: 1 addition & 4 deletions OtherFrame/gan/PyTorch/mmedting/scripts/PrepareEnv.sh
@@ -5,10 +5,7 @@ echo "*******prepare benchmark***********"

################################# create some log directories, e.g.:
export BENCHMARK_ROOT=/workspace
log_date=`date "+%Y.%m%d.%H%M%S"`
frame=pytorch1.9.0
cuda_version=10.2
save_log_dir=${BENCHMARK_ROOT}/logs/${frame}_${log_date}_${cuda_version}/
save_log_dir=${BENCHMARK_ROOT}/logs/

if [[ -d ${save_log_dir} ]]; then
rm -rf ${save_log_dir}
22 changes: 12 additions & 10 deletions OtherFrame/gan/PyTorch/mmedting/scripts/run_benchmark.sh
@@ -17,8 +17,8 @@ function _set_params(){
device=${CUDA_VISIBLE_DEVICES//,/ }
arr=(${device})
num_gpu_devices=${#arr[*]}
log_file=${run_log_path}/${model_name}_${fp_item}_${num_gpu_devices}
res_log_file=${run_log_path}/${model_name}_${fp_item}_${num_gpu_devices}_speed
log_file=${run_log_path}/${model_name}_${num_gpu_devices}_${run_mode}
res_log_file=${run_log_path}/${model_name}_${num_gpu_devices}_${run_mode}_speed
}

function _analysis_log(){
@@ -30,18 +30,20 @@ function _analysis_log(){
function _train(){
echo "Train ${model_name} on ${num_gpu_devices} GPUs"
echo "current CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES, gpus=$num_gpu_devices, batch_size=$batch_size"

train_config="mmedi_benchmark_configs/${model_name}.py"


train_config="mmedi_benchmark_configs/${model_name%%_*}_${run_mode}_bs${batch_size}.py"
train_options="--no-validate "

case ${run_mode} in
sp) train_cmd="./tools/dist_train.sh ${train_config} 1 ${train_options}" ;;
mp)
case ${model_name} in
basicvsr_mp_bs2|basicvsr_mp_bs4) train_cmd="./tools/dist_train.sh ${train_config} 4 ${train_options}" ;;
*) train_cmd="./tools/dist_train.sh ${train_config} 8 ${train_options}"
esac
;;
if [ ${model_name} = "basicvsr_bs2_fp32" ] || [ ${model_name} = "basicvsr_bs4_fp32" ]; then
train_cmd="./tools/dist_train.sh ${train_config} 4 ${train_options}"
else
train_cmd="./tools/dist_train.sh ${train_config} 8 ${train_options}"
fi
;;
*) echo "choose run_mode(sp or mp)"; exit 1;
esac

@@ -64,4 +66,4 @@ function _train(){
}

_set_params $@
_train
_train
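
The reworked `train_config` line above derives the mmediting config file from the case name via bash parameter expansion: `${model_name%%_*}` keeps only the part before the first underscore. A small sketch using one of the case names passed in run_PyTorch.sh:

```bash
# Illustrative: how the config path is assembled for one of the cases above.
model_name=esrgan_bs32_fp32
run_mode=sp
batch_size=32

train_config="mmedi_benchmark_configs/${model_name%%_*}_${run_mode}_bs${batch_size}.py"
echo ${train_config}   # mmedi_benchmark_configs/esrgan_sp_bs32.py
```
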
18 changes: 17 additions & 1 deletion OtherFrame/scripts/auto_run.sh
@@ -57,7 +57,7 @@ function set_env(){



cur_torch_list=(clas_model_torch seg_model_torch speech_model_torch detec_torch_jde-fairmot detec_torch_fast)
cur_torch_list=(clas_model_torch seg_model_torch speech_model_torch detec_torch_jde-fairmot detec_torch_fast gan_torch_models)
cur_mxnet_list=()
cur_tensorflow_list=()

@@ -113,6 +113,22 @@ detec_torch_fast(){
cp models/SOLO/*fp32_8 ${TRAIN_LOG_DIR}
}

gan_torch_models(){
# FOMM
cur_model_path=${ROOT_DIR}/gan/PyTorch/fomm
cd ${cur_model_path}
bash run_PyTorch.sh
cp ${cur_model_path}/logs/train_log/* ${TRAIN_LOG_DIR}
cp ${cur_model_path}/*speed ${LOG_PATH_INDEX_DIR}

# edvr basicvsr esrgan
cur_model_path=${ROOT_DIR}/gan/PyTorch/mmedting
cd ${cur_model_path}
bash run_PyTorch.sh
cp ${cur_model_path}/*speed ${LOG_PATH_INDEX_DIR}
cp ${cur_model_path}/*sp ${TRAIN_LOG_DIR}
cp ${cur_model_path}/*mp ${TRAIN_LOG_DIR}
}
set_env
for model_name in ${cur_torch_list[@]}
do
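
Adding `gan_torch_models` to `cur_torch_list` is what makes the new function run: the loop at the end of auto_run.sh (truncated here) presumably invokes each list entry as the shell function of the same name. A minimal sketch of that assumed dispatch pattern:

```bash
# Assumed dispatch pattern, not the literal loop body from auto_run.sh.
cur_torch_list=(gan_torch_models)

gan_torch_models(){
    echo "run GAN benchmarks here"
}

for model_name in ${cur_torch_list[@]}
do
    ${model_name}   # each list entry names a function defined above
done
```
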