From 372ed5cc004cdd56b7790fa8d27dee32ac175379 Mon Sep 17 00:00:00 2001 From: liujie44 Date: Wed, 23 Apr 2025 17:11:11 +0800 Subject: [PATCH 1/3] [CI]Add timeout exception prompt for auto_parallel ci --- tools/auto_parallel/ci_auto_parallel.sh | 27 +++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/tools/auto_parallel/ci_auto_parallel.sh b/tools/auto_parallel/ci_auto_parallel.sh index 3b0183ae189bcf..3383d5e2aceea2 100644 --- a/tools/auto_parallel/ci_auto_parallel.sh +++ b/tools/auto_parallel/ci_auto_parallel.sh @@ -106,26 +106,49 @@ function execute_func_list(){ let global_total_count++ execute_num=1 while true; do - bash $1 exec_case $func_name $FLAGS_install_deps $FLAGS_download_data + timeout 10m bash $1 exec_case $func_name $FLAGS_install_deps $FLAGS_download_data result=$? if [ $result -eq 0 ]; then echo -e "\033[32m test success!" let success_count++ let global_success_count++ + elif [ $result -eq 1 ]; then + if [ $execute_num -eq 1 ]; then + echo -e "\033[31m first time execute failed, try again!" + let execute_num++ + continue + else + echo -e "\033[31m second time execute failed, exit!" + mv ${log_path}/$func_name ${log_path}/${func_name}_FAIL.log + echo -e "\033[31m ${log_path}/$func_name_FAIL \033" + tail -15 ${log_path}/${func_name}_FAIL.log + let runtime_fail_count++ + global_runtime_fail_arr+=("$func_name") + fi elif [ $result -eq 2 ]; then echo -e "\033[31m verification failed!" let verification_fail_count++ global_verification_fail_arr+=("$func_name") elif [ $result -eq 250 ]; then if [ $execute_num -eq 1 ]; then - echo -e "\033[31m fist time execute failed, try again!" + echo -e "\033[31m first time execute failed, try again!" let execute_num++ continue else echo -e "\033[31m second time execute failed, exit!" + mv ${log_path}/$func_name ${log_path}/${func_name}_FAIL.log + echo -e "\033[31m ${log_path}/$func_name_FAIL \033" + tail -15 ${log_path}/${func_name}_FAIL.log let exit_250_count++ global_exit_250_arr+=("$func_name") fi + elif [ $result -eq 124 ]; then + echo "\033[31m [failed-timeout] Test case execution was terminated after exceeding the 10m limit." + mv ${log_path}/$func_name ${log_path}/${func_name}_FAIL.log + echo -e "\033[31m ${log_path}/$func_name_FAIL \033" + tail -15 ${log_path}/${func_name}_FAIL.log + let runtime_fail_count++ + global_runtime_fail_arr+=("$func_name") else echo "test failed!" mv ${log_path}/$func_name ${log_path}/${func_name}_FAIL.log From 0d75e4b1f4301eeb48109be2b854cfb5722cfe7b Mon Sep 17 00:00:00 2001 From: liujie44 Date: Thu, 24 Apr 2025 10:20:07 +0800 Subject: [PATCH 2/3] fix codestyle --- tools/auto_parallel/ci_auto_parallel.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/auto_parallel/ci_auto_parallel.sh b/tools/auto_parallel/ci_auto_parallel.sh index 3383d5e2aceea2..7aa8e687e72670 100644 --- a/tools/auto_parallel/ci_auto_parallel.sh +++ b/tools/auto_parallel/ci_auto_parallel.sh @@ -122,8 +122,8 @@ function execute_func_list(){ mv ${log_path}/$func_name ${log_path}/${func_name}_FAIL.log echo -e "\033[31m ${log_path}/$func_name_FAIL \033" tail -15 ${log_path}/${func_name}_FAIL.log - let runtime_fail_count++ - global_runtime_fail_arr+=("$func_name") + let runtime_fail_count++ + global_runtime_fail_arr+=("$func_name") fi elif [ $result -eq 2 ]; then echo -e "\033[31m verification failed!" @@ -147,8 +147,8 @@ function execute_func_list(){ mv ${log_path}/$func_name ${log_path}/${func_name}_FAIL.log echo -e "\033[31m ${log_path}/$func_name_FAIL \033" tail -15 ${log_path}/${func_name}_FAIL.log - let runtime_fail_count++ - global_runtime_fail_arr+=("$func_name") + let runtime_fail_count++ + global_runtime_fail_arr+=("$func_name") else echo "test failed!" mv ${log_path}/$func_name ${log_path}/${func_name}_FAIL.log From cc1ffb26520a6c8917cfea134b6ea137937f3d6a Mon Sep 17 00:00:00 2001 From: liujie44 Date: Fri, 25 Apr 2025 16:38:26 +0800 Subject: [PATCH 3/3] add timeout for prepare_case --- tools/auto_parallel/ci_auto_parallel.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/auto_parallel/ci_auto_parallel.sh b/tools/auto_parallel/ci_auto_parallel.sh index 7aa8e687e72670..6f97cbcd9a7f26 100644 --- a/tools/auto_parallel/ci_auto_parallel.sh +++ b/tools/auto_parallel/ci_auto_parallel.sh @@ -233,7 +233,7 @@ if [[ ${#case_list[*]} -ne 0 ]];then let case_num++ elif [[ ${case} == "llama_auto" ]];then cmd=/workspace/PaddleNLP/scripts/distribute/ci_case_auto.sh - bash $cmd prepare_case llama_case_list_auto $FLAGS_install_deps $FLAGS_download_data + timeout 5m bash $cmd prepare_case llama_case_list_auto $FLAGS_install_deps $FLAGS_download_data execute_func_list $cmd llama_auto # There is no need to reinstall the related packages of `PaddleNLP` afterward. export FLAGS_install_deps=1 @@ -244,7 +244,7 @@ if [[ ${#case_list[*]} -ne 0 ]];then clean_file /workspace/PaddleNLP/llm/auto_parallel/llama elif [[ ${case} == "gpt-3_auto" ]];then cmd=/workspace/PaddleNLP/scripts/distribute/ci_case_auto.sh - bash $cmd prepare_case llm_gpt_case_list_auto $FLAGS_install_deps $FLAGS_download_data + timeout 5m bash $cmd prepare_case llm_gpt_case_list_auto $FLAGS_install_deps $FLAGS_download_data execute_func_list $cmd gpt-3_auto # there is no need to repeat the `gpt` download process later. export FLAGS_download_data="gpt ""$FLAGS_download_data" @@ -252,7 +252,7 @@ if [[ ${#case_list[*]} -ne 0 ]];then clean_file /workspace/PaddleNLP/llm/auto_parallel/gpt-3 elif [[ ${case} == "gpt-3_dygraph" ]];then cmd=/workspace/PaddleNLP/scripts/distribute/ci_case_dy.sh - bash $cmd prepare_case llm_gpt_case_list_dygraph $FLAGS_install_deps $FLAGS_download_data + timeout 5m bash $cmd prepare_case llm_gpt_case_list_dygraph $FLAGS_install_deps $FLAGS_download_data execute_func_list $cmd gpt-3_dygraph let case_num++ clean_file /workspace/PaddleNLP/llm