Skip to content

[CI]Add timeout exception prompt for auto_parallel ci #72428

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 3 commits into from
Apr 28, 2025
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 28 additions & 5 deletions tools/auto_parallel/ci_auto_parallel.sh
Original file line number Diff line number Diff line change
Expand Up @@ -106,26 +106,49 @@ function execute_func_list(){
let global_total_count++
execute_num=1
while true; do
bash $1 exec_case $func_name $FLAGS_install_deps $FLAGS_download_data
timeout 10m bash $1 exec_case $func_name $FLAGS_install_deps $FLAGS_download_data
result=$?
if [ $result -eq 0 ]; then
echo -e "\033[32m test success!"
let success_count++
let global_success_count++
elif [ $result -eq 1 ]; then
if [ $execute_num -eq 1 ]; then
echo -e "\033[31m first time execute failed, try again!"
let execute_num++
continue
else
echo -e "\033[31m second time execute failed, exit!"
mv ${log_path}/$func_name ${log_path}/${func_name}_FAIL.log
echo -e "\033[31m ${log_path}/$func_name_FAIL \033"
tail -15 ${log_path}/${func_name}_FAIL.log
let runtime_fail_count++
global_runtime_fail_arr+=("$func_name")
fi
elif [ $result -eq 2 ]; then
echo -e "\033[31m verification failed!"
let verification_fail_count++
global_verification_fail_arr+=("$func_name")
elif [ $result -eq 250 ]; then
if [ $execute_num -eq 1 ]; then
echo -e "\033[31m fist time execute failed, try again!"
echo -e "\033[31m first time execute failed, try again!"
let execute_num++
continue
else
echo -e "\033[31m second time execute failed, exit!"
mv ${log_path}/$func_name ${log_path}/${func_name}_FAIL.log
echo -e "\033[31m ${log_path}/$func_name_FAIL \033"
tail -15 ${log_path}/${func_name}_FAIL.log
let exit_250_count++
global_exit_250_arr+=("$func_name")
fi
elif [ $result -eq 124 ]; then
echo "\033[31m [failed-timeout] Test case execution was terminated after exceeding the 10m limit."
mv ${log_path}/$func_name ${log_path}/${func_name}_FAIL.log
echo -e "\033[31m ${log_path}/$func_name_FAIL \033"
tail -15 ${log_path}/${func_name}_FAIL.log
let runtime_fail_count++
global_runtime_fail_arr+=("$func_name")
else
echo "test failed!"
mv ${log_path}/$func_name ${log_path}/${func_name}_FAIL.log
Expand Down Expand Up @@ -210,7 +233,7 @@ if [[ ${#case_list[*]} -ne 0 ]];then
let case_num++
elif [[ ${case} == "llama_auto" ]];then
cmd=/workspace/PaddleNLP/scripts/distribute/ci_case_auto.sh
bash $cmd prepare_case llama_case_list_auto $FLAGS_install_deps $FLAGS_download_data
timeout 5m bash $cmd prepare_case llama_case_list_auto $FLAGS_install_deps $FLAGS_download_data
execute_func_list $cmd llama_auto
# There is no need to reinstall the related packages of `PaddleNLP` afterward.
export FLAGS_install_deps=1
Expand All @@ -221,15 +244,15 @@ if [[ ${#case_list[*]} -ne 0 ]];then
clean_file /workspace/PaddleNLP/llm/auto_parallel/llama
elif [[ ${case} == "gpt-3_auto" ]];then
cmd=/workspace/PaddleNLP/scripts/distribute/ci_case_auto.sh
bash $cmd prepare_case llm_gpt_case_list_auto $FLAGS_install_deps $FLAGS_download_data
timeout 5m bash $cmd prepare_case llm_gpt_case_list_auto $FLAGS_install_deps $FLAGS_download_data
execute_func_list $cmd gpt-3_auto
# there is no need to repeat the `gpt` download process later.
export FLAGS_download_data="gpt ""$FLAGS_download_data"
let case_num++
clean_file /workspace/PaddleNLP/llm/auto_parallel/gpt-3
elif [[ ${case} == "gpt-3_dygraph" ]];then
cmd=/workspace/PaddleNLP/scripts/distribute/ci_case_dy.sh
bash $cmd prepare_case llm_gpt_case_list_dygraph $FLAGS_install_deps $FLAGS_download_data
timeout 5m bash $cmd prepare_case llm_gpt_case_list_dygraph $FLAGS_install_deps $FLAGS_download_data
execute_func_list $cmd gpt-3_dygraph
let case_num++
clean_file /workspace/PaddleNLP/llm
Expand Down
Loading