Commit 6e06e1f

[PaddleNLP] add llm case:rf++ (#3062)
* add PaddleLLM
* Revert "add PaddleLLM". This reverts commit fae1418.
* add PaddleLLM
* fix
* update PaddleNLP to PaddleLLM
* fix path
* fix path for infer.sh
* skip kill
* update step for success
* fix path
* add PYTHONPATH
* fix infer
* fix end
* update step
* update model to 0.5B
* revert model to 0.5B
* test 0.5B
* update cmd
* add if [ == ppo ] || [ == grpo ] || [ == reinforce_plus_plus ]; then
* update reward path
* Update reinforce_plus_plus.sh for config
1 parent 0b75107 commit 6e06e1f

File tree

5 files changed: +154 -12 lines changed

models_restruct/PaddleLLM/cases/llm^alignment^qwen.yaml  (+34 -1)

@@ -33,4 +33,37 @@ case:
     -
       name: grpo_export_infer_ngpus2_bf16   # 2 GPUs, sample inference; mind the working directory! ${save_steps}
       path: llm/
-      cmd: bash ./infer.sh qwen 2 grpo 2
+      cmd: bash ./infer.sh qwen 2 grpo 2
+    -
+      name: reinforce_plus_plus_training
+      path: llm/alignment/ppo
+      cmd: bash reinforce_plus_plus.sh qwen
+      result:
+        policy_loss:
+          base: "compare_path"
+          threshold: 0.1
+          evaluation: "="   # "=": the base value is converted to a string and compared position by position
+        reward:
+          base: "compare_path"
+          threshold: 0.1
+          evaluation: "-"
+        values:
+          base: "compare_path"
+          threshold: 0.1
+          evaluation: "-"
+        kl_divergence:
+          base: "compare_path"
+          threshold: 0.1
+          evaluation: "-"
+        interval_samples_per_second:
+          base: "compare_path"
+          threshold: 0.1
+          evaluation: "-"
+    -
+      name: reinforce_plus_plus_predict_dynamic_ngpus2_default_bf16   # 2-GPU dynamic-graph prediction ${save_steps}
+      path: llm/
+      cmd: bash ./predict.sh qwen 2 reinforce_plus_plus 2
+    -
+      name: reinforce_plus_plus_export_infer_ngpus2_bf16   # 2 GPUs, sample inference; mind the working directory! ${save_steps}
+      path: llm/
+      cmd: bash ./infer.sh qwen 2 reinforce_plus_plus 2
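Read together, the three new cases cover training, dynamic-graph prediction, and export plus static-graph inference for the reinforce++ run. A minimal sketch of the sequence they encode, taking the path and cmd fields from the YAML above; treating PaddleLLM as the checkout root is an assumption for illustration:

(cd PaddleLLM/llm/alignment/ppo && bash reinforce_plus_plus.sh qwen)    # reinforce_plus_plus_training
(cd PaddleLLM/llm && bash ./predict.sh qwen 2 reinforce_plus_plus 2)    # dynamic-graph prediction
(cd PaddleLLM/llm && bash ./infer.sh qwen 2 reinforce_plus_plus 2)      # export + sample inference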

models_restruct/PaddleLLM/train/grpo.sh  (+13 -9)

@@ -13,7 +13,7 @@ echo "Clean up checkpoints"
 rm -rf ../../checkpoints/${model_name}/grpo/* 2>/dev/null

 if [[ ${model_name} == "qwen" ]]; then
-    model_name_or_path="Qwen/Qwen2.5-7B-Instruct-1M"
+    model_name_or_path="Qwen/Qwen2.5-1.5B"
 elif [[ ${model_name} == "llama" ]]; then
     model_name_or_path="meta-llama/Meta-Llama-3-8B"
 fi
@@ -49,44 +49,48 @@ export PYTHONPATH=$llm_path:$PYTHONPATH

 # 4. Launch the training script
 echo "Start the reward service"
+cd reward
 python reward_server.py > reward_server.log 2>&1 &
+cd ..
 echo "Start training:"

 python -u -m paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" run_ppo.py ../../config/${model_name}/grpo_argument.json \
-    --train_datasets "Jsonfile::ppo-kk/34567ppl/train.jsonl" \
-    --eval_datasets "Jsonfile::ppo-kk/5ppl/test.jsonl" \
+    --train_datasets "ppo-kk/34567ppl/train.jsonl" \
+    --eval_datasets "ppo-kk/5ppl/test.jsonl" \
+    --label_key tgt \
     --actor_model_name_or_path ${model_name_or_path} \
     --reward_model_name_or_path "" \
     --output_dir ${output_dir} \
     --max_steps ${steps} \
     --save_steps ${steps} \
-    --tensor_parallel_degree 4 \
+    --tensor_parallel_degree 2 \
     --per_device_prompt_batch_size 1 \
     --per_device_train_batch_size 4 \
     --max_length 1024 \
     --max_prompt_len 512 \
     --pipeline_parallel_degree 1 \
-    --sharding_parallel_degree 2 \
+    --sharding_parallel_degree 4 \
     --sharding "stage1" \
     --recompute 1 \
     ${ext_args}

 echo "Warm restart"
 python -u -m paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" run_ppo.py ../../config/${model_name}/grpo_argument.json \
-    --train_datasets "Jsonfile::ppo-kk/34567ppl/train.jsonl" \
-    --eval_datasets "Jsonfile::ppo-kk/5ppl/test.jsonl" \
+    --train_datasets "ppo-kk/34567ppl/train.jsonl" \
+    --eval_datasets "ppo-kk/5ppl/test.jsonl" \
+    --label_key tgt \
     --actor_model_name_or_path ${model_name_or_path} \
     --reward_model_name_or_path "" \
     --output_dir ${output_dir} \
     --max_steps 1 \
     --save_steps 11 \
-    --tensor_parallel_degree 4 \
+    --tensor_parallel_degree 2 \
     --per_device_prompt_batch_size 1 \
     --per_device_train_batch_size 4 \
     --max_length 1024 \
     --max_prompt_len 512 \
     --pipeline_parallel_degree 1 \
-    --sharding_parallel_degree 2 \
+    --sharding_parallel_degree 4 \
     --sharding "stage1" \
     --recompute 1 \
     ${ext_args}
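Swapping tensor_parallel_degree from 4 to 2 and sharding_parallel_degree from 2 to 4 keeps the overall device usage unchanged: with a data-parallel degree of 1, the product of the tensor, sharding, and pipeline degrees still has to match the 8 launched devices. A small sanity-check sketch (not part of the script) for the new values:

# Hypothetical check: parallel degrees must multiply to the number of launched devices.
tensor_parallel_degree=2
sharding_parallel_degree=4
pipeline_parallel_degree=1
world_size=$((tensor_parallel_degree * sharding_parallel_degree * pipeline_parallel_degree))
echo "${world_size} ranks"   # 8, matching --devices "0,1,2,3,4,5,6,7"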

models_restruct/PaddleLLM/train/infer.sh  (+1 -1)

@@ -6,7 +6,7 @@ step_name=${3:-"grpo"}
 save_steps=${4:-2}

 # 1. Path of the model exported from dynamic to static graph
-if [ "$step_name" == "ppo" ] || [ "$step_name" == "grpo" ]; then
+if [ "$step_name" == "ppo" ] || [ "$step_name" == "grpo" ] || [ "$step_name" == "reinforce_plus_plus" ]; then
     model_name_or_path=./checkpoints/$model_name/${step_name}/policy/checkpoint-${save_steps}
 else
     model_name_or_path=./checkpoints/$model_name/${step_name}
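With the extra branch, a reinforce++ run resolves the same policy-checkpoint layout that ppo and grpo already use. A short illustration with the values the YAML cases pass (model qwen, save_steps 2):

# Illustrative path resolution for the new branch:
model_name=qwen; step_name=reinforce_plus_plus; save_steps=2
if [ "$step_name" == "ppo" ] || [ "$step_name" == "grpo" ] || [ "$step_name" == "reinforce_plus_plus" ]; then
    model_name_or_path=./checkpoints/$model_name/${step_name}/policy/checkpoint-${save_steps}
else
    model_name_or_path=./checkpoints/$model_name/${step_name}
fi
echo "$model_name_or_path"   # ./checkpoints/qwen/reinforce_plus_plus/policy/checkpoint-2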

models_restruct/PaddleLLM/train/predict.sh  (+1 -1)

@@ -6,7 +6,7 @@ step_name=${3:-"grpo"}
 save_steps=${4:-2}

 # 1. Set the model path
-if [ "$step_name" == "ppo" ] || [ "$step_name" == "grpo" ]; then
+if [ "$step_name" == "ppo" ] || [ "$step_name" == "grpo" ] || [ "$step_name" == "reinforce_plus_plus" ]; then
     model_name_or_path=./checkpoints/$model_name/${step_name}/policy/checkpoint-${save_steps}
 else
     model_name_or_path=./checkpoints/$model_name/${step_name}
reinforce_plus_plus.sh  (new file, +105)

@@ -0,0 +1,105 @@
+# work_path: PaddleLLM/llm/alignment/ppo
+# reinforce_plus_plus training
+model_name=$1
+ngpus=${2:-8}
+steps=${3:-2}
+ext_args=""
+
+# 1. Model preparation
+echo "Free GPU memory"
+# fuser -v /dev/nvidia* 2>/dev/null | awk '{for(i=1;i<=NF;i++) if ($i ~ /^[0-9]+$/) print $i}' | xargs kill -9 2>/dev/null
+sleep 3s
+echo "Clean up checkpoints"
+rm -rf ../../checkpoints/${model_name}/reinforce_plus_plus/* 2>/dev/null
+
+if [[ ${model_name} == "qwen" ]]; then
+    model_name_or_path="Qwen/Qwen2.5-1.5B"
+elif [[ ${model_name} == "llama" ]]; then
+    model_name_or_path="meta-llama/Meta-Llama-3-8B"
+fi
+
+output_dir="../../checkpoints/${model_name}/reinforce_plus_plus"  # rooted at the llm directory
+
+# 2. Data preparation
+if [ ! -d "ppo-kk" ]; then
+    wget https://paddlenlp.bj.bcebos.com/datasets/examples/ppo-kk.tgz && tar zxf ppo-kk.tgz
+fi
+
+# 3. Set environment variables
+if [ $ngpus -eq 0 ]; then
+    DEVICE="0"
+elif [ $ngpus -eq 1 ]; then
+    DEVICE="1"
+elif [ $ngpus -eq 2 ]; then
+    DEVICE="2,3"
+elif [ $ngpus -eq 4 ]; then
+    DEVICE="4,5,6,7"
+elif [ $ngpus -eq 8 ]; then
+    DEVICE="0,1,2,3,4,5,6,7"
+else
+    echo "Unsupported number of GPUs"
+    exit 1
+fi
+export CUDA_VISIBLE_DEVICES=${DEVICE}
+current_path=$(pwd)
+repo_path=${current_path%%PaddleLLM*}PaddleLLM
+llm_path=${repo_path}/llm
+export PYTHONPATH=$repo_path:$PYTHONPATH
+export PYTHONPATH=$llm_path:$PYTHONPATH
+
+# 4. Launch the training script
+echo "Start the reward service"
+cd reward
+python reward_server.py > reward_server.log 2>&1 &
+cd ..
+echo "Start training:"
+
+python -u -m paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" run_ppo.py ../../config/${model_name}/grpo_argument.json \
+    --train_datasets "ppo-kk/34567ppl/train.jsonl" \
+    --eval_datasets "ppo-kk/5ppl/test.jsonl" \
+    --label_key tgt \
+    --rl_algorithm reinforce_plus_plus \
+    --normalize_advantage 0 \
+    --normalize_reward 1 \
+    --actor_model_name_or_path ${model_name_or_path} \
+    --reward_model_name_or_path "" \
+    --output_dir ${output_dir} \
+    --max_steps ${steps} \
+    --save_steps ${steps} \
+    --tensor_parallel_degree 2 \
+    --per_device_prompt_batch_size 1 \
+    --per_device_train_batch_size 8 \
+    --max_length 1024 \
+    --max_prompt_len 512 \
+    --pipeline_parallel_degree 1 \
+    --sharding_parallel_degree 4 \
+    --sharding "stage1" \
+    --recompute 1 \
+    ${ext_args}
+
+echo "Warm restart"
+python -u -m paddle.distributed.launch --devices "0,1,2,3,4,5,6,7" run_ppo.py ../../config/${model_name}/grpo_argument.json \
+    --train_datasets "ppo-kk/34567ppl/train.jsonl" \
+    --eval_datasets "ppo-kk/5ppl/test.jsonl" \
+    --label_key tgt \
+    --rl_algorithm reinforce_plus_plus \
+    --normalize_advantage 0 \
+    --normalize_reward 1 \
+    --actor_model_name_or_path ${model_name_or_path} \
+    --reward_model_name_or_path "" \
+    --output_dir ${output_dir} \
+    --max_steps 1 \
+    --save_steps 11 \
+    --tensor_parallel_degree 2 \
+    --per_device_prompt_batch_size 1 \
+    --per_device_train_batch_size 8 \
+    --max_length 1024 \
+    --max_prompt_len 512 \
+    --pipeline_parallel_degree 1 \
+    --sharding_parallel_degree 4 \
+    --sharding "stage1" \
+    --recompute 1 \
+    ${ext_args}
+
+echo "Kill the reward service"
+pkill -9 -f reward_server.py
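For reference, the reinforce_plus_plus_training case in the YAML above calls this script with only the model name, so the positional defaults apply. A sketch of the resulting invocation and the values it implies, all taken from the script body:

# As invoked by the YAML case (defaults: ngpus=8, steps=2):
bash reinforce_plus_plus.sh qwen
# which implies:
#   model_name_or_path="Qwen/Qwen2.5-1.5B"
#   output_dir="../../checkpoints/qwen/reinforce_plus_plus"
#   CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
#   --rl_algorithm reinforce_plus_plus --max_steps 2 --save_steps 2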
