Skip to content

Commit 46fe570

Browse files
authored
Merge pull request #603 from ForFishes/add_yaml
Add yaml config for gpt model
2 parents 14e9f3f + 7593b7c commit 46fe570

File tree

15 files changed

+380
-494
lines changed

15 files changed

+380
-494
lines changed
+61
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
# 175B
2+
PreTraining:
3+
device: gpu
4+
max_steps: 500000
5+
num_train_epochs: 1
6+
seed: 1024
7+
use_recompute: True
8+
batch_size:
9+
global_batch_size: 8
10+
local_batch_size:
11+
micro_batch_size: 1
12+
mix_precision:
13+
use_pure_fp16: True
14+
scale_loss: 32768.0
15+
logging_freq: 1
16+
eval_freq: 500
17+
eval_iters: 10
18+
dataset:
19+
input_dir: ./data
20+
split: '949,50,1'
21+
max_seq_len: 2048
22+
save_load:
23+
save_steps: 1000
24+
output_dir: ./output
25+
ckpt_dir:
26+
27+
Model:
28+
vocab_size: 51200
29+
hidden_size: 12288
30+
num_layers: 96
31+
num_attention_heads: 96
32+
ffn_hidden_size:
33+
hidden_dropout_prob: 0.1
34+
attention_probs_dropout_prob: 0.1
35+
max_position_embeddings: 1024
36+
type_vocab_size: 16
37+
initializer_range: 0.02
38+
39+
Distributed:
40+
dp_degree: 1
41+
mp_degree: 8
42+
pp_degree: 16
43+
sharding:
44+
sharding_degree: 1
45+
sharding_stage: 1
46+
sharding_offload: False
47+
48+
Optimizer:
49+
# name: Adam
50+
weight_decay: 0.01
51+
adam_beta1: 0.9
52+
adam_beta2: 0.999
53+
adam_epsilon: 1.0e-8
54+
lr:
55+
# name: cosine
56+
decay_steps: 360000
57+
# max_steps: 500000
58+
warmup_rate: 0.01
59+
max_lr: 5.0e-5
60+
min_lr: 1.0e-5
61+
grad_clip: 1.0

examples/gpt/3D_parallelism/run.sh

+3-24
Original file line numberDiff line numberDiff line change
@@ -16,29 +16,8 @@ export PYTHONPATH=$PYTHONPATH:../../../
1616

1717
log_dir=dp2_pp2_mp2
1818
rm -rf $log_dir
19+
export FLAGS_enable_eager_mode=0
1920

20-
# 345M
21+
# 175B
2122
python -m paddle.distributed.launch --log_dir $log_dir --devices "0,1,2,3,4,5,6,7" run_pretrain.py \
22-
--input_dir "./data"\
23-
--output_dir "output"\
24-
--vocab_size 50304\
25-
--hidden_size 1024\
26-
--num_layers 24\
27-
--num_attention_heads 16\
28-
--max_seq_len 1024\
29-
--weight_decay 0.01\
30-
--grad_clip 1.0\
31-
--max_steps 500000\
32-
--save_steps 100000\
33-
--decay_steps 320000\
34-
--device gpu\
35-
--eval_freq 1000\
36-
--warmup_rate 0.01\
37-
--scale_loss 32768\
38-
--global_batch_size 32\
39-
--micro_batch_size 1\
40-
--dp_degree 1\
41-
--mp_degree 8\
42-
--pp_degree 1\
43-
--use_recompute True\
44-
--use_pure_fp16 False
23+
-c ./configs.yaml

examples/gpt/3D_parallelism/run_pretrain.py

+2-2
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919
import time
2020
import sys
2121
sys.path.append("..")
22-
from args import parse_args
22+
from tools import parse_args, parse_yaml
2323

2424
import numpy as np
2525
import paddle
@@ -295,5 +295,5 @@ def do_train(args):
295295

296296

297297
if __name__ == "__main__":
298-
args = parse_args()
298+
args = parse_yaml(parse_args().config)
299299
do_train(args)

0 commit comments

Comments
 (0)