@@ -9,6 +9,7 @@ python_bin="/opt/conda/envs/helixfold/bin/python"
9
9
# python_bin="python3"
10
10
11
11
# export NCCL_DEBUG=INFO
12
+ # export LD_LIBRARY_PATH=/usr/local/cuda/compat:$LD_LIBRARY_PATH
12
13
export PYTHONPATH=$root_path :$PYTHONPATH
13
14
# export PADDLE_NODE_NUM=$PADDLE_TRAINERS_NUM
14
15
# export PADDLE_NODE_NUM=1
@@ -17,11 +18,11 @@ LDDT_SCORE_BIN="$root_path/tools/lddt"
17
18
chmod +x $TM_SCORE_BIN
18
19
chmod +x $LDDT_SCORE_BIN
19
20
20
- # Disable C++ enisum, using python enisum
21
- export FLAGS_new_einsum=0
21
+ # Enable C++ einsum instead of python einsum
22
+ export FLAGS_new_einsum=1
22
23
23
- # Enable bf16 optimization
24
- export FLAGS_use_autotune=1
24
+ # Enable (1) / disable (0) bf16 optimization
25
+ export FLAGS_use_autotune=0
25
26
26
27
train_af2_single () {
27
28
start_step=0
@@ -37,6 +38,7 @@ train_af2_single() {
37
38
--start_step=${start_step} \
38
39
--train_step=${train_step} \
39
40
--precision=${precision} \
41
+ --amp_level=${amp_level} \
40
42
--num_workers 6 \
41
43
--seed 2022 \
42
44
--batch_size=$batch_size \
@@ -66,6 +68,7 @@ train_af2_distributed() {
66
68
--start_step=${start_step} \
67
69
--train_step=${train_step} \
68
70
--precision=${precision} \
71
+ --amp_level=${amp_level} \
69
72
--num_workers 6 \
70
73
--seed 2022 \
71
74
--batch_size=$batch_size \
@@ -95,6 +98,8 @@ mkdir -p debug_log debug_models
95
98
model_name=" initial"
96
99
precision=" bf16"
97
100
# precision="fp32"
101
+ # amp_level="O1"
102
+ amp_level=" O2"
98
103
log_step=" --log_step=20"
99
104
eval_step=" --eval_step=1000"
100
105
save_step=" --save_step=1000"
@@ -116,10 +121,13 @@ mkdir -p debug_log debug_models
116
121
model_name=" finetune"
117
122
precision=" bf16"
118
123
# precision="fp32"
124
+ # amp_level="O1"
125
+ amp_level=" O2"
119
126
log_step=" --log_step=20"
120
127
eval_step=" --eval_step=1000"
121
128
save_step=" --save_step=1000"
122
- # init_model="$root_path/data/pd_params/model_5.pdparams"
129
+ # init_model="$root_path/data/params/params_model_1.npz"
130
+ # init_model="$root_path/data/pd_params/model_1.pdparams"
123
131
train_af2_single
124
132
fi
125
133
}
@@ -139,6 +147,8 @@ mkdir -p debug_log debug_models
139
147
model_name=" initial"
140
148
precision=" bf16"
141
149
# precision="fp32"
150
+ # amp_level="O1"
151
+ amp_level=" O2"
142
152
log_step=" --log_step=20"
143
153
eval_step=" --eval_step=1000"
144
154
save_step=" --save_step=1000"
@@ -163,10 +173,13 @@ mkdir -p debug_log debug_models
163
173
model_name=" finetune"
164
174
precision=" bf16"
165
175
# precision="fp32"
176
+ # amp_level="O1"
177
+ amp_level=" O2"
166
178
log_step=" --log_step=20"
167
179
eval_step=" --eval_step=1000"
168
180
save_step=" --save_step=1000"
169
- # init_model="$root_path/data/pd_params/model_5.pdparams"
181
+ # init_model="$root_path/data/params/params_model_1.npz"
182
+ # init_model="$root_path/data/pd_params/model_1.pdparams"
170
183
train_af2_distributed
171
184
fi
172
185
}
@@ -186,6 +199,8 @@ mkdir -p debug_log debug_models
186
199
model_name=" initial"
187
200
precision=" bf16"
188
201
# precision="fp32"
202
+ # amp_level="O1"
203
+ amp_level=" O2"
189
204
log_step=" --log_step=20"
190
205
eval_step=" --eval_step=1000"
191
206
save_step=" --save_step=1000"
@@ -210,10 +225,13 @@ mkdir -p debug_log debug_models
210
225
model_name=" finetune"
211
226
precision=" bf16"
212
227
# precision="fp32"
228
+ # amp_level="O1"
229
+ amp_level=" O2"
213
230
log_step=" --log_step=20"
214
231
eval_step=" --eval_step=1000"
215
232
save_step=" --save_step=1000"
216
- # init_model="$root_path/data/pd_params/model_5.pdparams"
233
+ # init_model="$root_path/data/params/params_model_1.npz"
234
+ # init_model="$root_path/data/pd_params/model_1.pdparams"
217
235
train_af2_distributed
218
236
fi
219
237
}
@@ -232,6 +250,8 @@ mkdir -p debug_log debug_models
232
250
model_name=" initial"
233
251
precision=" bf16"
234
252
# precision="fp32"
253
+ # amp_level="O1"
254
+ amp_level=" O2"
235
255
log_step=" --log_step=20"
236
256
eval_step=" --eval_step=1000"
237
257
save_step=" --save_step=1000"
@@ -253,6 +273,8 @@ mkdir -p debug_log debug_models
253
273
model_name=" initial"
254
274
precision=" bf16"
255
275
# precision="fp32"
276
+ # amp_level="O1"
277
+ amp_level=" O2"
256
278
log_step=" --log_step=20"
257
279
eval_step=" --eval_step=1000"
258
280
save_step=" --save_step=1000"
@@ -274,6 +296,8 @@ mkdir -p debug_log debug_models
274
296
model_name=" initial"
275
297
precision=" bf16"
276
298
# precision="fp32"
299
+ # amp_level="O1"
300
+ amp_level=" O2"
277
301
log_step=" --log_step=20"
278
302
eval_step=" --eval_step=1000"
279
303
save_step=" --save_step=1000"
0 commit comments