
Commit 8a50352

Revert grad scale optimization pr (#50839)
* Revert "fixoptminizer _set_auxiliary_var bug (#50335)" This reverts commit c44005f. * Revert "refine optimizer create accumulators (#50188)" This reverts commit 244e754. * Revert "fix found_inf bug for custom optimizer (#50158)" This reverts commit 64573f9. * Revert "refine amp scaler found_inf (#49864)" This reverts commit 382e9a0. * fix code format * fix conflict
1 parent 09694f8 commit 8a50352

19 files changed: +101 -212 lines changed


python/paddle/amp/grad_scaler.py  (+15 -37)

@@ -18,7 +18,7 @@
 
 import numpy as np
 
-from paddle import _C_ops, _legacy_C_ops
+from paddle import _legacy_C_ops
 from paddle.fluid import core, in_dygraph_mode
 from paddle.fluid.data_feeder import check_type
 from paddle.fluid.dygraph import to_variable
@@ -228,16 +228,11 @@ def minimize(self, optimizer, *args, **kwargs):
 
         optimize_ops, params_grads = (None, None)
 
-        if hasattr(optimizer, "_set_auxiliary_var"):
-            optimizer._set_auxiliary_var('found_inf', self._found_inf)
-            optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
-            self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf')
+        if self._found_inf:
+            self._cache_founf_inf = True
         else:
-            if self._found_inf:
-                self._cache_founf_inf = True
-            else:
-                optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
-                self._cache_founf_inf = False
+            optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
+            self._cache_founf_inf = False
 
         if self._use_dynamic_loss_scaling:
             # uopdate the scale
@@ -335,9 +330,6 @@ def _unscale(self, optimizer):
                     param_grads_fp16,
                     self._temp_found_inf_fp16,
                 )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp16
-                )
             if len(param_grads_bf16):
                 _legacy_C_ops.check_finite_and_unscale(
                     param_grads_bf16,
@@ -346,9 +338,6 @@ def _unscale(self, optimizer):
                     param_grads_bf16,
                     self._temp_found_inf_bf16,
                 )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_bf16
-                )
             if len(param_grads_fp32):
                 _legacy_C_ops.check_finite_and_unscale(
                     param_grads_fp32,
@@ -357,9 +346,6 @@ def _unscale(self, optimizer):
                     param_grads_fp32,
                     self._temp_found_inf_fp32,
                 )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp32
-                )
         else:
             if len(param_grads_fp16):
                 _legacy_C_ops.check_finite_and_unscale(
@@ -368,29 +354,26 @@ def _unscale(self, optimizer):
                     param_grads_fp16,
                     self._temp_found_inf_fp16,
                 )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp16
-                )
             if len(param_grads_bf16):
                 _legacy_C_ops.check_finite_and_unscale(
                     param_grads_bf16,
                     self._scale,
                     param_grads_bf16,
                     self._temp_found_inf_bf16,
                 )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_bf16
-                )
             if len(param_grads_fp32):
                 _legacy_C_ops.check_finite_and_unscale(
                     param_grads_fp32,
                     self._scale,
                     param_grads_fp32,
                     self._temp_found_inf_fp32,
                 )
-                self._found_inf = _C_ops.bitwise_or(
-                    self._found_inf, self._temp_found_inf_fp32
-                )
+
+        self._found_inf = (
+            self._temp_found_inf_fp16
+            or self._temp_found_inf_bf16
+            or self._temp_found_inf_fp32
+        )
 
         optimizer_state["state"] = OptimizerState.UNSCALED
 
@@ -778,16 +761,11 @@ def step(self, optimizer):
         if optimizer_state["state"] is OptimizerState.INIT:
             self._unscale(optimizer)
 
-        if hasattr(optimizer, "_set_auxiliary_var"):
-            optimizer._set_auxiliary_var('found_inf', self._found_inf)
-            optimizer.step()
-            self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf')
+        if self._found_inf:
+            self._cache_founf_inf = True
         else:
-            if self._found_inf:
-                self._cache_founf_inf = True
-            else:
-                optimizer.step()
-                self._cache_founf_inf = False
+            optimizer.step()
+            self._cache_founf_inf = False
 
         optimizer_state["state"] = OptimizerState.STEPPED
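
For orientation, the branch restored above is what a normal dygraph AMP loop exercises. The following is a minimal usage sketch, not part of this commit (the model, optimizer, and data are placeholders): when unscaling finds an inf/nan, scaler.step() skips optimizer.step() and scaler.update() shrinks the dynamic loss scale; otherwise the optimizer step runs as usual.

# Minimal usage sketch (illustrative only; model/optimizer/data are placeholders).
import paddle

model = paddle.nn.Linear(10, 1)
optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

data = paddle.rand([4, 10])
with paddle.amp.auto_cast():
    loss = model(data).mean()

scaled = scaler.scale(loss)   # multiply the loss by the current loss scale
scaled.backward()             # gradients carry the same scale
scaler.step(optimizer)        # unscale; skip optimizer.step() if inf/nan was found
scaler.update()               # adjust the dynamic loss scale for the next step
optimizer.clear_grad()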

python/paddle/distributed/fleet/meta_optimizers/ascend/ascend_optimizer.py  (-4)

@@ -236,10 +236,6 @@ def _get_input_varlist(self, program):
                 ret_list.append(var)
         return ret_list
 
-    def _set_auxiliary_var(self, key, val):
-        super()._set_auxiliary_var(key, val)
-        self.inner_opt._set_auxiliary_var(key, val)
-
     def minimize(
         self,
         loss,
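
The same _set_auxiliary_var forwarding is removed from the other wrapper optimizers below (meta_optimizer_base.py, the stage-2 sharded optimizer, and LookAhead). A rough sketch of the deleted pattern, using hypothetical class names rather than Paddle internals: the wrapper records an auxiliary variable such as 'found_inf' and propagates it to the optimizer it wraps.

# Illustrative sketch only -- hypothetical classes, not Paddle's actual API.
class _InnerOpt:
    def __init__(self):
        self._auxiliary_vars = {}

    def _set_auxiliary_var(self, key, val):
        self._auxiliary_vars[key] = val

    def _get_auxiliary_var(self, key):
        return self._auxiliary_vars.get(key, None)


class _WrapperOpt(_InnerOpt):
    def __init__(self, inner_opt):
        super().__init__()
        self.inner_opt = inner_opt

    def _set_auxiliary_var(self, key, val):
        # the deleted pattern: store locally, then propagate to the wrapped optimizer
        super()._set_auxiliary_var(key, val)
        self.inner_opt._set_auxiliary_var(key, val)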

python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_gradscaler.py  (+4 -9)

@@ -41,16 +41,11 @@ def minimize(self, optimizer, *args, **kwargs):
 
         optimize_ops, params_grads = (None, None)
 
-        if hasattr(optimizer, "_set_auxiliary_var"):
-            optimizer._set_auxiliary_var('found_inf', self._found_inf)
-            optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
-            self._cache_founf_inf = optimizer._get_auxiliary_var('found_inf')
+        if self._found_inf:
+            self._cache_founf_inf = True
         else:
-            if self._found_inf:
-                self._cache_founf_inf = True
-            else:
-                optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
-                self._cache_founf_inf = False
+            optimize_ops, params_grads = optimizer.minimize(*args, **kwargs)
+            self._cache_founf_inf = False
 
         if self._use_dynamic_loss_scaling:
             self._update()

python/paddle/distributed/fleet/meta_optimizers/meta_optimizer_base.py  (-4)

@@ -25,10 +25,6 @@ def __init__(self, optimizer):
         self.meta_optimizers_white_list = []
         self.meta_optimizers_black_list = []
 
-    def _set_auxiliary_var(self, key, val):
-        super()._set_auxiliary_var(key, val)
-        self.inner_opt._set_auxiliary_var(key, val)
-
     def _set_basic_info(
         self, loss, role_maker, user_defined_optimizer, user_defined_strategy
     ):

python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_optimizer_stage2.py  (-4)

@@ -203,10 +203,6 @@ def __init__(
         # Update optimizer parameters and adjust parameter storage and use according to rank.
         self._update_opt_status()
 
-    def _set_auxiliary_var(self, key, val):
-        super()._set_auxiliary_var(key, val)
-        self._optim._set_auxiliary_var(key, val)
-
     @paddle.autograd.no_grad()
     def _sync_params_and_buffers(self):
         """

python/paddle/distributed/fleet/meta_parallel/sharding/group_sharded_utils.py  (+12 -14)

@@ -19,10 +19,10 @@
 import numpy as np
 
 import paddle
-from paddle import _C_ops, _legacy_C_ops
+from paddle import _legacy_C_ops
 from paddle.common_ops_import import dygraph_only
-from paddle.fluid import core
 from paddle.fluid.dygraph import to_variable
+from paddle.framework import core
 from paddle.nn import clip
 
 
@@ -270,37 +270,35 @@ def unscale_method(self, optimizer):
                 param_grads_bfp16,
                 temp_found_inf_bfp16,
             )
-            self._found_inf = _C_ops.bitwise_or(
-                self._found_inf, temp_found_inf_bfp16
-            )
         if len(param_grads_fp16):
             _legacy_C_ops.check_finite_and_unscale(
                 param_grads_fp16,
                 self._scale,
                 param_grads_fp16,
                 temp_found_inf_fp16,
             )
-            self._found_inf = _C_ops.bitwise_or(
-                self._found_inf, temp_found_inf_fp16
-            )
         if len(param_grads_fp32):
             _legacy_C_ops.check_finite_and_unscale(
                 param_grads_fp32,
                 self._scale,
                 param_grads_fp32,
                 temp_found_inf_fp32,
            )
-            self._found_inf = _C_ops.bitwise_or(
-                self._found_inf, temp_found_inf_fp32
-            )
 
-        self._found_inf = self._found_inf.cast("int32")
+        self._found_inf = (
+            1
+            if temp_found_inf_bfp16
+            or temp_found_inf_fp16
+            or temp_found_inf_fp32
+            else 0
+        )
+        is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
 
         paddle.distributed.all_reduce(
-            self._found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
+            is_found_inf, op=paddle.distributed.ReduceOp.SUM, group=None
        )
 
-        self._found_inf = self._found_inf.cast("bool")
+        self._found_inf = is_found_inf.numpy()[0]
 
     scaler._unscale = MethodType(unscale_method, scaler)
     return scaler
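
A condensed sketch of the aggregation restored above, with a made-up helper name and assuming the paddle.distributed environment is already initialized: each rank collapses its per-dtype overflow flags into a single 0/1 value, all-reduces it, and treats any nonzero result as an overflow somewhere. This file reduces with SUM; the fleet scaler in the next diff uses MAX, which is equivalent for a 0/1 flag.

# Hypothetical helper, shown only to illustrate the restored logic.
import paddle
import paddle.distributed as dist

def any_rank_found_inf(local_flags):
    # local_flags: this rank's per-dtype temp_found_inf_* values
    found = 1 if any(bool(f) for f in local_flags) else 0
    is_found_inf = paddle.to_tensor([found], dtype="int32")
    # SUM here (as in group_sharded_utils); MAX (as in fleet/scaler.py) works too,
    # since any nonzero contribution means some rank overflowed.
    dist.all_reduce(is_found_inf, op=dist.ReduceOp.SUM)
    return bool(is_found_inf.numpy()[0])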

python/paddle/distributed/fleet/scaler.py  (+5 -10)

@@ -17,7 +17,7 @@
 import numpy as np
 
 import paddle
-from paddle import _C_ops, _legacy_C_ops
+from paddle import _legacy_C_ops
 from paddle.distributed import fleet
 from paddle.fluid.dygraph import to_variable
 from paddle.framework import core
@@ -73,29 +73,24 @@ def unscale_method(self, optimizer):
                 param_grads_fp16,
                 temp_found_inf_fp16,
             )
-            self._found_inf = _C_ops.bitwise_or(
-                self._found_inf, temp_found_inf_fp16
-            )
         if len(param_grads_fp32):
             _legacy_C_ops.check_finite_and_unscale(
                 param_grads_fp32,
                 self._scale,
                 param_grads_fp32,
                 temp_found_inf_fp32,
             )
-            self._found_inf = _C_ops.bitwise_or(
-                self._found_inf, temp_found_inf_fp32
-            )
 
-        self._found_inf = self._found_inf.cast("int32")
+        self._found_inf = 1 if temp_found_inf_fp16 or temp_found_inf_fp32 else 0
+        is_found_inf = paddle.to_tensor([self._found_inf], dtype="int32")
 
         # TODO(shenliang03) Since dp allreduce in the optimizer is
         # after the gradscaler, check_finite needs to synchronize global
         # information. In the future, we should use check_group to speed.
         paddle.distributed.all_reduce(
-            self._found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
+            is_found_inf, op=paddle.distributed.ReduceOp.MAX, group=None
         )
-        self._found_inf = self._found_inf.cast("bool")
+        self._found_inf = is_found_inf.numpy()[0]
 
     # Only data_parallel doesn't need to modify scaler
     fleet_env = fleet.fleet

python/paddle/fluid/optimizer.py  (+6 -12)

@@ -19,6 +19,7 @@
 
 import paddle
 
+
 from paddle.fluid.framework import (
     Program,
     Variable,
@@ -899,18 +900,11 @@ def _create_optimization_pass(self, parameters_and_grads):
         self._create_global_learning_rate()
 
         if in_dygraph_mode():
-            found_inf = self._get_auxiliary_var('found_inf')
-            if found_inf:
-                if isinstance(found_inf, core.eager.Tensor):
-                    self._set_auxiliary_var('found_inf', True)
-            else:
-                if isinstance(found_inf, core.eager.Tensor):
-                    self._set_auxiliary_var('found_inf', False)
-                for param_and_grad in parameters_and_grads:
-                    if param_and_grad[1] is None:
-                        continue
-                    if param_and_grad[0].trainable is True:
-                        self._append_optimize_op(target_block, param_and_grad)
+            for param_and_grad in parameters_and_grads:
+                if param_and_grad[1] is None:
+                    continue
+                if param_and_grad[0].trainable is True:
+                    self._append_optimize_op(target_block, param_and_grad)
         else:
             for param_and_grad in parameters_and_grads:
                 if param_and_grad[1] is None:

python/paddle/incubate/optimizer/lookahead.py  (-4)

@@ -144,10 +144,6 @@ def __init__(self, inner_optimizer, alpha=0.5, k=5, name=None):
         self._global_step_var = None
         self._k_var = None
 
-    def _set_auxiliary_var(self, key, val):
-        super()._set_auxiliary_var(key, val)
-        self.inner_optimizer._set_auxiliary_var(key, val)
-
     @framework.dygraph_only
     @imperative_base.no_grad
     def step(self):

python/paddle/optimizer/adadelta.py  (-3)

@@ -145,11 +145,8 @@ def _create_accumulators(self, block, parameters):
             parameters = parameters.get('params')
 
         for p in parameters:
-            if p.name in self._already_create_accumulater:
-                continue
             self._add_accumulator(self._avg_squared_grad_acc_str, p)
             self._add_accumulator(self._avg_squared_update_acc_str, p)
-            self._already_create_accumulater.add(p.name)
 
     def _append_optimize_op(self, block, param_and_grad):
         if isinstance(param_and_grad, dict):

python/paddle/optimizer/adagrad.py  (-3)

@@ -139,14 +139,11 @@ def _create_accumulators(self, block, parameters):
             parameters = self._update_param_group(parameters)
 
         for p in parameters:
-            if p.name in self._already_create_accumulater:
-                continue
             self._add_accumulator(
                 self._moment_acc_str,
                 p,
                 fill_value=self.initial_accumulator_value,
             )
-            self._already_create_accumulater.add(p.name)
 
     def _append_optimize_op(self, block, param_and_grad):
         assert isinstance(block, framework.Block)
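
The two accumulator diffs above (Adadelta and Adagrad) drop the same guard: a set of parameter names that kept _create_accumulators from registering state twice for the same parameter. A standalone sketch of that guard pattern, using hypothetical names rather than Paddle internals:

# Hypothetical illustration of the reverted "already created" guard.
class _AccumulatorStore:
    def __init__(self):
        self._state = {}                 # (acc_name, param_name) -> value
        self._already_created = set()    # parameter names that have accumulators

    def create_for(self, acc_names, param_name, fill_value=0.0):
        if param_name in self._already_created:
            return                       # the guard removed by this revert
        for name in acc_names:
            self._state[(name, param_name)] = fill_value
        self._already_created.add(param_name)

store = _AccumulatorStore()
store.create_for(["moment"], "fc.w_0", fill_value=0.1)
store.create_for(["moment"], "fc.w_0")   # second call is a no-op with the guard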
