Bias correction for exponential moving average #17677


Merged · 2 commits · May 28, 2019
Changes from all commits
2 changes: 1 addition & 1 deletion paddle/fluid/API.spec
@@ -523,7 +523,7 @@ paddle.fluid.optimizer.LambOptimizer.apply_optimize (ArgSpec(args=['self', 'loss
paddle.fluid.optimizer.LambOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
paddle.fluid.optimizer.LambOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.LambOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
- paddle.fluid.optimizer.ExponentialMovingAverage.__init__ (ArgSpec(args=['self', 'decay', 'zero_init', 'name'], varargs=None, keywords=None, defaults=(0.999, False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+ paddle.fluid.optimizer.ExponentialMovingAverage.__init__ (ArgSpec(args=['self', 'decay', 'thres_steps', 'name'], varargs=None, keywords=None, defaults=(0.999, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.ExponentialMovingAverage.apply (ArgSpec(args=['self', 'executor', 'need_restore'], varargs=None, keywords=None, defaults=(True,)), ('document', '30f494752ac8921dc5835a63637f453a'))
paddle.fluid.optimizer.ExponentialMovingAverage.restore (ArgSpec(args=['self', 'executor'], varargs=None, keywords=None, defaults=None), ('document', '8c8a1791608b02a1ede53d6dd3a4fcec'))
paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '08a5dd9f6f376ff3d55e0b1d92115cbd'))
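
Read as a signature, the updated spec line above corresponds to a constructor call like the sketch below (a hedged illustration; the argument values shown are just the spec defaults, not a recommendation):

    import paddle.fluid as fluid

    # zero_init is removed; thres_steps (default None) takes its place.
    ema = fluid.optimizer.ExponentialMovingAverage(
        decay=0.999, thres_steps=None, name=None)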
112 changes: 82 additions & 30 deletions python/paddle/fluid/optimizer.py
@@ -14,6 +14,7 @@

from __future__ import print_function

+ import numpy as np
from collections import defaultdict
from functools import reduce

@@ -2175,19 +2176,41 @@ class ExponentialMovingAverage(object):

.. math::

- \\text{EMA}_t = \\text{decay} * \\text{EMA}_{t-1} + (1 - \\text{decay}) * \\theta_t
+ \\text{EMA}_0 & = 0

+ \\text{EMA}_t & = \\text{decay} * \\text{EMA}_{t-1} + (1 - \\text{decay}) * \\theta_t

- The average results will be saved in temporary variables which can be
- applied to parameters of current model by calling `apply()` method. And
- the `restore()` method is used to restore the parameters.
+ The average results will be saved in temporary variables which are created
+ and maintained by the object and can be applied to the parameters of the
+ current model by calling the **apply()** method; the **restore()** method
+ is then used to restore the parameters.

+ **Bias correction**. All EMAs are initialized to :math:`0` and hence they will be
+ zero biased, which can be corrected by dividing by a factor
+ :math:`(1 - \\text{decay}^t)`, i.e., the actual EMAs applied to parameters
+ when calling the **apply()** method would be

+ .. math::

+ \\widehat{\\text{EMA}}_t = \\frac{\\text{EMA}_t}{1 - \\text{decay}^t}

+ **Decay rate scheduling**. A large decay rate very close to 1 makes the
+ averages move very slowly, so a better strategy is to set a relatively
+ small decay rate in the very beginning. The argument **thres_steps**
+ allows users to pass a Variable to schedule the decay rate; in that case,
+ the actual decay rate becomes

+ .. math::

+ \\min(\\text{decay}, \\frac{1 + \\text{thres_steps}}{10 + \\text{thres_steps}})

+ Usually **thres_steps** can be the global training steps.


Args:
- decay (float|Variable): The exponential decay rate. Can be scheduled like
- learning rate.
- zero_init (bool): Whether using zero to initialize EMA Variable. If set to
- `True`, :math:`\\text{EMA}_0 = 0.0` else :math:`\\text{EMA}_0 = \\theta_0`.
+ decay (float): The exponential decay rate, usually close to 1, such as
+ 0.999, 0.9999, etc.
+ thres_steps (Variable|None): If not `None`, used to schedule the decay rate.
name (str|None): An optional name prefix.


@@ -2204,25 +2227,35 @@ class ExponentialMovingAverage(object):
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
optimizer.minimize(cost)

- ema = fluid.optimizer.ExponentialMovingAverage(0.99)
+ global_steps = fluid.layers.learning_rate_scheduler._decay_step_counter()
+ ema = fluid.optimizer.ExponentialMovingAverage(0.999, thres_steps=global_steps)

# pseudo code
for pass_id in range(args.pass_num):
for data in train_reader():
exe.run(fluid.default_main_program()...)


+ # usage 1
with ema.apply(exe):
for data in test_reader():
exe.run(inference_program...)

+ # usage 2
+ with ema.apply(exe, need_restore=False):
+ for data in test_reader():
+ exe.run(inference_program...)
+ ...
+ ema.restore(exe)
"""

- def __init__(self, decay=0.999, zero_init=False, name=None):
+ def __init__(self, decay=0.999, thres_steps=None, name=None):
self._decay = decay
- self._zero_init = zero_init
+ self._thres_steps = thres_steps
self._name = name if name is not None else ''
+ self._decay_var = self._get_ema_decay()

self.params_tmps = []
- for param in framework.default_main_program().global_block(
- ).all_parameters():
+ for param in default_main_program().global_block().all_parameters():
if param.do_model_average != False:
tmp = param.block.create_var(
name=unique_name.generate(".".join(
@@ -2232,22 +2265,23 @@ def __init__(self, decay=0.999, zero_init=False, name=None):
stop_gradient=True)
self.params_tmps.append((param, tmp))

- startup_block = default_startup_program().global_block()
ema_vars = {}
for param, tmp in self.params_tmps:
with param.block.program._optimized_guard(
[param, tmp]), name_scope('moving_average'):
- ema_vars[param.name] = self._append_ema_ops(startup_block,
- param)
+ ema_vars[param.name] = self._append_ema_ops(param)

self.apply_program = Program()
block = self.apply_program.global_block()
with program_guard(main_program=self.apply_program):
+ decay_pow = self._get_decay_pow(block)
for param, tmp in self.params_tmps:
param = block._clone_variable(param)
tmp = block._clone_variable(tmp)
ema = block._clone_variable(ema_vars[param.name])
layers.assign(input=param, output=tmp)
+ # bias correction
+ ema = ema / (1.0 - decay_pow)
layers.assign(input=ema, output=param)

self.restore_program = Program()
@@ -2258,25 +2292,43 @@ def __init__(self, decay=0.999, zero_init=False, name=None):
param = block._clone_variable(param)
layers.assign(input=tmp, output=param)

- def _append_ema_ops(self, startup_block, param):
+ def _get_ema_decay(self):
+ with default_main_program()._lr_schedule_guard():
+ decay_var = layers.tensor.create_global_var(
+ shape=[1],
+ value=self._decay,
+ dtype='float32',
+ persistable=True,
+ name="scheduled_ema_decay_rate")

+ if self._thres_steps is not None:
+ decay_t = (self._thres_steps + 1.0) / (self._thres_steps + 10.0)
+ with layers.control_flow.Switch() as switch:
+ with switch.case(decay_t < self._decay):
+ layers.tensor.assign(decay_t, decay_var)
+ with switch.default():
+ layers.tensor.assign(
+ np.array(
+ [self._decay], dtype=np.float32),
+ decay_var)
+ return decay_var
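
The Switch block above amounts to an elementwise minimum between the warm-up rate and the configured decay. A minimal NumPy sketch of the resulting schedule (illustrative, outside Paddle):

    import numpy as np

    def scheduled_decay(thres_steps, decay=0.999):
        # min(decay, (1 + t) / (10 + t)): small at the start, capped at `decay`
        return np.minimum(decay, (1.0 + thres_steps) / (10.0 + thres_steps))

    steps = np.array([0.0, 10.0, 100.0, 1000.0, 10000.0])
    print(scheduled_decay(steps))
    # -> [0.1     0.55    0.9182  0.9911  0.999 ]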

+ def _get_decay_pow(self, block):
+ global_steps = layers.learning_rate_scheduler._decay_step_counter()
+ decay_var = block._clone_variable(self._decay_var)
+ decay_pow_acc = layers.elementwise_pow(decay_var, global_steps + 1)
+ return decay_pow_acc

+ def _append_ema_ops(self, param):
param_ema = layers.create_global_var(
name=unique_name.generate(self._name + param.name + '_ema'),
shape=param.shape,
value=0.0,
dtype=param.dtype,
persistable=True)
- # t = 0
- if self._zero_init is not True:
- startup_p_ema = startup_block._clone_variable(param_ema)
- startup_p = startup_block.var(param.name)
- startup_block.append_op(
- type="assign",
- inputs={"X": startup_p},
- outputs={"Out": startup_p_ema})
- # t > 0
- ema_t = param_ema * self._decay - param * (self._decay - 1)
- layers.assign(input=ema_t, output=param_ema)

+ ema_t = param_ema * self._decay_var + param * (1 - self._decay_var)
+ layers.assign(input=ema_t, output=param_ema)
return param_ema
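
Worth noting: the removed update line and the new one implement the same arithmetic; the substantive change is that the decay now comes from the scheduled `self._decay_var` tensor instead of the plain float. A quick sanity check in plain Python (illustrative values):

    ema, param, d = 0.5, 2.0, 0.999
    old_form = ema * d - param * (d - 1)   # the removed expression
    new_form = ema * d + param * (1 - d)   # the added expression
    assert old_form == new_form            # algebraically identical updates
    print(new_form)                        # -> 0.5015 (approximately)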

@signature_safe_contextmanager