
Commit 250e72d

wangxicodinggongweibao authored and committed
Fix DGC algorithm flow to make it the same as paper (#20758)
1 parent ba45dce commit 250e72d

14 files changed: +512 −42 lines
paddle/fluid/operators/optimizers/dgc_momentum_op.cc (new file)

+68

@@ -0,0 +1,68 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+
+#include "paddle/fluid/operators/optimizers/dgc_momentum_op.h"
+
+namespace paddle {
+namespace operators {
+
+class DGCMomentumOp : public MomentumOp {
+ public:
+  using MomentumOp::MomentumOp;
+
+ protected:
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx->HasInput("current_step"), true,
+                      "current_step should be set.");
+    return MomentumOp::InferShape(ctx);
+  }
+
+  framework::OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const framework::Tensor& tensor,
+      const framework::OpKernelType& expected_kernel_type) const override {
+    if (var_name == "current_step") {
+      VLOG(10) << "var_name:" << var_name << " need not to transform";
+      return expected_kernel_type;
+    }
+
+    return framework::OperatorWithKernel::GetKernelTypeForVar(
+        var_name, tensor, expected_kernel_type);
+  }
+};
+
+class DGCMomentumOpMaker : public MomentumOpMaker {
+ public:
+  void Make() override {
+    AddInput("current_step", "(Tensor) Current step.");
+    AddAttr<float>("rampup_begin_step",
+                   "(float, -1.0)"
+                   "The period when begin DGC.")
+        .SetDefault(-1.0);
+
+    return MomentumOpMaker::Make();
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(dgc_momentum, ops::DGCMomentumOp,
+                             ops::DGCMomentumOpMaker);
+
+REGISTER_OP_CPU_KERNEL(
+    dgc_momentum,
+    ops::DGCMomentumKernel<paddle::platform::CPUDeviceContext, float>);
paddle/fluid/operators/optimizers/dgc_momentum_op.cu (new file)

+20

@@ -0,0 +1,20 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/optimizers/dgc_momentum_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    dgc_momentum,
+    ops::DGCMomentumKernel<paddle::platform::CUDADeviceContext, float>);
paddle/fluid/operators/optimizers/dgc_momentum_op.h (new file)

+59

@@ -0,0 +1,59 @@
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <memory>
+
+#include "paddle/fluid/operators/optimizers/momentum_op.h"
+#include "paddle/fluid/operators/optimizers/sgd_op.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class DGCMomentumKernel : public framework::OpKernel<T> {
+ public:
+  DGCMomentumKernel()
+      : _momentum_op_kernel(new MomentumOpKernel<DeviceContext, T>()),
+        _sgd_op_kernel(new SGDOpKernel<DeviceContext, T>()) {}
+
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto rampup_begin_step = context.Attr<float>("rampup_begin_step");
+    if (static_cast<int>(rampup_begin_step) < 0) {
+      return;
+    }
+
+    auto current_step_tensor = context.Input<framework::Tensor>("current_step");
+    auto* current_step = current_step_tensor->data<T>();
+
+    VLOG(10) << "current_step:" << *current_step
+             << ", rampup_begin_step:" << rampup_begin_step;
+
+    if (static_cast<int>(*current_step) < static_cast<int>(rampup_begin_step)) {
+      VLOG(10) << " so use momentum optimizer";
+      return _momentum_op_kernel->Compute(context);
+    }
+
+    VLOG(10) << " so use sgd optimizer";
+    return _sgd_op_kernel->Compute(context);
+  }
+
+ private:
+  std::unique_ptr<MomentumOpKernel<DeviceContext, T>> _momentum_op_kernel;
+  std::unique_ptr<SGDOpKernel<DeviceContext, T>> _sgd_op_kernel;
+};
+
+}  // namespace operators
+}  // namespace paddle
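
Note: this kernel is the core of the flow fix. It reads the current_step input and the rampup_begin_step attribute, applies an ordinary momentum update while current_step is still below rampup_begin_step, and switches to a plain SGD update once DGC ramp-up has begun (gradient sparsification and momentum correction are expected to be handled by the separately appended dgc ops, as in the DGC paper). Below is a minimal Python sketch of that dispatch, for illustration only; the function and parameter names are placeholders, not part of the patch.

    # Illustration only: mirrors the branch in DGCMomentumKernel::Compute above.
    def dgc_momentum_update(current_step, rampup_begin_step,
                            momentum_update, sgd_update):
        if rampup_begin_step < 0:
            return None  # attribute left at its default: the op does nothing
        if current_step < rampup_begin_step:
            return momentum_update()  # warm-up steps: ordinary momentum
        return sgd_update()  # DGC steps: plain SGD on the already-corrected gradient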

paddle/fluid/operators/optimizers/momentum_op.cc

+28-30
@@ -37,36 +37,34 @@ class MomentumOpInferVarType : public framework::VarTypeInference {
   }
 };
 
-class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  void Make() override {
-    AddInput("Param",
-             "(Tensor, default Tensor<float>) "
-             "Input parameter that has to be updated");
-    AddInput("Grad",
-             "(Tensor, default Tensor<float>) "
-             "Input gradient of the parameter");
-    AddInput("Velocity",
-             "(Tensor, default Tensor<float>) "
-             "Input velocity (corresponding to the parameter) "
-             "that has to be updated");
-    AddInput("LearningRate",
-             "(Tensor, default Tensor<float>) "
-             "Input learning rate");
+void MomentumOpMaker::Make() {
+  AddInput("Param",
+           "(Tensor, default Tensor<float>) "
+           "Input parameter that has to be updated");
+  AddInput("Grad",
+           "(Tensor, default Tensor<float>) "
+           "Input gradient of the parameter");
+  AddInput("Velocity",
+           "(Tensor, default Tensor<float>) "
+           "Input velocity (corresponding to the parameter) "
+           "that has to be updated");
+  AddInput("LearningRate",
+           "(Tensor, default Tensor<float>) "
+           "Input learning rate");
 
-    AddOutput("ParamOut",
-              "(Tensor) This output is updated parameter. "
-              "It shared memory with Input(Param).");
-    AddOutput("VelocityOut",
-              "(Tensor) This output is updated velocity. "
-              "It shared memory with Input(Velocity).");
+  AddOutput("ParamOut",
+            "(Tensor) This output is updated parameter. "
+            "It shared memory with Input(Param).");
+  AddOutput("VelocityOut",
+            "(Tensor) This output is updated velocity. "
+            "It shared memory with Input(Velocity).");
 
-    AddAttr<float>("mu", "(float) Momentum coefficient");
-    AddAttr<bool>("use_nesterov",
-                  "(bool, default false) "
-                  "Use Nesterov Momentum")
-        .SetDefault(false);
-    AddComment(R"DOC(
+  AddAttr<float>("mu", "(float) Momentum coefficient");
+  AddAttr<bool>("use_nesterov",
+                "(bool, default false) "
+                "Use Nesterov Momentum")
+      .SetDefault(false);
+  AddComment(R"DOC(
 Momentum Optimizer.
 
 This optimizer has a flag for Nestrov Momentum.
@@ -81,8 +79,8 @@ else: \\
 $$
 
 )DOC");
-  }
-};
+}
+
 }  // namespace operators
 }  // namespace paddle
 
paddle/fluid/operators/optimizers/momentum_op.h

+5
@@ -29,6 +29,11 @@ using framework::SelectedRows;
 struct NoNesterov;
 struct UseNesterov;
 
+class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override;
+};
+
 class MomentumOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;

paddle/fluid/operators/optimizers/sgd_op.cc

+3-1
@@ -110,4 +110,6 @@ This operator implements one step of the stochastic gradient descent algorithm.
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(sgd, ops::SGDOp, ops::SGDOpMaker,
                   paddle::framework::EmptyGradOpMaker, ops::SGDOpInferVarType);
-REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel<float>, ops::SGDOpKernel<double>);
+REGISTER_OP_CPU_KERNEL(
+    sgd, ops::SGDOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::SGDOpKernel<paddle::platform::CPUDeviceContext, double>);

paddle/fluid/operators/optimizers/sgd_op.cu

+6-4
@@ -53,7 +53,8 @@ __global__ void SparseSGDFunctorKernel(const T* selected_rows,
 }  // namespace
 
 template <typename T>
-class SGDOpCUDAKernel : public framework::OpKernel<T> {
+class SGDOpKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
     const auto* param_var = ctx.InputVar("Param");
@@ -123,6 +124,7 @@ class SGDOpCUDAKernel : public framework::OpKernel<T> {
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(sgd, ops::SGDOpCUDAKernel<float>,
-                        ops::SGDOpCUDAKernel<double>,
-                        ops::SGDOpCUDAKernel<plat::float16>);
+REGISTER_OP_CUDA_KERNEL(
+    sgd, ops::SGDOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::SGDOpKernel<paddle::platform::CUDADeviceContext, double>,
+    ops::SGDOpKernel<paddle::platform::CUDADeviceContext, plat::float16>);

paddle/fluid/operators/optimizers/sgd_op.h

+8-1
@@ -21,8 +21,15 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-template <typename T>
+template <typename DeviceContext, typename T>
 class SGDOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override;
+};
+
+template <typename T>
+class SGDOpKernel<platform::CPUDeviceContext, T>
+    : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &ctx) const override {
     const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");

python/paddle/fluid/optimizer.py

+42-5
@@ -959,6 +959,47 @@ def __init__(self,
         super(DGCMomentumOptimizer, self).__init__(
             learning_rate, momentum, use_nesterov, regularization, name)
 
+    def _is_use_dgc(self, param_var, grad_var):
+        var_numel = abs(reduce(lambda x, y: x * y, param_var.shape))
+        if var_numel < 16384 or \
+           param_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
+           grad_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
+           param_var.dtype != core.VarDesc.VarType.FP32 :
+            return False
+        return True
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        if not self._is_use_dgc(param_and_grad[0], param_and_grad[1]):
+            return super(DGCMomentumOptimizer, self)._append_optimize_op(
+                block, param_and_grad)
+
+        velocity_acc = self._get_accumulator(self._velocity_acc_str,
+                                             param_and_grad[0])
+        # create the dgc momentum optimize op
+        dgc_momentum_op = block.append_op(
+            type="dgc_momentum",
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "Velocity": velocity_acc,
+                "LearningRate": self._create_param_lr(param_and_grad),
+                "current_step": self._global_step_var,
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "VelocityOut": velocity_acc
+            },
+            attrs={
+                "mu": self._momentum,
+                "use_nesterov": self._use_nesterov,
+                "rampup_begin_step": float(self._rampup_begin_step)
+            },
+            stop_gradient=True)
+
+        return dgc_momentum_op
+
     def _add_auto_increment_var(self, counter_name, begin, step=1):
         helper = LayerHelper('global_step_counter')
         counter, is_new_var = helper.create_or_get_global_variable(
@@ -997,11 +1038,7 @@ def _append_dgc_ops(self, param_and_grads):
             force_cpu=True)
 
         for param_var, grad_var in param_and_grads:
-            var_numel = abs(reduce(lambda x, y: x * y, param_var.shape))
-            if var_numel < 16384 or \
-                param_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
-                grad_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
-                param_var.dtype != core.VarDesc.VarType.FP32 :
+            if not self._is_use_dgc(param_var, grad_var):
                 continue
 
             u_var = tensor.create_global_var(
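
Note: on the Python side, _append_optimize_op now routes each parameter either to the regular momentum update or to the new dgc_momentum op, with the size/type filter factored out into _is_use_dgc (dense FP32 parameters with at least 16384 elements). Below is a hedged usage sketch, assuming the fluid 1.x layers API and a Paddle build with DGC support (WITH_DGC); it only shows how the optimizer is constructed and applied, since DGC itself takes effect in a multi-trainer run.

    import paddle.fluid as fluid

    # Sketch under the assumptions above.
    x = fluid.layers.data(name='x', shape=[1], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    pred = fluid.layers.fc(input=x, size=1)
    loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))

    opt = fluid.optimizer.DGCMomentumOptimizer(
        learning_rate=0.001,
        momentum=0.9,
        rampup_begin_step=2)  # steps 0-1: momentum update; later steps: SGD + DGC
    opt.minimize(loss)
    # Note: tiny parameters (< 16384 elements) fall back to plain momentum per _is_use_dgc.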

python/paddle/fluid/tests/unittests/CMakeLists.txt

+4
@@ -10,6 +10,8 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext)
 set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS})
 #remove distribute unittests.
 list(APPEND MIXED_DIST_TEST_OPS test_dgc_op)
+list(APPEND MIXED_DIST_TEST_OPS test_dgc_momentum_op)
+list(APPEND MIXED_DIST_TEST_OPS test_dgc_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_simple_dist_transpiler)
 list(APPEND MIXED_DIST_TEST_OPS test_listen_and_serv_op)
 list(APPEND MIXED_DIST_TEST_OPS test_nce_remote_table_op)
@@ -248,6 +250,8 @@ if(WITH_DISTRIBUTE)
     py_test_modules(test_nce_remote_table_op MODULES test_nce_remote_table_op ENVS ${dist_ENVS})
     if(WITH_DGC)
         py_test_modules(test_dgc_op MODULES test_dgc_op)
+        py_test_modules(test_dgc_momentum_op MODULES test_dgc_momentum_op)
+        py_test_modules(test_dgc_optimizer MODULES test_dgc_optimizer)
     endif()
     if(NOT APPLE)
         bash_test_modules(test_listen_and_serv_op MODULES test_listen_and_serv.sh)

python/paddle/fluid/tests/unittests/dist_mnist.py

+1-1
@@ -98,7 +98,7 @@ def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
             opt = fluid.optimizer.Momentum(learning_rate=self.lr, momentum=0.9)
         else:
             opt = fluid.optimizer.DGCMomentumOptimizer(
-                learning_rate=self.lr, momentum=0.9, rampup_begin_step=0)
+                learning_rate=self.lr, momentum=0.9, rampup_begin_step=2)
 
         # Reader
         train_reader = paddle.batch(
