@@ -959,6 +959,47 @@ def __init__(self,
         super(DGCMomentumOptimizer, self).__init__(
             learning_rate, momentum, use_nesterov, regularization, name)

+    def _is_use_dgc(self, param_var, grad_var):
+        var_numel = abs(reduce(lambda x, y: x * y, param_var.shape))
+        if var_numel < 16384 or \
+           param_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
+           grad_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
+           param_var.dtype != core.VarDesc.VarType.FP32:
+            return False
+        return True
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        if not self._is_use_dgc(param_and_grad[0], param_and_grad[1]):
+            return super(DGCMomentumOptimizer, self)._append_optimize_op(
+                block, param_and_grad)
+
+        velocity_acc = self._get_accumulator(self._velocity_acc_str,
+                                             param_and_grad[0])
+        # create the dgc momentum optimize op
+        dgc_momentum_op = block.append_op(
+            type="dgc_momentum",
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "Velocity": velocity_acc,
+                "LearningRate": self._create_param_lr(param_and_grad),
+                "current_step": self._global_step_var,
+            },
+            outputs={
+                "ParamOut": param_and_grad[0],
+                "VelocityOut": velocity_acc
+            },
+            attrs={
+                "mu": self._momentum,
+                "use_nesterov": self._use_nesterov,
+                "rampup_begin_step": float(self._rampup_begin_step)
+            },
+            stop_gradient=True)
+
+        return dgc_momentum_op
+
     def _add_auto_increment_var(self, counter_name, begin, step=1):
         helper = LayerHelper('global_step_counter')
         counter, is_new_var = helper.create_or_get_global_variable(
@@ -997,11 +1038,7 @@ def _append_dgc_ops(self, param_and_grads):
             force_cpu=True)

         for param_var, grad_var in param_and_grads:
-            var_numel = abs(reduce(lambda x, y: x * y, param_var.shape))
-            if var_numel < 16384 or \
-               param_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
-               grad_var.type == core.VarDesc.VarType.SELECTED_ROWS or \
-               param_var.dtype != core.VarDesc.VarType.FP32:
+            if not self._is_use_dgc(param_var, grad_var):
                 continue

             u_var = tensor.create_global_var(
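
For reference, the gating rule that the new _is_use_dgc helper centralizes can be read in isolation: DGC is applied only to dense FP32 parameters with at least 16384 elements, and everything else falls back to the plain momentum update. The sketch below restates that rule with plain Python values; the function and argument names are illustrative only and are not part of the Paddle API.

from functools import reduce

def uses_dgc(shape, is_selected_rows, is_fp32):
    # Mirrors _is_use_dgc: skip DGC for small, sparse (SELECTED_ROWS), or non-FP32 variables.
    numel = abs(reduce(lambda x, y: x * y, shape))
    if numel < 16384 or is_selected_rows or not is_fp32:
        return False
    return True

assert uses_dgc([1024, 1024], False, True)        # large dense FP32 param -> dgc_momentum op
assert not uses_dgc([128, 64], False, True)       # fewer than 16384 elements -> regular momentum
assert not uses_dgc([1024, 1024], True, True)     # SELECTED_ROWS (sparse) -> regular momentum
assert not uses_dgc([1024, 1024], False, False)   # non-FP32 dtype -> regular momentum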