PaddlePaddle · kuke · May 21, 2019 · May 19, 2019 · May 19, 2019 · May 20, 2019
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
@@ -516,6 +516,12 @@ paddle.fluid.optimizer.DGCMomentumOptimizer.apply_optimize (ArgSpec(args=['self'
 paddle.fluid.optimizer.DGCMomentumOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
 paddle.fluid.optimizer.DGCMomentumOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.DGCMomentumOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
+paddle.fluid.optimizer.LambOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'lamb_weight_decay', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.01, 0.9, 0.999, 1e-06, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.optimizer.LambOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', 'bfe7305918552aaecfdaa22411dbe871'))
+paddle.fluid.optimizer.LambOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
+paddle.fluid.optimizer.LambOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.LambOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.optimizer.LambOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '35fd5d3330c97903528c7e0dacc7f6ea'))
 paddle.fluid.backward.append_backward (ArgSpec(args=['loss', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '08a5dd9f6f376ff3d55e0b1d92115cbd'))
 paddle.fluid.regularizer.L1DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.regularizer.L2DecayRegularizer.__init__ (ArgSpec(args=['self', 'regularization_coeff'], varargs=None, keywords=None, defaults=(0.0,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))

diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc
@@ -18,67 +18,64 @@ namespace paddle {
 namespace operators {
 
 using Tensor = framework::Tensor;
-class AdamOp : public framework::OperatorWithKernel {
- public:
-  using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    PADDLE_ENFORCE(ctx->HasInput("Param"),
-                   "Input(Param) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Grad"),
-                   "Input(Grad) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Moment1"),
-                   "Input(Moment1) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Moment2"),
-                   "Input(Moment2) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
-                   "Input(LearningRate) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
-                   "Input(Beta1Pow) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"),
-                   "Input(Beta2Pow) of AdamOp should not be null.");
-
-    PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
-                   "Output(ParamOut) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"),
-                   "Output(Moment1Out) of AdamOp should not be null.");
-    PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"),
-                   "Output(Moment2Out) of AdamOp should not be null.");
-
-    auto lr_dims = ctx->GetInputDim("LearningRate");
-    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
-                      "Learning rate should have 1 dimension");
-    auto beta1_pow_dims = ctx->GetInputDim("Beta1Pow");
-    PADDLE_ENFORCE_EQ(framework::product(beta1_pow_dims), 1,
-                      "Beta1 power accumulator should have 1 dimension");
-    auto beta2_pow_dims = ctx->GetInputDim("Beta2Pow");
-    PADDLE_ENFORCE_EQ(framework::product(beta2_pow_dims), 1,
-                      "Beta2 power accumulator should have 1 dimension");
-
-    auto param_dims = ctx->GetInputDim("Param");
-    if (ctx->GetInputsVarType("Grad")[0] ==
-        framework::proto::VarType::LOD_TENSOR) {
-      PADDLE_ENFORCE_EQ(
-          param_dims, ctx->GetInputDim("Grad"),
-          "Param and Grad input of AdamOp should have same dimension");
-    }
-    PADDLE_ENFORCE_EQ(
-        param_dims, ctx->GetInputDim("Moment1"),
-        "Param and Moment1 input of AdamOp should have same dimension");
-    PADDLE_ENFORCE_EQ(
-        param_dims, ctx->GetInputDim("Moment2"),
-        "Param and Moment2 input of AdamOp should have same dimension");
 
-    ctx->SetOutputDim("ParamOut", param_dims);
-    ctx->SetOutputDim("Moment1Out", param_dims);
-    ctx->SetOutputDim("Moment2Out", param_dims);
-  }
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    auto input_data_type = ctx.Input<Tensor>("Param")->type();
-    return framework::OpKernelType(input_data_type, ctx.GetPlace());
+void AdamOp::InferShape(framework::InferShapeContext* ctx) const {
+  // Verifies that all inputs/outputs are present, that LearningRate, Beta1Pow
+  // and Beta2Pow each hold one element, and that Moment1, Moment2 (and Grad,
+  // when it is a LOD_TENSOR) match Param's dims; outputs get Param's dims.
+  PADDLE_ENFORCE(ctx->HasInput("Param"),
+                 "Input(Param) of AdamOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("Grad"),
+                 "Input(Grad) of AdamOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("Moment1"),
+                 "Input(Moment1) of AdamOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("Moment2"),
+                 "Input(Moment2) of AdamOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("LearningRate"),
+                 "Input(LearningRate) of AdamOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("Beta1Pow"),
+                 "Input(Beta1Pow) of AdamOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasInput("Beta2Pow"),
+                 "Input(Beta2Pow) of AdamOp should not be null.");
+  // Output slots must be present as well.
+  PADDLE_ENFORCE(ctx->HasOutput("ParamOut"),
+                 "Output(ParamOut) of AdamOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("Moment1Out"),
+                 "Output(Moment1Out) of AdamOp should not be null.");
+  PADDLE_ENFORCE(ctx->HasOutput("Moment2Out"),
+                 "Output(Moment2Out) of AdamOp should not be null.");
+  // product(dims) == 1 constrains the number of elements, not the rank.
+  PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("LearningRate")), 1,
+                    "Learning rate should have 1 element");
+  PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("Beta1Pow")), 1,
+                    "Beta1 power accumulator should have 1 element");
+  PADDLE_ENFORCE_EQ(framework::product(ctx->GetInputDim("Beta2Pow")), 1,
+                    "Beta2 power accumulator should have 1 element");
+  // Grad is compared against Param only when its var type is LOD_TENSOR.
+  auto param_dims = ctx->GetInputDim("Param");
+  if (ctx->GetInputsVarType("Grad")[0] ==
+      framework::proto::VarType::LOD_TENSOR) {
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Grad"),
+        "Param and Grad input of AdamOp should have same dimension");
   }
-};
+  PADDLE_ENFORCE_EQ(
+      param_dims, ctx->GetInputDim("Moment1"),
+      "Param and Moment1 input of AdamOp should have same dimension");
+  PADDLE_ENFORCE_EQ(
+      param_dims, ctx->GetInputDim("Moment2"),
+      "Param and Moment2 input of AdamOp should have same dimension");
+  // Every output takes Param's dims.
+  ctx->SetOutputDim("ParamOut", param_dims);
+  ctx->SetOutputDim("Moment1Out", param_dims);
+  ctx->SetOutputDim("Moment2Out", param_dims);
+}
+// Selects the kernel from the data type of "Param" and the context's place.
+framework::OpKernelType AdamOp::GetExpectedKernelType(
+    const framework::ExecutionContext& ctx) const {
+  const auto param_dtype = ctx.Input<framework::Tensor>("Param")->type();
+  return framework::OpKernelType(param_dtype, ctx.GetPlace());
+}
 
 class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
  public:

diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
@@ -29,6 +29,15 @@ namespace operators {
 
 namespace scatter = paddle::operators::math::scatter;
 
+class AdamOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  // Defined in adam_op.cc; lamb_op.cc also registers this class for "lamb".
+  void InferShape(framework::InferShapeContext* ctx) const override;
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override;
+};
+
 struct GPUAdam;
 struct CPUAdam;
 

diff --git a/paddle/fluid/operators/optimizers/lamb_op.cc b/paddle/fluid/operators/optimizers/lamb_op.cc
@@ -0,0 +1,95 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/optimizers/lamb_op.h"
+#include "paddle/fluid/operators/optimizers/adam_op.h"
+
+namespace paddle {
+namespace operators {
+// Describes the inputs, outputs and attributes of the "lamb" operator.
+class LambOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Param",
+             "(LoDTensor, default LoDTensor<float>) "
+             "Input parameter that has to be updated.");
+    AddInput("Grad",
+             "(LoDTensor, default LoDTensor<float>) "
+             "Input gradient of the parameter.");
+    AddInput("LearningRate", "(Tensor) Learning rate.");
+    AddInput("Moment1", "(Tensor) Input first moment.");
+    AddInput("Moment2", "(Tensor) Input second moment.");
+    AddInput("Beta1Pow", "(Tensor) Input beta1 power accumulator.");
+    AddInput("Beta2Pow", "(Tensor) Input beta2 power accumulator.");
+    // Outputs, then attributes ("weight_decay" is given no default).
+    AddOutput("ParamOut", "(Tensor) Output parameter.");
+    AddOutput("Moment1Out", "(Tensor) Output first moment.");
+    AddOutput("Moment2Out", "(Tensor) Output second moment.");
+    AddAttr<float>("weight_decay", "(float) Weight decay rate.");
+    AddAttr<float>("beta1",
+                   "(float, default 0.9) The exponential decay rate for the "
+                   "1st moment estimates.")
+        .SetDefault(0.9f);
+    AddAttr<float>("beta2",
+                   "(float, default 0.999) The exponential decay rate for the "
+                   "2nd moment estimates.")
+        .SetDefault(0.999f);
+    AddAttr<float>("epsilon",
+                   "(float, default 1.0e-6) "
+                   "Constant for numerical stability.")
+        .SetDefault(1.0e-6f);
+
+    AddComment(R"DOC(
+LAMB (Layer-wise Adaptive Moments optimizer for Batch training) Optimizer.
+
+LAMB Optimizer is designed to scale up the batch size of training without losing 
+accuracy, which supports adaptive element-wise updating and accurate layer-wise 
+correction. For more information, please refer to https://arxiv.org/abs/1904.00962.
+
+The updating of parameters follows:
+
+$$
+m_t^l &= \beta_1 m_{t - 1}^l + (1 - \beta_1)g_t^l \\
+
+v_t^l &= \beta_2 v_{t - 1}^l + (1 - \beta_2)g_t^l \odot g_t^l \\
+
+\widehat{m}_t^l &= m_t^l/(1 - \beta_1^t) \\
+
+\widehat{v}_t^l &= v_t^l/(1 - \beta_2^t) \\
+
+r_1 &= \left \| w_{t-1}^l \right \|_2 \\
+
+r_2 &= \left \|  \frac{\widehat{m}_t^l}{\sqrt{\widehat{v}_t^l}+\epsilon} + \lambda w_{t-1}^l \right \|_2 \\
+
+r &= r_1 / r_2 \\
+
+\eta^l &= r \times \eta \\
+
+w_t^l &= w_{t-1}^l -\eta ^l \times (\frac{\widehat{m}_t^l}{\sqrt{\widehat{v}_t^l}+\epsilon} + \lambda w_{t-1}^l)
+$$
+
+where $m$ is the 1st moment, and $v$ the 2nd moment, $\eta$ the 
+learning rate, $\lambda$ the weight decay rate.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+// "lamb" uses ops::AdamOp as its operator class; CPU kernels: float, double.
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(lamb, ops::AdamOp, ops::LambOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    lamb, ops::LambOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::LambOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/optimizers/lamb_op.cu b/paddle/fluid/operators/optimizers/lamb_op.cu
@@ -0,0 +1,20 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/optimizers/lamb_op.h"
+// Registers the CUDA kernels of the "lamb" op for float and double.
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(
+    lamb, ops::LambOpKernel<paddle::platform::CUDADeviceContext, float>,
+    ops::LambOpKernel<paddle::platform::CUDADeviceContext, double>);