Commit 1aedf4b

Author: chengduozh

Commit message:
    code refine
    test=develop

Parent: 2b3b07c

File tree

5 files changed: +53 -22 lines


paddle/fluid/operators/distributed_ops/allreduce_op.h

Lines changed: 0 additions & 3 deletions
@@ -40,7 +40,6 @@ class AllReduceOpKernel : public framework::OpKernel<T> {
     auto in = ctx.Input<framework::Tensor>("X");
     auto out = ctx.Output<framework::Tensor>("Out");
 
-    int in_dev_id = boost::get<platform::CUDAPlace>(in->place()).device;
     int dtype = platform::ToNCCLDataType(in->type());
     int64_t numel = in->numel();
     auto* sendbuff = in->data<void>();
@@ -68,8 +67,6 @@ class AllReduceOpKernel : public framework::OpKernel<T> {
         red_type = ncclMin;
         break;
     }
-    VLOG(3) << "AllReduce " << ctx.Inputs("X")[0] << " On " << in_dev_id;
-
     PADDLE_ENFORCE(platform::dynload::ncclAllReduce(
         sendbuff, recvbuff, numel, static_cast<ncclDataType_t>(dtype), red_type,
         comm, stream));

paddle/fluid/platform/nccl_helper.h

Lines changed: 1 addition & 1 deletion
@@ -104,7 +104,7 @@ struct NCCLContextMap {
     PADDLE_ENFORCE_EQ(
         order_.size(), contexts_.size(),
         "NCCL Context Map does not support contain two or more same device");
-    // NOTE(paddle-dev): Why use std::unique_ptr and the T is ncclComm_t[] here?
+
     std::unique_ptr<ncclComm_t[]> comms(new ncclComm_t[order_.size()]);
     // if num_trainers == 1, should create a new nccl id for local comms.
     if (num_trainers == 1 && nccl_id == nullptr) {

python/paddle/fluid/dygraph/layers.py

Lines changed: 1 addition & 1 deletion
@@ -156,7 +156,7 @@ def __call__(self, *inputs):
         if not self._built:
             self.build_once(*inputs)
             if parallel_helper._is_data_parallel_mode():
-                parallel_helper._broadcast_parameters(self.parameters())
+                parallel_helper._broadcast_parameters(self._parameters.values())
 
         outputs = self.forward(*inputs)
         self._built = True

python/paddle/fluid/dygraph/parallel.py

Lines changed: 49 additions & 16 deletions
@@ -52,9 +52,6 @@ def prepare_context(strategy=None):
 
 
 class Env(object):
-    """
-    """
-
     def __init__(self):
         self._nranks = int(os.getenv("PADDLE_TRAINERS_NUM", "1"))
         self._local_rank = int(os.getenv("PADDLE_TRAINER_ID", "0"))
@@ -86,25 +83,58 @@ def trainer_endpoints(self):
 
 class DataParallel(layers.Layer):
     """
-    DataParallel.
+    Runs the module with data parallelism.
+
+    Currently, DataParallel only supports to run the dynamic graph
+    with multi-process. The usage is:
+    `python -m paddle.distributed.launch --gpus 2 dynamic_graph_test.py`.
+    And the content of `dynamic_graph_test.py` is the code of examples.
 
     Examples:
         .. code-block:: python
 
-           import paddle.fluid as fluid
-           import numpy
-           import os
-           ...
+           import numpy as np
+           import paddle.fluid as fluid
+           import paddle.fluid.dygraph as dygraph
+           from paddle.fluid.optimizer import AdamOptimizer
+           from paddle.fluid.dygraph.nn import FC
+           from paddle.fluid.dygraph.base import to_variable
+
+           place = fluid.CUDAPlace(0)
+           with fluid.dygraph.guard(place=place):
+
+               # prepare the data parallel context
+               strategy=dygraph.parallel.prepare_context()
+
+               fc_layer = FC("FC", 10, act="softmax")
+               adam = fluid.optimizer.AdamOptimizer()
+
+               # make the module become the data parallelism module
+               fc_layer = dygraph.parallel.DataParallel(fc_layer, strategy)
+
+               x_data = np.random.random(size=[10, 1]).astype(np.float32)
+               data = to_variable(x_data)
+
+               hidden = fc_layer(data)
+               avg_loss = fluid.layers.mean(hidden)
+
+               # scale the loss according to the number of trainers.
+               avg_loss = fc_layer.scale_loss(avg_loss)
+
+               avg_loss.backward()
+
+               # collect the gradients of trainers.
+               fc_layer.apply_collective_grads()
+
+               adam.minimize(avg_loss)
+               fc_layer.clear_gradients()
 
     Args:
-        layers(Layer): The layer.Layer.
-        strategy(ParallelStrategy): The dygraph.parallel.ParallelStrategy.
+        layers(Layer): The module that should be executed by data parallel.
+        strategy(ParallelStrategy): The strategy of data parallelism.
 
     Returns:
-        Layer: The layer.Layer..
-
-    Raises:
-        TypeError: If share_vars_from is provided, but not ParallelExecutor object.
+        Layer: The data paralleled module.
     """
 
     def __init__(self, layers, strategy):
@@ -119,12 +149,15 @@ def forward(self, *inputs, **kwargs):
 
     def scale_loss(self, loss):
         """
+        Scale the loss. In data parallel mode, the loss should be scale with
+        the number of trainers. If not in data parallel mode, return the loss
+        directly.
 
         Args:
-            loss(Layer): The layer.Layer.
+            loss(Layer): The loss of the current Model.
 
         Returns:
-            Layer: The layer.Layer.
+            Layer: the scaled loss.
         """
         if not self._is_data_parallel_mode():
             return loss
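
Note on the new `scale_loss` docstring above: scaling the loss by the trainer count is what keeps gradients accumulated across processes comparable to single-process training. A minimal, self-contained sketch of that pattern (plain Python, not Paddle's actual implementation; `nranks` stands in for the trainer count carried by the parallel strategy):

    def scale_loss_sketch(loss_value, nranks):
        # In data parallel mode each trainer computes a loss on its own data
        # shard; dividing by the trainer count keeps the combined gradient
        # magnitude in line with single-process training.
        if nranks <= 1:  # not in data parallel mode: return the loss unchanged
            return loss_value
        return loss_value / nranks

    print(scale_loss_sketch(2.0, 4))  # 0.5
    print(scale_loss_sketch(2.0, 1))  # 2.0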

python/paddle/fluid/dygraph/parallel_helper.py

Lines changed: 2 additions & 1 deletion
@@ -39,4 +39,5 @@ def _init_parallel_ctx():
 
 def _broadcast_parameters(parameters):
     for param in parameters:
-        collective._broadcast(param, 0, sync_mode=True)
+        if param.trainable:
+            collective._broadcast(param, 0, sync_mode=True)
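
Note on the `parallel_helper.py` change above: only parameters whose `trainable` flag is set are broadcast from rank 0. A self-contained sketch of that filtering pattern, using a stand-in parameter type and a fake broadcast function in place of Paddle's `collective._broadcast` (all names here are illustrative only):

    from collections import namedtuple

    # Stand-in for a dygraph parameter; only the `trainable` flag matters here.
    Param = namedtuple("Param", ["name", "trainable"])

    def broadcast_trainable(parameters, broadcast_fn, root=0):
        # Mirrors _broadcast_parameters: trainable parameters are synchronized
        # from the root rank; frozen (non-trainable) parameters are skipped.
        for param in parameters:
            if param.trainable:
                broadcast_fn(param, root)

    params = [Param("fc_0.w_0", True), Param("frozen_emb.w_0", False)]
    broadcast_trainable(params, lambda p, root: print("broadcast", p.name, "from rank", root))
    # Only "fc_0.w_0" is broadcast.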
