|
17 | 17 |
|
18 | 18 | from .. import core
|
19 | 19 | from . import layers
|
| 20 | +from . import parallel_helper |
20 | 21 | from .. import framework
|
21 |
| - |
22 | 22 | from ..layers import collective
|
23 | 23 | from . import to_variable
|
24 | 24 |
|
# Names exported when this module is star-imported.
__all__ = ["prepare_context"]

# Module-level alias for the strategy type provided by ``core``.
ParallelStrategy = core.ParallelStrategy
|
28 | 28 |
|
29 |
| -__parallel_ctx__clz__ = None |
30 |
| - |
31 | 29 |
|
32 |
def prepare_context(strategy=None):
    """Set up the parallel context used for multi-process dygraph training.

    Args:
        strategy(ParallelStrategy, optional): The parallel configuration
            (``nranks``, ``local_rank``, ``trainer_endpoints``,
            ``current_endpoint``). When ``None``, a new ``ParallelStrategy``
            is created and filled from the values reported by ``Env()``.

    Returns:
        ParallelStrategy: The strategy in effect. It is returned in every
        case, including when ``nranks < 2`` and no parallel context is
        created, so the result can always be used by the caller.

    Raises:
        AssertionError: If more than one rank is requested outside dygraph
            mode, or when no expected place is available.
        NotImplementedError: If the expected place is not a ``CUDAPlace``.
    """
    if strategy is None:
        # Build the environment view once instead of once per attribute.
        env = Env()
        strategy = ParallelStrategy()
        strategy.nranks = env.nranks
        strategy.local_rank = env.local_rank
        strategy.trainer_endpoints = env.trainer_endpoints
        strategy.current_endpoint = env.current_endpoint

    # A single rank needs no communication context; still hand the strategy
    # back so callers do not receive None on this path only.
    if strategy.nranks < 2:
        return strategy

    assert framework.in_dygraph_mode() is True, \
        "dygraph.parallel.prepare_context should be used with dygrahp mode."
    place = framework._current_expected_place()
    assert place is not None, \
        "dygraph.parallel.prepare_context should be used in fluid.dygraph.guard(place) guard."

    if not isinstance(place, core.CUDAPlace):
        # TODO(Yancey1989): add Gloo Parallel Context to support CPU parallel computation
        # The previous `assert ("...")` tested a non-empty string, which is
        # always true, so unsupported places slipped through to the init
        # call below without any context having been set.
        raise NotImplementedError("Only support CUDAPlace for now.")

    parallel_helper._set_parallel_ctx(
        core.NCCLParallelContext(strategy, place))
    parallel_helper._init_parallel_ctx()
    return strategy
47 | 52 |
|
48 | 53 |
|
49 | 54 | class Env(object):
|
@@ -77,33 +82,108 @@ def trainer_endpoints(self):
|
77 | 82 |
|
78 | 83 |
|
class DataParallel(layers.Layer):
    """
    Runs the module with data parallelism.

    Currently, DataParallel only supports running the dynamic graph
    with multiple processes. The usage is:
    `python -m paddle.distributed.launch --gpus 2 dynamic_graph_test.py`.
    And the content of `dynamic_graph_test.py` is the code of the example.

    Examples:
        .. code-block:: python

           import numpy as np
           import paddle.fluid as fluid
           import paddle.fluid.dygraph as dygraph
           from paddle.fluid.dygraph.nn import FC
           from paddle.fluid.dygraph.base import to_variable

           place = fluid.CUDAPlace(0)
           with fluid.dygraph.guard(place=place):

               # prepare the data parallel context
               strategy = dygraph.parallel.prepare_context()

               fc_layer = FC("FC", 10, act="softmax")
               adam = fluid.optimizer.AdamOptimizer()

               # make the module become the data parallelism module
               fc_layer = dygraph.parallel.DataParallel(fc_layer, strategy)

               x_data = np.random.random(size=[10, 1]).astype(np.float32)
               data = to_variable(x_data)

               hidden = fc_layer(data)
               avg_loss = fluid.layers.mean(hidden)

               # scale the loss according to the number of trainers.
               avg_loss = fc_layer.scale_loss(avg_loss)

               avg_loss.backward()

               # collect the gradients of trainers.
               fc_layer.apply_collective_grads()

               adam.minimize(avg_loss)
               fc_layer.clear_gradients()

    Args:
        layers(Layer): The module that should be executed by data parallel.
        strategy(ParallelStrategy): The strategy of data parallelism. A
            value of ``None`` is treated as "not in data parallel mode".

    Returns:
        Layer: The data paralleled module.
    """

    def __init__(self, layers, strategy):
        super(DataParallel,
              self).__init__(layers.full_name() + "_data_parallel")

        self._layers = layers
        self._strategy = strategy

    def forward(self, *inputs, **kwargs):
        """Forward all positional and keyword inputs to the wrapped layers."""
        return self._layers(*inputs, **kwargs)

    def scale_loss(self, loss):
        """
        Scale the loss. In data parallel mode, the loss is divided by
        the number of trainers. If not in data parallel mode, the loss is
        returned unchanged.

        Args:
            loss: The loss of the current model.

        Returns:
            The scaled loss, or ``loss`` itself when not in data parallel mode.
        """
        if not self._is_data_parallel_mode():
            return loss

        loss_scale = to_variable(
            np.array([self._strategy.nranks]).astype("float32"))
        # The divisor is a constant; it must not take part in backward.
        loss_scale.stop_gradient = True
        loss = loss / loss_scale
        return loss

    def apply_collective_grads(self):
        """
        AllReduce the gradients of the wrapped layers' trainable parameters.

        Does nothing when not in data parallel mode.
        """
        if not self._is_data_parallel_mode():
            return

        for param in self._layers.parameters():
            # NOTE(zcd): The grad_ivar may not have been generated.
            if param.trainable and param._ivar._grad_ivar():
                g_var = framework.Variable(
                    block=self._helper.main_program.current_block(),
                    name=param._ivar._grad_name(),
                    stop_gradient=True,
                    ivar=param._ivar._grad_ivar())
                collective._allreduce(g_var, g_var, sync_mode=True)

    def _is_data_parallel_mode(self):
        """Return True only when a strategy is set and it has more than one rank."""
        # Guard against a missing strategy so that single-process use does
        # not fail with AttributeError on ``None.nranks``.
        return self._strategy is not None and self._strategy.nranks > 1
0 commit comments