 # See the License for the specific language governing permissions and
 # limitations under the License.
 """
-paddle.distributed.launch is a module that spawns multiple distributed
+fleetrun is a module that spawns multiple distributed
 process on each training node for gpu training and cpu training.
 Usage:
     In both of single node training or multiple node training, this module
...
        your_training_py (arg1 arg2 and all others)
 CPU training:
     1. for single node training with multi servers and workers:
-       fleetrun --server_num=1 --worker_num=4 your_training_py (arg1 arg2 and all others)
+       fleetrun --server_num=2 --worker_num=2 your_training_py (arg1 arg2 and all others)
     2. for multiple node training such as two node:192.168.0.16, 192.168.0.17 \
-       with 2 servers and 4 workers.
+       with 2 servers and 4 workers.
        on 192.168.0.16:
-           fleetrun --servers="192.168.0.16:6170,192.168.0.17:6171" \
-               --workers="192.168.0.16:6172,192.168.0.17:6173,192.168.0.16:6174,192.168.0.17:6175" \
+           fleetrun --servers="192.168.0.16:6170,192.168.0.17:6170" \
+               --workers="192.168.0.16,192.168.0.17,192.168.0.16,192.168.0.17" \
                your_training_py (arg1 arg2 and all others)
        on 192.168.0.17:
            fleetrun --servers="192.168.0.16:6170,192.168.0.17:6171" \
-               --workers="192.168.0.16:6172,192.168.0.17:6173,192.168.0.16:6174,192.168.0.17:6175" \
+               --workers="192.168.0.16,192.168.0.17,192.168.0.16,192.168.0.17" \
+               your_training_py (arg1 arg2 and all others)
+    3. use gloo backend for multiple node training such as two node:192.168.0.16, 192.168.0.17 \
+       with 2 servers and 4 workers. (workers should set port)
+       on 192.168.0.16:
+           fleetrun --servers="192.168.0.16:6170,192.168.0.17:6170" \
+               --workers="192.168.0.16:6171,192.168.0.17:6171,192.168.0.16:6172,192.168.0.17:6172" \
+               your_training_py (arg1 arg2 and all others)
+       on 192.168.0.17:
+           fleetrun --servers="192.168.0.16:6170,192.168.0.17:6170" \
+               --workers="192.168.0.16:6171,192.168.0.17:6171,192.168.0.16:6172,192.168.0.17:6172" \
                your_training_py (arg1 arg2 and all others)
 """
 
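The `--workers` examples above come in two forms: bare IPs, where the launcher picks the ports itself, and explicit `ip:port` pairs, which the gloo example requires. As a rough illustration only, the helper below (hypothetical, not part of fleetrun) shows the kind of parsing those two formats imply:

```python
# Hypothetical helper (not part of fleetrun): split a comma-separated
# endpoint list into (ip, port) tuples, with port=None when it is omitted.
def parse_endpoints(endpoints):
    parsed = []
    for ep in endpoints.split(","):
        parts = ep.strip().split(":")
        port = int(parts[1]) if len(parts) > 1 else None
        parsed.append((parts[0], port))
    return parsed

print(parse_endpoints("192.168.0.16,192.168.0.17"))
# [('192.168.0.16', None), ('192.168.0.17', None)]
print(parse_endpoints("192.168.0.16:6171,192.168.0.17:6171"))
# [('192.168.0.16', 6171), ('192.168.0.17', 6171)]
```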
@@ -215,6 +225,7 @@ def launch_collective(args):
 
 def launch_ps(args):
     ports = None
+    start_port = 6170
     if args.server_num:
         server_num = args.server_num
         ports = get_ports(server_num, 0)
@@ -240,11 +251,19 @@ def launch_ps(args):
     worker_endpoints_ips = [
         x.strip().split(":")[0] for x in worker_endpoints.split(",")
     ]
-    worker_endpoints_port = [
-        x.strip().split(":")[1] for x in worker_endpoints.split(",")
-    ]
     worker_num = len(worker_endpoints_ips)
     node_ips = list(set(server_endpoints_ips + worker_endpoints_ips))
+    worker_endpoints_len = [
+        len(x.strip().split(":")) for x in worker_endpoints.split(",")
+    ]
+    if 1 in worker_endpoints_len:
+        # if no port value in worker_endpoints, will set default port values.
+        worker_endpoints_port = range(start_port + server_num,
+                                      start_port + server_num + worker_num, 1)
+    else:
+        worker_endpoints_port = [
+            x.strip().split(":")[1] for x in worker_endpoints.split(",")
+        ]
 
     # local train
     if len(set(node_ips)) == 1:
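For reference, here is a standalone sketch of the defaulting rule the new branch implements, assuming the same start_port of 6170 as in the diff (the function name is hypothetical): when any worker endpoint is given without a port, every worker gets a consecutive default port starting right after the server ports.

```python
# Hypothetical standalone version of the port-defaulting branch above.
def default_worker_ports(worker_endpoints, server_num, start_port=6170):
    entries = [ep.strip().split(":") for ep in worker_endpoints.split(",")]
    if any(len(parts) == 1 for parts in entries):
        # no port given for at least one worker: hand out defaults
        worker_num = len(entries)
        return list(range(start_port + server_num,
                          start_port + server_num + worker_num))
    # ports were supplied explicitly, keep them
    return [int(parts[1]) for parts in entries]

# Two servers, four workers listed as bare IPs, as in example 2 above:
print(default_worker_ports(
    "192.168.0.16,192.168.0.17,192.168.0.16,192.168.0.17", server_num=2))
# [6172, 6173, 6174, 6175]
```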