Skip to content

Commit 34eb27a

Browse files
authored
ps worker-ports are optional for users for fleetrun command; test=develop (#26090)
1 parent 615e8a2 commit 34eb27a

File tree

2 files changed

+37
-10
lines changed

2 files changed

+37
-10
lines changed

python/paddle/fleet/launch.py

+28-9
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414
"""
15-
paddle.distributed.launch is a module that spawns multiple distributed
15+
fleetrun is a module that spawns multiple distributed
1616
processes on each training node for GPU training and CPU training.
1717
Usage:
1818
In both of single node training or multiple node training, this module
@@ -31,16 +31,26 @@
3131
your_training_py (arg1 arg2 and all others)
3232
CPU training:
3333
1. for single node training with multi servers and workers:
34-
fleetrun --server_num=1 --worker_num=4 your_training_py (arg1 arg2 and all others)
34+
fleetrun --server_num=2 --worker_num=2 your_training_py (arg1 arg2 and all others)
3535
2. for multiple node training such as two nodes: 192.168.0.16, 192.168.0.17 \
36-
with 2 servers and 4 workers.
36+
with 2 servers and 4 workers.
3737
on 192.168.0.16:
38-
fleetrun --servers="192.168.0.16:6170,192.168.0.17:6171" \
39-
--workers="192.168.0.16:6172,192.168.0.17:6173,192.168.0.16:6174,192.168.0.17:6175" \
38+
fleetrun --servers="192.168.0.16:6170,192.168.0.17:6170" \
39+
--workers="192.168.0.16,192.168.0.17,192.168.0.16,192.168.0.17" \
4040
your_training_py (arg1 arg2 and all others)
4141
on 192.168.0.17:
4242
fleetrun --servers="192.168.0.16:6170,192.168.0.17:6170" \
43-
--workers="192.168.0.16:6172,192.168.0.17:6173,192.168.0.16:6174,192.168.0.17:6175" \
43+
--workers="192.168.0.16,192.168.0.17,192.168.0.16,192.168.0.17" \
44+
your_training_py (arg1 arg2 and all others)
45+
3. use gloo backend for multiple node training such as two nodes: 192.168.0.16, 192.168.0.17 \
46+
with 2 servers and 4 workers (every worker endpoint must include a port).
47+
on 192.168.0.16:
48+
fleetrun --servers="192.168.0.16:6170,192.168.0.17:6170" \
49+
--workers="192.168.0.16:6171,192.168.0.17:6171,192.168.0.16:6172,192.168.0.17:6172" \
50+
your_training_py (arg1 arg2 and all others)
51+
on 192.168.0.17:
52+
fleetrun --servers="192.168.0.16:6170,192.168.0.17:6170" \
53+
--workers="192.168.0.16:6171,192.168.0.17:6171,192.168.0.16:6172,192.168.0.17:6172" \
4454
your_training_py (arg1 arg2 and all others)
4555
"""
4656

@@ -215,6 +225,7 @@ def launch_collective(args):
215225

216226
def launch_ps(args):
217227
ports = None
228+
start_port = 6170
218229
if args.server_num:
219230
server_num = args.server_num
220231
ports = get_ports(server_num, 0)
@@ -240,11 +251,19 @@ def launch_ps(args):
240251
worker_endpoints_ips = [
241252
x.strip().split(":")[0] for x in worker_endpoints.split(",")
242253
]
243-
worker_endpoints_port = [
244-
x.strip().split(":")[1] for x in worker_endpoints.split(",")
245-
]
246254
worker_num = len(worker_endpoints_ips)
247255
node_ips = list(set(server_endpoints_ips + worker_endpoints_ips))
256+
worker_endpoints_len = [
257+
len(x.strip().split(":")) for x in worker_endpoints.split(",")
258+
]
259+
if 1 in worker_endpoints_len:
260+
# If any worker endpoint omits its port, assign default ports to all workers, counting up from start_port + server_num.
261+
worker_endpoints_port = range(start_port + server_num,
262+
start_port + server_num + worker_num, 1)
263+
else:
264+
worker_endpoints_port = [
265+
x.strip().split(":")[1] for x in worker_endpoints.split(",")
266+
]
248267

249268
# local train
250269
if len(set(node_ips)) == 1:

python/paddle/fluid/tests/unittests/test_fleet_launch.sh

+9-1
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,15 @@ function test_launch_ps(){
1111
exit -1
1212
fi
1313

14-
fleetrun --servers="120.0.0.1:6780,120.0.0.1:6781" --workers="120.0.0.1:6782,120.0.0.1:6783" fleet_ps_training.py 2> ut.elog
14+
fleetrun --servers="127.0.0.1:6780,127.0.0.1:6781" --workers="127.0.0.1:6782,127.0.0.1:6783" fleet_ps_training.py 2> ut.elog
15+
if grep -q "server are killed" ut.elog; then
16+
echo "test pserver launch succeed"
17+
else
18+
echo "test pserver launch failed"
19+
exit -1
20+
fi
21+
22+
fleetrun --servers="127.0.0.1:6780,127.0.0.1:6781" --workers="127.0.0.1,127.0.0.1" fleet_ps_training.py 2> ut.elog
1523
if grep -q "server are killed" ut.elog; then
1624
echo "test pserver launch succeed"
1725
else

0 commit comments

Comments
 (0)