
Commit f1a8f7a

Merge pull request #1227 from helinwang/k8s_aws

PaddlePaddle on AWS with Kubernetes tutorial now works.

2 parents: b1f09f2 + 17867fb

12 files changed: +331 −362 lines

doc/howto/usage/k8s/k8s_aws_en.md

+255 −304. Large diffs are not rendered by default.
Two binary files changed: −116 KB and −8.31 KB (not rendered).

doc/howto/usage/k8s/src/job.yaml

−43 lines. This file was deleted.
New file (7 lines): Dockerfile for the data preparation image.

```dockerfile
FROM alpine

RUN apk update && apk upgrade && apk add coreutils
ADD quick_start /quick_start
ADD get_data.sh /bin/
RUN chmod +x /bin/get_data.sh
ENTRYPOINT ["/bin/get_data.sh"]
```
New file (6 lines): build instructions for the data preparation image.

To build the PaddlePaddle data preparation image used in the tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run the following commands:

```
cp -r ../../../../../../demo/quick_start .
docker build . -t prepare-data-image-name
```
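The image's entry point, `get_data.sh` (shown below), reads `OUT_DIR` and `SPLIT_COUNT` from the environment, so the container is normally run with a shared volume mounted and both variables set. A minimal sketch, reusing the `prepare-data-image-name` tag from the build instructions; the `/efs` mount point and the output directory value are placeholders, not part of this commit:

```sh
# Hypothetical invocation; /efs and the OUT_DIR value are placeholders.
docker run --rm \
    -v /efs:/efs \
    -e OUT_DIR=/efs/paddle-cluster-job \
    -e SPLIT_COUNT=3 \
    prepare-data-image-name
```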
New file (26 lines): `get_data.sh`, the data preparation entry point.

```sh
#!/bin/sh

out_dir=$OUT_DIR
split_count=$SPLIT_COUNT

set -e

mkdir -p $out_dir
cp -r /quick_start $out_dir/

mkdir -p $out_dir/0/data
cd $out_dir/0/data
wget http://paddlepaddle.bj.bcebos.com/demo/quick_start_preprocessed_data/preprocessed_data.tar.gz
tar zxvf preprocessed_data.tar.gz
rm preprocessed_data.tar.gz

split -d --number=l/$split_count -a 5 train.txt train.
mv train.00000 train.txt

cd $out_dir
end=$(expr $split_count - 1)
for i in $(seq 1 $end); do
    mkdir -p $i/data
    cp -r 0/data/* $i/data
    mv $i/data/train.`printf %05d $i` $i/data/train.txt
done;
```
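The key step is the `split` invocation: `--number=l/$split_count` divides `train.txt` into `$split_count` roughly equal pieces without breaking lines, `-d -a 5` gives them five-digit numeric suffixes, and the loop then moves one shard into each trainer's `data` directory. A self-contained illustration of the same scheme (the file names here are examples, not part of the commit):

```sh
# Requires GNU coreutils split, which is presumably why the alpine image installs coreutils.
seq 1 9 > train.txt
split -d --number=l/3 -a 5 train.txt train.   # produces train.00000 train.00001 train.00002
mv train.00000 train.txt                      # shard 0 keeps the original file name
ls train.*                                    # train.00001 train.00002 train.txt
```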
New file (6 lines): Dockerfile for the training image.

```dockerfile
FROM paddledev/paddle:cpu-latest

COPY start.sh /root/
COPY start_paddle.py /root/
RUN chmod +x /root/start.sh
CMD ["bash"," -c","/root/start.sh"]
```
New file (5 lines): build instructions for the training image.

To build the PaddlePaddle training image used in the tutorial [Distributed PaddlePaddle Training on AWS with Kubernetes](../../k8s_aws_en.md), run the following command:

```
docker build . -t train-image-name
```
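Kubernetes nodes have to be able to pull this image, so it is normally re-tagged with a registry or Docker Hub account and pushed. A sketch, assuming a hypothetical account name `your-account` (not part of this commit):

```sh
# `your-account` is a placeholder Docker Hub account; substitute your own registry.
docker tag train-image-name your-account/train-image-name
docker push your-account/train-image-name
```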
start.sh (the training image entry point):

```diff
@@ -1,19 +1,19 @@
 #!/bin/sh
+
 set -eu
 
 jobconfig=${JOB_PATH}"/"${JOB_NAME}"/"${TRAIN_CONFIG_DIR}
 cd /root
-cp -rf $jobconfig .
-cd $TRAIN_CONFIG_DIR
-
+cp -rf $jobconfig/* .
 
 python /root/start_paddle.py \
     --dot_period=10 \
-    --ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM \
+    --ports_num=$CONF_PADDLE_PORTS_NUM \
+    --ports_num_for_sparse=$CONF_PADDLE_PORTS_NUM_SPARSE \
     --log_period=50 \
     --num_passes=10 \
-    --trainer_count=4 \
+    --trainer_count=$TRAINER_COUNT \
     --saving_period=1 \
     --local=0 \
-    --config=./trainer_config.py \
+    --config=trainer_config.lr.py \
     --use_gpu=0
```
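The rewritten start.sh no longer hard-codes the trainer count or the sparse-port count; everything comes from environment variables, which the Kubernetes job spec has to supply. A sketch of the variables read by start.sh and start_paddle.py in this commit; only the names come from the scripts, the values are placeholders:

```sh
# Placeholder values; only the variable names are taken from start.sh / start_paddle.py.
export JOB_NAME=paddle-cluster-job          # placeholder
export JOB_NAMESPACE=default                # placeholder
export JOB_PATH=/home/jobpath               # placeholder
export TRAIN_CONFIG_DIR=quick_start         # placeholder
export TRAINER_COUNT=4                      # the value that was previously hard-coded
export CONF_PADDLE_PORTS_NUM=2              # placeholder
export CONF_PADDLE_PORTS_NUM_SPARSE=2       # placeholder
export CONF_PADDLE_GRADIENT_NUM=3           # placeholder
```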

doc/howto/usage/k8s/src/start_paddle.py renamed to doc/howto/usage/k8s/src/k8s_train/start_paddle.py

+20 −9
```diff
@@ -23,7 +23,6 @@
 API = "/api/v1/namespaces/"
 JOBSELECTOR = "labelSelector=job-name="
 JOB_PATH = os.getenv("JOB_PATH") + "/" + os.getenv("JOB_NAME")
-JOB_PATH_DATA = JOB_PATH + "/data"
 JOB_PATH_OUTPUT = JOB_PATH + "/output"
 JOBNAME = os.getenv("JOB_NAME")
 NAMESPACE = os.getenv("JOB_NAMESPACE")
@@ -33,6 +32,8 @@
 PADDLE_PORTS_NUM_SPARSE = os.getenv("CONF_PADDLE_PORTS_NUM_SPARSE")
 PADDLE_SERVER_NUM = os.getenv("CONF_PADDLE_GRADIENT_NUM")
 
+tokenpath = '/var/run/secrets/kubernetes.io/serviceaccount/token'
+
 
 def refine_unknown_args(cmd_args):
     '''
@@ -64,6 +65,7 @@ def isPodAllRunning(podlist):
     for pod in podlist["items"]:
         if pod["status"]["phase"] == "Running":
             running += 1
+    print "waiting for pods running, require:", require, "running:", running
     if require == running:
         return True
     return False
@@ -79,8 +81,17 @@ def getPodList():
 
     pod = API + NAMESPACE + "/pods?"
     job = JOBNAME
-    return requests.get(apiserver + pod + JOBSELECTOR + job,
-                        verify=False).json()
+    if os.path.isfile(tokenpath):
+        tokenfile = open(tokenpath, mode='r')
+        token = tokenfile.read()
+        Bearer = "Bearer " + token
+        headers = {"Authorization": Bearer}
+        return requests.get(apiserver + pod + JOBSELECTOR + job,
+                            headers=headers,
+                            verify=False).json()
+    else:
+        return requests.get(apiserver + pod + JOBSELECTOR + job,
+                            verify=False).json()
 
 
 def getIdMap(podlist):
```
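The new branch in `getPodList()` reads the pod's service-account token from the standard mount path and sends it as a bearer token, which lets the script query the API server on clusters where anonymous requests are rejected. The same request can be reproduced from inside a pod with curl; a sketch in which the namespace, job name, and API server address are placeholders, and only the token path and query shape come from the script:

```sh
# Namespace, job name, and API server address are placeholders; -k mirrors verify=False.
TOKEN=$(cat /var/run/secrets/kubernetes.io/serviceaccount/token)
curl -k -H "Authorization: Bearer $TOKEN" \
    "https://kubernetes.default.svc/api/v1/namespaces/default/pods?labelSelector=job-name=paddle-cluster-job"
```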
The remaining hunks adjust the per-trainer data copy, the wait intervals, and the trainer log handling:

```diff
@@ -122,8 +133,8 @@ def startPaddle(idMap={}, train_args_dict=None):
     if not os.path.exists(JOB_PATH_OUTPUT):
         os.makedirs(JOB_PATH_OUTPUT)
     os.mkdir(logDir)
-    copyCommand = 'cp -rf ' + JOB_PATH_DATA + \
-        "/" + str(trainerId) + " ./data"
+    copyCommand = 'cp -rf ' + JOB_PATH + \
+        "/" + str(trainerId) + "/data/*" + " ./data/"
     os.system(copyCommand)
     startPserver = 'nohup paddle pserver' + \
         " --port=" + str(PADDLE_PORT) + \
@@ -136,9 +147,9 @@ def startPaddle(idMap={}, train_args_dict=None):
     print startPserver
     os.system(startPserver)
     # wait until pservers completely start
-    time.sleep(10)
-    startTrainer = program + args + " > " + \
-        logDir + "/train.log 2>&1 < /dev/null"
+    time.sleep(20)
+    startTrainer = program + args + " 2>&1 | tee " + \
+        logDir + "/train.log"
     print startTrainer
     os.system(startTrainer)
 
@@ -152,7 +163,7 @@ def startPaddle(idMap={}, train_args_dict=None):
     podlist = getPodList()
     # need to wait until all pods are running
     while not isPodAllRunning(podlist):
-        time.sleep(10)
+        time.sleep(20)
         podlist = getPodList()
     idMap = getIdMap(podlist)
     startPaddle(idMap, train_args_dict)
```
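`isPodAllRunning()` keeps polling until every pod of the job reports phase `Running`, and the new print statement makes that wait visible in the log. The same condition can be checked by hand with kubectl; the job name below is a placeholder:

```sh
# Placeholder job name; lists each pod of the job with its phase.
kubectl get pods -l job-name=paddle-cluster-job \
    -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.phase}{"\n"}{end}'
```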
