Skip to content

Commit bda8b17

Browse files
authored
Merge pull request #251 from PaddlePaddle/release/0.1.1-beta.3
Release/0.1.1 beta.3
2 parents c7cc7c8 + 98d0abb commit bda8b17

28 files changed

+324
-58
lines changed

.travis.yml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,11 +8,11 @@ matrix:
88
- go get -u github.com/golang/lint/golint
99
- curl https://glide.sh/get | bash
1010
- sudo pip install pre-commit
11-
script:
12-
- |
11+
script:
12+
- |
1313
bash .tools/check_style.sh
1414
RESULT=$?; if [ $RESULT -eq 0 ]; then true; else false; fi;
15-
- bash .tools/gen_config.sh && cd go && glide install && go test $(glide novendor)
15+
- cd go && bash .tools/gen_config.sh && glide install && go test $(glide novendor)
1616
- language: python
1717
python: 2.7
1818
sudo: required

doc/tutorial_cn.md

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -76,9 +76,12 @@ cd ..
7676
paddlecloud submit -jobname fit-a-line -cpu 1 -gpu 1 -parallelism 1 -entry "python train.py" fit_a_line/
7777
```
7878

79-
可以看到在提交任务的时候,我们指定了任务的名称`-jobname fit-a-line`、使用的CPU资源`-cpu 1`
80-
使用的GPU资源`-gpu 1`、并行度`-parallelism 1`(训练节点个数),启动命令`-entry "python train.py"`
81-
和任务程序目录`fit_a_line/`
79+
可以看到在提交任务的时候,我们指定了以下参数:
80+
- `-jobname fit-a-line`, 任务名称
81+
- `-cpu 1`, 使用的CPU资源
82+
- `-parallelism 1`, 并行度(训练节点个数)
83+
- `-entry "python train.py"`, 启动命令
84+
- `fit_a_line` 任务程序目录
8285

8386
***说明1:*** 如果希望查看完整的任务提交参数说明,可以执行`paddlecloud submit -h`
8487

@@ -148,3 +151,6 @@ paddlecloud file get /pfs/dlnel/home/wuyi05@baidu.com/jobs/fit_a_line/output/pas
148151
```bash
149152
paddlecloud kill fit-a-line
150153
```
154+
155+
---
156+
详细使用文档见:[中文使用文档](./usage_cn.md)

doc/usage_cn.md

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,62 @@ scp -r my_training_data_dir/ user@tunnel-server:/mnt/hdfs_mulan/idl/idl-dl/mydir
6868

6969
在训练任务提交后,每个训练节点会把HDFS挂载在`/pfs/[datacenter_name]/home/[username]/`目录下,这样训练程序即可使用这个路径读取训练数据并开始训练。
7070

71+
### 使用[RecordIO](https://github.com/PaddlePaddle/recordio)对训练数据进行预处理
72+
用户需要在本地将数据预先处理为RecordIO的格式,再上传至集群进行训练。
73+
- 使用RecordIO库进行数据预处理
74+
```python
75+
import paddle.v2.dataset as dataset
76+
dataset.convert(output_path = "./dataset",
77+
reader = dataset.uci_housing.train(),
78+
num_shards = 10,
79+
name_prefix = "uci_housing_train")
80+
```
81+
- `output_path` 输出路径
82+
- `reader` 用户自定义的[reader](https://github.com/PaddlePaddle/Paddle/tree/develop/doc/design/reader),实现方法可以参考[paddle.v2.dataset.uci_housing.train()](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/dataset/uci_housing.py#L74)
83+
- `num_shards` 生成的文件数量
84+
- `name_prefix` 生成的文件名前缀
85+
86+
执行成功后会在本地生成如下文件:
87+
```bash
88+
.
89+
./dataset
90+
./dataset/uci_housing_train-00000-of-00009
91+
./dataset/uci_housing_train-00001-of-00009
92+
./dataset/uci_housing_train-00002-of-00009
93+
./dataset/uci_housing_train-00003-of-00009
94+
...
95+
```
96+
97+
- 编写reader来读取RecordIO格式的文件
98+
```python
99+
import cPickle as pickle
100+
import recordio
101+
import glob
102+
import sys
103+
def recordio_reader(filepath, parallelism, trainer_id):
104+
# sample filepath as "/pfs/dlnel/home/yanxu05@baidu.com/dataset/uci_housing/uci_housing_train*"
105+
def reader():
106+
if trainer_id >= parallelism:
107+
sys.stdout.write("invalid trainer_id: %d\n" % trainer_id)
108+
return
109+
files = glob.glob(filepath)
110+
files.sort()
111+
my_file_list = []
112+
for idx, f in enumerate(files):
113+
if idx % parallelism == trainer_id:
114+
my_file_list.append(f)
115+
116+
for fn in my_file_list:
117+
r = recordio.reader(fn)
118+
while True:
119+
d = r.read()
120+
if not d:
121+
break
122+
yield pickle.loads(d)
123+
124+
return reader
125+
```
126+
71127
### 使用paddlecloud上传训练数据
72128

73129
paddlecloud命令集成了上传数据的功能,目前仅针对存储系统是CephFS的环境。如果希望上传,执行:

docker/k8s_tools.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -31,8 +31,8 @@ def wait_pods_running(label_selector, desired):
3131
print "running pod list: ", running_pod_list
3232
if len(running_pod_list) == int(desired):
3333
return [item[1] for item in running_pod_list]
34-
print "sleep for 10 seconds..."
35-
time.sleep(10)
34+
print "sleep for 5 seconds..."
35+
time.sleep(5)
3636

3737

3838
def fetch_pserver_ips():
@@ -42,12 +42,15 @@ def fetch_pserver_ips():
4242
return ",".join(pserver_ips)
4343

4444
def fetch_master_ip():
45-
label_selector = "paddle-job-master=%s" % PADDLE_JOB_NAME
46-
pod_list = fetch_pods_info(label_selector)
47-
master_ip = ""
48-
if len(pod_list) >=1:
49-
master_ip = pod_list[0][1]
50-
return master_ip
45+
while True:
46+
label_selector = "paddle-job-master=%s" % PADDLE_JOB_NAME
47+
pod_list = fetch_pods_info(label_selector)
48+
master_ip = ""
49+
if len(pod_list) >=1:
50+
master_ip = pod_list[0][1]
51+
if master_ip:
52+
return master_ip
53+
time.sleep(5)
5154

5255
def fetch_trainer_id():
5356
label_selector = "paddle-job=%s" % PADDLE_JOB_NAME

docker/pfs/build.sh

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
cat > ./Dockerfile << EOF
2+
FROM ubuntu:16.04
3+
4+
RUN apt-get update && \
5+
apt-get install -y wget git && \
6+
wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
7+
tar -C /usr/local -xzf go.tgz && \
8+
mkdir /root/gopath && \
9+
rm go.tgz
10+
11+
ENV GOROOT=/usr/local/go GOPATH=/root/gopath
12+
ENV PATH=${PATH}:${GOROOT}/bin
13+
14+
CMD ["sh", "-c", "cd /root/gopath/src/github.com/PaddlePaddle/cloud/go/cmd/pfsserver && go get ./... && go build"]
15+
EOF
16+
17+
docker build . -t pfsserver:dev
18+
19+
rm -f Dockerfile

.tools/gen_config.sh renamed to go/.tools/gen_config.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,8 @@ mkdir -p ~/.paddle
33
cat > ~/.paddle/config << EOF
44
datacenters:
55
- name: datacenter1
6-
username: user@baidu.com
7-
password: T123
6+
username: your-user-name
7+
password: your-secret
88
endpoint: http://127.0.0.1:8080
99
current-datacenter: datacenter1
1010
EOF

go/Dockerfile

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
FROM ubuntu:16.04
2+
3+
ADD .tools /pfsserver/.tools
4+
RUN bash /pfsserver/.tools/gen_config.sh
5+
6+
ADD ./cmd/pfsserver/pfsserver /pfsserver/
7+
RUN mkdir /pfsserver/log

go/README_CN.md

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
1. 如何构建PFSServer的DockerImage
2+
- 构建PFSServer的编译环境
3+
4+
```
5+
cd cloud/docker/pfs
6+
bash build.sh
7+
```
8+
9+
- 编译PFSServer
10+
11+
```
12+
cd cloud/go
13+
docker run --rm -v $(pwd):/root/gopath/src/github.com/PaddlePaddle/cloud/go pfsserver:dev
14+
```
15+
16+
- 构建PFSServer的DockerImage
17+
18+
```
19+
cd cloud/go
20+
docker build . -t pfsserver:latest
21+
```
22+
- PFSServer启动命令
23+
24+
```
25+
docker run pfsserver:latest /pfsserver/pfsserver -tokenuri http://cloud.paddlepaddle.org -logtostderr=true -v=3
26+
```
27+
28+
2. 如何部署PFSServer
29+
30+
```
31+
cd ../k8s
32+
kubectl create -f cloud_pfsserver.yaml
33+
```
34+
35+
3. 如何使用PFSClient
36+
- cp
37+
38+
```
39+
upload:
40+
paddlecloud cp ./file /pfs/$DATACENTER/home/$USER/file
41+
42+
download:
43+
paddlecloud cp /pfs/$DATACENTER/home/$USER/file ./file
44+
```
45+
- ls
46+
47+
```
48+
paddlecloud ls /pfs/$DATACENTER/home/$USER/folder
49+
```
50+
51+
- rm
52+
53+
```
54+
paddlecloud rm /pfs/$DATACENTER/home/$USER/file
55+
paddlecloud rm -r /pfs/$DATACENTER/home/$USER/folder
56+
```
57+
58+
- mkdir
59+
60+
```
61+
paddlecloud mkdir /pfs/$DATACENTER/home/$USER/folder
62+
```

go/filemanager/pfsmodules/command.go

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"errors"
55
"io"
66
"net/url"
7+
"path"
78
"strings"
89

910
log "github.com/golang/glog"
@@ -33,13 +34,16 @@ type Command interface {
3334

3435
// CheckUser checks if a user has authority to access a path.
3536
// path example:/pfs/$datacenter/home/$user
36-
func checkUser(path string, user string) error {
37-
a := strings.Split(path, "/")
38-
if len(a) < 3 {
37+
func checkUser(pathStr string, user string) error {
38+
pathStr = path.Clean(strings.TrimSpace(pathStr))
39+
a := strings.Split(pathStr, "/")
40+
// the leading "/" produces an empty first element after Split, so /pfs/$datacenter/home/$user yields 5 elements
41+
if len(a) < 5 {
3942
return errors.New(StatusBadPath)
4043
}
4144

42-
if a[3] != user {
45+
if a[4] != user {
46+
log.V(4).Infof("request path:%s user:%s split_path:%s\n", pathStr, user, a[4])
4347
return errors.New(StatusUnAuthorized)
4448
}
4549
return nil
@@ -57,7 +61,7 @@ func ValidatePfsPath(paths []string, userName string) error {
5761
}
5862

5963
if err := checkUser(path, userName); err != nil {
60-
return errors.New(StatusShouldBePfsPath + ":" + path)
64+
return err
6165
}
6266
}
6367
return nil

go/filemanager/pfsmodules/download.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ func remoteChunkMeta(path string,
2121
ChunkSize: chunkSize,
2222
}
2323

24-
t := fmt.Sprintf("%s/api/v1/chunks", Config.ActiveConfig.Endpoint)
24+
t := fmt.Sprintf("%s/api/v1/pfs/chunks", Config.ActiveConfig.Endpoint)
2525
ret, err := restclient.GetCall(t, cmd.ToURLParam())
2626
if err != nil {
2727
return nil, err
@@ -89,7 +89,7 @@ func downloadChunks(src string,
8989
return nil
9090
}
9191

92-
t := fmt.Sprintf("%s/api/v1/storage/chunks", Config.ActiveConfig.Endpoint)
92+
t := fmt.Sprintf("%s/api/v1/pfs/storage/chunks", Config.ActiveConfig.Endpoint)
9393
for _, meta := range diffMeta {
9494
chunk := Chunk{
9595
Path: src,

go/filemanager/pfsmodules/ls.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ func formatPrint(result []LsResult) {
249249

250250
// RemoteLs gets LsCmd result from cloud.
251251
func RemoteLs(cmd *LsCmd) ([]LsResult, error) {
252-
t := fmt.Sprintf("%s/api/v1/files", Config.ActiveConfig.Endpoint)
252+
t := fmt.Sprintf("%s/api/v1/pfs/files", Config.ActiveConfig.Endpoint)
253253
body, err := restclient.GetCall(t, cmd.ToURLParam())
254254
if err != nil {
255255
return nil, err

go/filemanager/pfsmodules/mkdir.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -126,7 +126,7 @@ func RemoteMkdir(cmd *MkdirCmd) ([]MkdirResult, error) {
126126
return nil, err
127127
}
128128

129-
t := fmt.Sprintf("%s/api/v1/files", Config.ActiveConfig.Endpoint)
129+
t := fmt.Sprintf("%s/api/v1/pfs/files", Config.ActiveConfig.Endpoint)
130130
log.V(2).Infoln(t)
131131
body, err := restclient.PostCall(t, j)
132132
if err != nil {

go/filemanager/pfsmodules/rm.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ func RemoteRm(cmd *RmCmd) ([]RmResult, error) {
162162
return nil, err
163163
}
164164

165-
t := fmt.Sprintf("%s/api/v1/files", Config.ActiveConfig.Endpoint)
165+
t := fmt.Sprintf("%s/api/v1/pfs/files", Config.ActiveConfig.Endpoint)
166166
body, err := restclient.DeleteCall(t, j)
167167
if err != nil {
168168
return nil, err

go/filemanager/pfsmodules/upload.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ import (
1717
var Config = config.ParseDefaultConfig()
1818

1919
func remoteStat(cmd *StatCmd) (*LsResult, error) {
20-
t := fmt.Sprintf("%s/api/v1/files", Config.ActiveConfig.Endpoint)
20+
t := fmt.Sprintf("%s/api/v1/pfs/files", Config.ActiveConfig.Endpoint)
2121
log.V(3).Infoln(t)
2222
body, err := restclient.GetCall(t, cmd.ToURLParam())
2323
if err != nil {
@@ -49,7 +49,7 @@ func remoteTouch(cmd *TouchCmd) error {
4949
return err
5050
}
5151

52-
t := fmt.Sprintf("%s/api/v1/files", Config.ActiveConfig.Endpoint)
52+
t := fmt.Sprintf("%s/api/v1/pfs/files", Config.ActiveConfig.Endpoint)
5353
body, err := restclient.PostCall(t, j)
5454
if err != nil {
5555
return err
@@ -108,7 +108,7 @@ func postChunk(src *Chunk, dst string) ([]byte, error) {
108108
}
109109
defer Close(f)
110110

111-
t := fmt.Sprintf("%s/api/v1/storage/chunks", Config.ActiveConfig.Endpoint)
111+
t := fmt.Sprintf("%s/api/v1/pfs/storage/chunks", Config.ActiveConfig.Endpoint)
112112
log.V(4).Infoln(t)
113113

114114
return restclient.PostChunk(t, getDstParam(src, dst),

go/filemanager/pfsserver/handler.go

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package pfsserver
33
import (
44
"encoding/json"
55
"errors"
6+
"fmt"
67
"io"
78
"io/ioutil"
89
"mime/multipart"
@@ -24,25 +25,32 @@ var TokenURI = ""
2425

2526
func getUserName(uri string, token string) (string, error) {
2627
authHeader := make(map[string]string)
27-
authHeader["Authorization"] = "Token " + token
28+
authHeader["Authorization"] = token
29+
30+
str := fmt.Sprintf("get uri with token error uri:%s token:%s\n", uri, token)
31+
2832
req, err := restclient.MakeRequest(uri, "GET", nil, "", nil, authHeader)
2933
if err != nil {
34+
log.Errorln(str)
3035
return "", err
3136
}
3237

3338
body, err := restclient.GetResponse(req)
3439
if err != nil {
40+
log.Errorln(str)
3541
return "", err
3642
}
3743

3844
log.V(4).Infoln("get token2user resp:" + string(body[:]))
3945
var resp interface{}
4046
if err := json.Unmarshal(body, &resp); err != nil {
47+
log.Errorln(string(body[:]))
4148
return "", err
4249
}
4350

4451
user := resp.(map[string]interface{})["user"].(string)
4552
if len(user) < 1 {
53+
log.Errorln(resp)
4654
return "", errors.New("can't get username")
4755
}
4856

0 commit comments

Comments
 (0)