Skip to content

[Add GPUPS CI] GPUBox unittest #50130

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Feb 2, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions python/paddle/fluid/tests/unittests/ps/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,7 @@ foreach(TEST_OP ${TEST_OPS})
list(APPEND TEST_OPS ${TEST_OP})
set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 50)
endforeach()

if(WITH_PSCORE)
set_tests_properties(test_gpubox_ps PROPERTIES LABELS "RUN_TYPE=GPUPS")
endif()
55 changes: 55 additions & 0 deletions python/paddle/fluid/tests/unittests/ps/config_gpubox.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# workspace
#workspace: "models/rank/dnn"


runner:
train_data_dir: "data/sample_data/train"
train_reader_path: "criteo_reader" # importlib format
use_gpu: True
use_auc: False
train_batch_size: 32
epochs: 3
print_interval: 10
model_save_path: "output_model_dnn_queue"

sync_mode: "gpubox"
thread_num: 30
reader_type: "InmemoryDataset" # DataLoader / QueueDataset / RecDataset / InmemoryDataset
pipe_command: "python3.7 dataset_generator_criteo.py"
dataset_debug: False
split_file_list: False

infer_batch_size: 2
infer_reader_path: "criteo_reader" # importlib format
test_data_dir: "data/sample_data/train"
infer_load_path: "output_model_dnn_queue"
infer_start_epoch: 0
infer_end_epoch: 3
# hyper parameters of user-defined network
hyper_parameters:
# optimizer config
optimizer:
class: Adam
learning_rate: 0.001
strategy: async
# user-defined <key, value> pairs
sparse_inputs_slots: 27
sparse_feature_number: 1024
sparse_feature_dim: 9
dense_input_dim: 13
fc_sizes: [512, 256, 128, 32]
distributed_embedding: 0
86 changes: 86 additions & 0 deletions python/paddle/fluid/tests/unittests/ps/dataset_generator_criteo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging

import paddle.distributed.fleet as fleet

logging.basicConfig(
format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO
)
logger = logging.getLogger(__name__)


class Reader(fleet.MultiSlotDataGenerator):
def init(self):
padding = 0
sparse_slots = "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
self.sparse_slots = sparse_slots.strip().split(" ")
self.dense_slots = ["dense_feature"]
self.dense_slots_shape = [13]
self.slots = self.sparse_slots + self.dense_slots
self.slot2index = {}
self.visit = {}
for i in range(len(self.slots)):
self.slot2index[self.slots[i]] = i
self.visit[self.slots[i]] = False
self.padding = padding
logger.info("pipe init success")

def line_process(self, line):
line = line.strip().split(" ")
output = [(i, []) for i in self.slots]
for i in line:
slot_feasign = i.split(":")
slot = slot_feasign[0]
if slot not in self.slots:
continue
if slot in self.sparse_slots:
feasign = int(slot_feasign[1])
else:
feasign = float(slot_feasign[1])
output[self.slot2index[slot]][1].append(feasign)
self.visit[slot] = True
for i in self.visit:
slot = i
if not self.visit[slot]:
if i in self.dense_slots:
output[self.slot2index[i]][1].extend(
[self.padding]
* self.dense_slots_shape[self.slot2index[i]]
)
else:
output[self.slot2index[i]][1].extend([self.padding])
else:
self.visit[slot] = False

return output
# return [label] + sparse_feature + [dense_feature]

def generate_sample(self, line):
r"Dataset Generator"

def reader():
output_dict = self.line_process(line)
# {key, value} dict format: {'labels': [1], 'sparse_slot1': [2, 3], 'sparse_slot2': [4, 5, 6, 8], 'dense_slot': [1,2,3,4]}
# dict must match static_model.create_feed()
yield output_dict

return reader


if __name__ == "__main__":

r = Reader()
r.init()
r.run_from_stdin()
17 changes: 17 additions & 0 deletions python/paddle/fluid/tests/unittests/ps/download_criteo_data.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

wget --no-check-certificate https://paddlerec.bj.bcebos.com/benchmark/sample_train.txt
mkdir train_data
mv sample_train.txt train_data/
60 changes: 60 additions & 0 deletions python/paddle/fluid/tests/unittests/ps/gpubox_run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# !/bin/bash

# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

if [ ! -d "./log" ]; then
mkdir ./log
echo "Create log floder for store running log"
fi

export FLAGS_LAUNCH_BARRIER=0
export PADDLE_TRAINER_ID=0
export PADDLE_PSERVER_NUMS=1
export PADDLE_TRAINERS=1
export PADDLE_TRAINERS_NUM=${PADDLE_TRAINERS}
export POD_IP=127.0.0.1

# set free port if 29011 is occupied
export PADDLE_PSERVERS_IP_PORT_LIST="127.0.0.1:29011"
export PADDLE_PSERVER_PORT_ARRAY=(29011)

# set gpu numbers according to your device
export FLAGS_selected_gpus="0,1,2,3,4,5,6,7"
#export FLAGS_selected_gpus="0,1"

# set your model yaml
#SC="gpubox_ps_trainer.py"
SC="static_gpubox_trainer.py"

# run pserver
export TRAINING_ROLE=PSERVER
for((i=0;i<$PADDLE_PSERVER_NUMS;i++))
do
cur_port=${PADDLE_PSERVER_PORT_ARRAY[$i]}
echo "PADDLE WILL START PSERVER "$cur_port
export PADDLE_PORT=${cur_port}
python3.7 -u $SC &> ./log/pserver.$i.log &
done

# run trainer
export TRAINING_ROLE=TRAINER
for((i=0;i<$PADDLE_TRAINERS;i++))
do
echo "PADDLE WILL START Trainer "$i
export PADDLE_TRAINER_ID=$i
python3.7 -u $SC &> ./log/worker.$i.log
done

echo "Training log stored in ./log/"
Loading