Commit a9bb991

Merge pull request #811 from ziyoujiyi/fl-rec

add ncf fl-trainer

2 parents e694b2c + 0f2192e, commit a9bb991

File tree: 11 files changed, +495 −9 lines changed
@@ -0,0 +1,91 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pandas as pd
import argparse


def gen_heter_files(data_load_path, splitted_data_path, file_nums):
    # Evenly split the full training set into `file_nums` csv files.
    data = pd.read_csv(data_load_path)
    total_sample_num = data.shape[0]
    print("total sample num is: {}".format(total_sample_num))
    sample_num_per_file = int(total_sample_num / file_nums)
    for i in range(0, file_nums):
        save_data = data.iloc[i * sample_num_per_file + 1:(i + 1) *
                              sample_num_per_file + 1]
        file_name = splitted_data_path + '/' + str(i) + '.csv'
        save_data.to_csv(file_name, index=False)
    print("files splitted done, num is {}, saved in path: {}".format(
        file_nums, splitted_data_path))


def get_zipcode_dict():
    # Map user_id -> zipcode (first five digits) from the ml-1m users.dat file.
    filename = '/home/wangbin/the_one_ps/ziyoujiyi_PaddleRec/MovieLens-1M/ml-1m/users.dat'
    zipcode_dict = {}
    with open(filename, "r") as f:
        line = f.readline()
        while line is not None and line != "":
            arr = line.split("::")
            user_id, sex, age, occupation, zip_code = int(arr[0]), str(arr[
                1]), int(arr[2]), int(arr[3]), str(arr[4])
            zip_code = int(zip_code[0:5])
            zipcode_dict[user_id] = zip_code
            line = f.readline()
    return zipcode_dict


def shuffle_data_by_zipcode(data_load_path, splitted_data_path, file_nums,
                            zipcode_dict):
    # Shard samples into 10 files keyed by the first digit of the user's zipcode.
    data = pd.read_csv(data_load_path)
    total_sample_num = data.shape[0]
    print("total sample num is: {}".format(total_sample_num))
    data_list = data.values.tolist()
    sharded_data = [(idx, []) for idx in range(10)]
    for data_row in data_list:
        user_id = data_row[0]
        zipcode = zipcode_dict[user_id + 1]
        shard_id = int(zipcode / 10000)
        sharded_data[shard_id][1].extend([data_row])
    for (shard_id, sample) in sharded_data:
        print("zipcode start with {}: {}".format(shard_id, len(sample)))
        file_name = splitted_data_path + '/' + str(shard_id) + '.csv'
        d = pd.DataFrame(data=sample)
        d.to_csv(file_name, index=False)
    print("files splitted by zipcode done, saved in path: {}".format(
        splitted_data_path))


def parse_args():
    parser = argparse.ArgumentParser(description="Run GMF.")
    parser.add_argument(
        '--full_train_data_path',
        type=str,
        default="../big_train/train_data.csv",
        help='full_train_data_path')
    parser.add_argument(
        '--splitted_data_path',
        type=str,
        default="fl_train_data",
        help='splitted_data_path')
    parser.add_argument(
        '--file_nums', type=int, default=10, help='fl clients num')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    #gen_heter_files(args.full_train_data_path, args.splitted_data_path, args.file_nums)
    zipcode_dict = get_zipcode_dict()
    shuffle_data_by_zipcode(args.full_train_data_path, args.splitted_data_path,
                            args.file_nums, zipcode_dict)
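
The script above (presumably the gen_heter_data.py referred to in fl_ps_help.md below) shards samples by the first digit of the user's zipcode. A minimal sketch of that mapping for a single users.dat record; the record shown here is only an illustrative example:

```
# Illustrative sketch only: how one ml-1m users.dat record maps to a shard id
# in shuffle_data_by_zipcode above.
line = "1::F::1::10::48067"         # user_id::sex::age::occupation::zipcode
arr = line.split("::")
user_id, zip_code = int(arr[0]), str(arr[4])
zip_code = int(zip_code[0:5])       # 48067
shard_id = int(zip_code / 10000)    # 4 -> this user's samples land in 4.csv
print(user_id, zip_code, shard_id)  # 1 48067 4
```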

models/rank/dnn/config.yaml

+3

@@ -37,6 +37,9 @@ runner:
   sync_mode: "async"
   split_file_list: False
   thread_num: 1
+  reader_type: "QueueDataset"  # DataLoader / QueueDataset / RecDataset
+  pipe_command: "python queuedataset_reader.py"  # data pipe command in QueueDataset mode
+  dataset_debug: False  # profiler switch in QueueDataset mode


 # hyper parameters of user-defined network

models/rank/slot_dnn/config_queuedataset.yaml

+1

@@ -19,6 +19,7 @@
 runner:
   train_data_dir: "data/"
   train_reader_path: "criteo_reader"  # importlib format
+  sync_mode: "async"
   use_gpu: False
   use_auc: True
   train_batch_size: 2

models/recall/ncf/config_fl.yaml

+58

@@ -0,0 +1,58 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

runner:
  sync_mode: "geo"  # optional, string: sync / async / geo
  #with_coodinator: 1
  geo_step: 100  # optional, int, number of local iterations per round in geo mode
  split_file_list: True  # optional, bool, set to True if every node holds the full dataset
  thread_num: 1  # number of threads

  # reader type; QueueDataset is recommended for distributed training
  reader_type: "QueueDataset"  # DataLoader / QueueDataset / RecDataset
  pipe_command: "python queuedataset_reader.py"  # data pipe command in QueueDataset mode
  dataset_debug: False  # profiler switch in QueueDataset mode

  train_data_dir: "../../../datasets/movielens_pinterest_NCF/fl_data/fl_train_data"
  train_reader_path: "movielens_reader"  # importlib format
  train_batch_size: 512
  model_save_path: "output_model_ncf"

  use_gpu: False
  epochs: 2
  print_interval: 50

  test_data_dir: "../../../datasets/movielens_pinterest_NCF/fl_data/fl_test_data"
  infer_reader_path: "movielens_reader"  # importlib format
  infer_batch_size: 1
  infer_load_path: "output_model_ncf"
  infer_start_epoch: 2
  infer_end_epoch: 3

  need_dump: True
  dump_fields_path: "/home/wangbin/the_one_ps/ziyoujiyi_PaddleRec/PaddleRec/models/recall/ncf"
  dump_fields: ['item_input', 'user_input']
  dump_param: []
  local_sparse: ['embedding_0.w_0']
  remote_sparse: ['embedding_1.w_0']

hyper_parameters:
  optimizer:
    class: adam
    learning_rate: 0.001
  num_users: 6040
  num_items: 3706
  mf_dim: 8
  mode: "NCF_MLP"  # optional: NCF_NeuMF, NCF_GMF, NCF_MLP
  fc_layers: [64, 32, 16, 8]

models/recall/ncf/fl_ps_help.md

+58

@@ -0,0 +1,58 @@
# 1. Features
An FL-PS built on top of GEO-PS, with Coordinator support:
* builds heterogeneous sample data on each worker
* prints training metrics (loss, auc) during every training epoch
* runs inference on the test set after every epoch

# 2. Preparing the samples
* In PaddleRec/datasets/movielens_pinterest_NCF run `sh run.sh` to obtain the preprocessed training data (big_train) and test data (test_data)
* Download the ml-1m dataset from the MovieLens website to get users.dat (the storage path is up to you, but it must match the path used in gen_heter_data.py); it is used later to build the heterogeneous datasets (split by the first digit of the zipcode)
* Create the directories fl_test_data and fl_train_data under PaddleRec/datasets/movielens_pinterest_NCF/fl_data to hold each client's training and test data
* In PaddleRec/datasets/movielens_pinterest_NCF/fl_data run `python gen_heter_data.py` to generate 10 training-data shards
    * total sample count 4970844 (negatives added at a 1:4 ratio), per shard: 0 - 518095, 1 - 520165, 2 - 373605, 3 - 315550, 4 - 483779, 5 - 495635, 6 - 402810, 7 - 354590, 8 - 262710, 9 - 1243905
    * each line of the sample data is: item id, user id, label

# 3. Run commands
1. Without coordinator
* In the directory of this file run: fleetrun --worker_num=10 --server_num=1 ../../../tools/static_fl_trainer.py -m config_fl.yaml
2. With coordinator
* In the directory of this file run: fleetrun --worker_num=10 --server_num=1 --coordinator_num=1 ../../../tools/static_fl_trainer.py -m config_fl.yaml
(see fl_run.sh for reference)

# 4. Further development
## System level
1. Code repos
* Paddle: https://github.com/ziyoujiyi/Paddle/tree/fl_ps
* PaddleRec: https://github.com/ziyoujiyi/PaddleRec/tree/fl-rec
2. Build and install
```
1) At https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/develop/install/compile/linux-compile.html find the develop/Linux/compile-from-source/CPU development image and develop inside that docker container
2) Create a build directory under the Paddle root directory
3) cd build
4) cmake .. -DPY_VERSION=3.7 -DWITH_GPU=OFF -DCMAKE_BUILD_TYPE=Release -DWITH_DISTRIBUTE=ON -DWITH_PSCORE=ON -DWITH_AVX=OFF -DWITH_TESTING=OFF -DWITH_FLPS=ON
5) make -j
6) python -m pip install python/dist/paddlepaddle-0.0.0-cp37-cp37m-linux_x86_64.whl -U
```
3. Modules open for user extension
* Paddle:
    * Paddle/python/paddle/distributed/ps/coordinator.py
* Model definition files: see PaddleRec/models/recall/ncf/net.py; new model files should carry the "fl_" prefix
* Datasets: if the dataset already exists in PaddleRec, add fl_test_data and fl_train_data directories under it; otherwise add a new dataset under PaddleRec/datasets
    * for custom heterogeneous dataset construction, see gen_heter_data.py
* For building model inputs, see PaddleRec/models/recall/ncf/queuedataset_reader.py
4. Coding conventions
* Style checks: pip install pre-commit, then run pre-commit install in the git root directory
* Follow the existing style

## Strategy level
1. Edge task scheduling
* user network: DDPG
* on the Python side, the user calls the _pull_dense interface to pull dense parameters from the PS and then reads them from the scope
* the user decides which parameters (fields) each client uploads to the coordinator after every training round
2. New loss function design
* directly reuse the training script of the version without coordinator
3. Knowledge distillation (a minimal soft-target loss sketch follows this file)
* the user trains the student model, prints its logits and uploads them to the coordinator; the teacher model is trained on the coordinator side
* the coordinator broadcasts global soft targets
4. Model compression
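
The knowledge-distillation item above only outlines the message flow (clients upload student logits, the coordinator trains the teacher and broadcasts global soft targets); this commit does not fix a particular loss. Below is a minimal numpy sketch of one common soft-target loss, assuming the coordinator's soft targets arrive as per-class probabilities; the names, values, and temperature are illustrative assumptions, not part of the repo:

```
import numpy as np

def softened_probs(logits, T=2.0):
    # temperature-softened softmax over the last axis
    z = np.asarray(logits, dtype=np.float64) / T
    z = z - z.max(axis=-1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=-1, keepdims=True)

def soft_target_loss(student_logits, teacher_soft_targets, T=2.0):
    # KL(teacher || student) averaged over the batch -- one common distillation loss
    p = np.asarray(teacher_soft_targets, dtype=np.float64)
    q = softened_probs(student_logits, T)
    return float(np.mean(np.sum(p * (np.log(p + 1e-12) - np.log(q + 1e-12)), axis=-1)))

# toy usage: a client scores its student logits against the global soft targets
student_logits = np.array([[2.0, 0.5], [0.1, 1.5]])
global_soft_targets = np.array([[0.7, 0.3], [0.2, 0.8]])
print(soft_target_loss(student_logits, global_soft_targets))
```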

models/recall/ncf/fl_run.sh

+5

@@ -0,0 +1,5 @@
ps -ef | grep python | awk '{print $2}' | xargs kill -9

#fleetrun --worker_num=10 --server_num=1 ../../../tools/static_fl_trainer.py -m config_fl.yaml
#fleetrun --worker_num=10 --server_num=1 --coordinator_num=1 ../../../tools/static_fl_trainer.py -m config_fl.yaml
fleetrun --worker_num=10 --workers="127.0.0.1:9000,127.0.0.1:9001,127.0.0.1:9002,127.0.0.1:9003,127.0.0.1:9004,127.0.0.1:9005,127.0.0.1:9006,127.0.0.1:9007,127.0.0.1:9008,127.0.0.1:9009" --server_num=1 --servers="127.0.0.1:10000" --coordinator_num=1 --coordinators="127.0.0.1:10001" ../../../tools/static_fl_trainer.py -m config_fl.yaml
+64

@@ -0,0 +1,64 @@
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import yaml
import six
import os
import copy
import paddle.distributed.fleet as fleet
import logging
import numpy as np

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)
logger = logging.getLogger(__name__)


class Reader(fleet.MultiSlotDataGenerator):
    def init(self, config):
        self.slots = ['user_id', 'item_id', 'label']
        logger.info("pipe init success")

    def line_process(self, line):
        features = line.strip().split(',')
        user_input = [int(features[0])]
        item_input = [int(features[1])]
        label = [int(features[2])]
        output_list = [(i, []) for i in self.slots]
        output_list[0][1].extend(user_input)
        output_list[1][1].extend(item_input)
        output_list[2][1].extend(label)
        return output_list

    def generate_sample(self, line):
        r"Dataset Generator"

        def reader():
            output_dict = self.line_process(line)
            yield output_dict

        return reader


if __name__ == "__main__":
    yaml_path = sys.argv[1]
    utils_path = sys.argv[2]
    sys.path.append(utils_path)
    import common
    yaml_helper = common.YamlHelper()
    config = yaml_helper.load_yaml(yaml_path)

    r = Reader()
    r.init(config)
    r.run_from_stdin()
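
This reader (it appears to be the queuedataset_reader.py named in pipe_command above) is launched by QueueDataset as a pipe command and consumes raw csv lines on stdin. A minimal sketch of what line_process produces, assuming paddlepaddle is installed and the Reader class above is importable; the input line is hypothetical:

```
# Illustrative sketch only: one hypothetical csv line through Reader.line_process.
r = Reader()
r.init(config=None)                # init only records the slot names
print(r.line_process("123,456,1"))
# -> [('user_id', [123]), ('item_id', [456]), ('label', [1])]
```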

models/recall/ncf/static_model.py

+11 −2

@@ -56,13 +56,22 @@ def net(self, input, is_infer=False):
                                   self.mf_dim, self.layers)

         prediction = ncf_model.forward(input)
+        predict_2d = paddle.concat(x=[1 - prediction, prediction], axis=1)
+        label_input = input[2]
+
+        auc, batch_auc, [batch_stat_pos, batch_stat_neg, stat_pos,
+                         stat_neg] = paddle.static.auc(input=predict_2d,
+                                                       label=label_input,
+                                                       num_thresholds=2**12,
+                                                       slide_steps=0)

         self.inference_target_var = prediction
         if is_infer:
             fetch_dict = {
                 "user": input[0],
                 'prediction': prediction,
-                "label": input[2]
+                "label": input[2],
+                'auc': auc
             }
             return fetch_dict
         cost = F.log_loss(
@@ -71,7 +80,7 @@ def net(self, input, is_infer=False):
         avg_cost = paddle.mean(x=cost)
         # print(avg_cost)
         self._cost = avg_cost
-        fetch_dict = {'Loss': avg_cost}
+        fetch_dict = {'Loss': avg_cost, 'Auc': auc}
         return fetch_dict

     def create_optimizer(self, strategy=None):
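
A note on the added metric: paddle.static.auc consumes a two-column probability input (the second column is treated as the positive-class probability) plus the integer label, which is why the diff builds predict_2d from the single-column sigmoid output. A small numpy sketch of that reshaping, with made-up prediction values:

```
import numpy as np

prediction = np.array([[0.9], [0.2], [0.7]])  # sigmoid output of the NCF net, shape [batch, 1]
predict_2d = np.concatenate([1 - prediction, prediction], axis=1)
print(predict_2d)
# (approximately)
# [[0.1 0.9]
#  [0.8 0.2]
#  [0.3 0.7]]
```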
