|
| 1 | +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | + |
| 15 | +import pandas as pd |
| 16 | +import argparse |
| 17 | + |
| 18 | + |
def gen_heter_files(data_load_path, splitted_data_path, file_nums):
    """Split a CSV dataset evenly into ``file_nums`` files.

    Args:
        data_load_path: path of the source CSV (read with pandas, header row
            is preserved in every output file).
        splitted_data_path: existing directory to write ``<i>.csv`` into.
        file_nums: number of output files (one per federated client).
    """
    data = pd.read_csv(data_load_path)
    total_sample_num = data.shape[0]
    print("total sample num is: {}".format(total_sample_num))
    sample_num_per_file = total_sample_num // file_nums
    for i in range(file_nums):
        # The previous slice started at `i * n + 1`, which silently dropped
        # the very first sample; start at `i * n` so every row is covered.
        save_data = data.iloc[i * sample_num_per_file:(i + 1) *
                              sample_num_per_file]
        file_name = splitted_data_path + '/' + str(i) + '.csv'
        save_data.to_csv(file_name, index=False)
    print("files splitted done, num is {}, saved in path: {}".format(
        file_nums, splitted_data_path))
| 31 | + |
| 32 | + |
def get_zipcode_dict(
        users_dat_path='/home/wangbin/the_one_ps/ziyoujiyi_PaddleRec/MovieLens-1M/ml-1m/users.dat'
):
    """Build a user-id -> zipcode mapping from a MovieLens ``users.dat`` file.

    Each line has the form ``user_id::sex::age::occupation::zip_code``;
    only the user id and the first five characters of the zip code (e.g.
    ``70072`` out of ``70072-1234``) are kept.

    Args:
        users_dat_path: path to ``users.dat``. Defaults to the original
            hard-coded location for backward compatibility.

    Returns:
        dict mapping int user_id -> int 5-digit zipcode.
    """
    zipcode_dict = {}
    with open(users_dat_path, "r") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            arr = line.split("::")
            # Truncate extended ZIP+4 codes to the leading 5 digits.
            zipcode_dict[int(arr[0])] = int(arr[4][0:5])
    return zipcode_dict
| 46 | + |
| 47 | + |
def shuffle_data_by_zipcode(data_load_path, splitted_data_path, file_nums,
                            zipcode_dict):
    """Shard a CSV dataset into 10 files by the leading digit of the zipcode.

    Args:
        data_load_path: source CSV path; first column is the user id.
        splitted_data_path: existing directory for the ``<shard_id>.csv`` files.
        file_nums: kept for interface compatibility; the shard count is fixed
            at 10 (one shard per possible leading zipcode digit).
        zipcode_dict: mapping of user id -> 5-digit integer zipcode.
    """
    data = pd.read_csv(data_load_path)
    total_sample_num = data.shape[0]
    print("total sample num is: {}".format(total_sample_num))
    # One bucket per leading zipcode digit (zipcodes are 5-digit ints).
    sharded_data = [[] for _ in range(10)]
    for data_row in data.values.tolist():
        user_id = data_row[0]
        # NOTE(review): the +1 offset assumes the CSV user ids are 0-based
        # while users.dat ids are 1-based — confirm against the data files.
        zipcode = zipcode_dict[user_id + 1]
        sharded_data[zipcode // 10000].append(data_row)
    for shard_id, sample in enumerate(sharded_data):
        print("zipcode start with {}: {}".format(shard_id, len(sample)))
        file_name = splitted_data_path + '/' + str(shard_id) + '.csv'
        # Preserve the original column headers; the old code let pandas emit
        # 0..n-1 integer headers, inconsistent with gen_heter_files output.
        pd.DataFrame(data=sample, columns=data.columns).to_csv(
            file_name, index=False)
    print("files splitted by zipcode done, saved in path: {}".format(
        splitted_data_path))
| 67 | + |
| 68 | + |
def parse_args():
    """Parse command-line options for the data-splitting tool.

    Returns:
        argparse.Namespace with ``full_train_data_path``,
        ``splitted_data_path`` and ``file_nums`` attributes.
    """
    parser = argparse.ArgumentParser(description="Run GMF.")
    parser.add_argument(
        '--full_train_data_path',
        type=str,
        default="../big_train/train_data.csv",
        help='full_train_data_path')
    parser.add_argument(
        '--splitted_data_path',
        type=str,
        default="fl_train_data",
        help='splitted_data_path')
    # The default used to be the string '10', which only worked because
    # argparse re-parses string defaults through `type`; use a real int.
    parser.add_argument(
        '--file_nums', type=int, default=10, help='fl clients num')
    return parser.parse_args()
| 84 | + |
| 85 | + |
if __name__ == '__main__':
    args = parse_args()
    # Alternative splitting strategy (even split by sample count), currently
    # disabled in favor of zipcode-based sharding:
    #gen_heter_files(args.full_train_data_path, args.splitted_data_path, args.file_nums)
    # Build the user-id -> zipcode map from the hard-coded users.dat path,
    # then shard the training CSV by the leading digit of each user's zipcode.
    zipcode_dict = get_zipcode_dict()
    shuffle_data_by_zipcode(args.full_train_data_path, args.splitted_data_path,
                            args.file_nums, zipcode_dict)
0 commit comments