|
| 1 | +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | + |
| 15 | +from tqdm import tqdm |
| 16 | +import random |
| 17 | + |
| 18 | + |
def preprocess(input_path="training.txt",
               output_path="train_phrase1.txt",
               total=149639106):
    """Filter the raw KDD Cup 2012 (Track 2) log down to the fields we need.

    Reads tab-separated records from ``input_path`` and writes one
    semicolon-separated record per input line to ``output_path`` in the
    order ``userID;adID;advertiserID;position;click``.

    Raw field positions used (tab-separated):
        click        - column 0   (clamped to {0, 1}: counts > 1 become 1)
        adID         - column 3   (re-indexed to consecutive ids from 1)
        advertiserID - column 4   (re-indexed to consecutive ids from 1)
        position     - column 6
        userID       - last column

    Args:
        input_path: raw training file (original hard-coded name by default).
        output_path: destination for the filtered records.
        total: upper bound on lines to process; the full dataset has
            149639106 lines, and reading stops early at EOF regardless.
    """
    try:
        progress = tqdm  # module-level import; shows a progress bar
    except NameError:
        # Degrade gracefully when tqdm is unavailable (e.g. standalone use).
        progress = lambda it: it

    ad_id_map = {}          # raw adID -> re-indexed id (1-based, first-seen order)
    advertiser_id_map = {}  # raw advertiserID -> re-indexed id (1-based)
    next_ad_id = 1
    next_cate_id = 1
    print("Step 1. Preprocess origin dataset...")
    # `with` guarantees both handles close even if a line is malformed.
    with open(input_path, "r") as fi, open(output_path, "w") as fo:
        for _ in progress(range(total)):
            raw = fi.readline()
            if not raw:
                break  # EOF before `total` lines
            fields = raw.strip('\n').split("\t")
            # 'click' is a count and may exceed 1; clamp anything non-zero to 1.
            if fields[0] != '0' and fields[0] != '1':
                fields[0] = '1'
            # Re-map sparse raw ids to small consecutive integers.
            raw_ad = fields[3]
            if raw_ad not in ad_id_map:
                ad_id_map[raw_ad] = next_ad_id
                next_ad_id += 1
            ad_id = ad_id_map[raw_ad]
            raw_cate = fields[4]
            if raw_cate not in advertiser_id_map:
                advertiser_id_map[raw_cate] = next_cate_id
                next_cate_id += 1
            advertise_id = advertiser_id_map[raw_cate]
            fo.write("{};{};{};{};{}\n".format(
                fields[-1], ad_id, advertise_id, fields[6], fields[0]))
| 65 | + |
| 66 | + |
| 67 | +def gen_DIN(num=5000000): |
| 68 | + #$ Step 2. Generate dataset for model DIN(optional) |
| 69 | + fi = open("train_phrase1.txt", "r") |
| 70 | + fo = open("train_din_tmp.txt", "w") |
| 71 | + user_id = "" |
| 72 | + max_ad = 0 |
| 73 | + max_context = 0 |
| 74 | + his_ad = [] |
| 75 | + his_cat = [] |
| 76 | + cnt_total = 0 |
| 77 | + print("Step 2.1 Generate dataset for model DIN(optional)...") |
| 78 | + for i in tqdm(range(num)): |
| 79 | + line = fi.readline() |
| 80 | + if not line: |
| 81 | + break |
| 82 | + else: |
| 83 | + line = line.strip('\n') |
| 84 | + line = line.split(";") |
| 85 | + max_ad = max(max_ad, int(line[1])) |
| 86 | + max_context = max(max_context, int(line[2])) |
| 87 | + if user_id != line[0]: |
| 88 | + # reset |
| 89 | + his_ad.clear() |
| 90 | + his_cat.clear() |
| 91 | + user_id = line[0] |
| 92 | + else: |
| 93 | + ## If click = 1, then append the data to his. |
| 94 | + ## If his is not null, then add positive cases. |
| 95 | + if line[4] == '1': |
| 96 | + if len(his_ad) != 0: |
| 97 | + cnt_total += 1 |
| 98 | + fo.writelines("{};{};{};{};{};{}\n".format(' '.join( |
| 99 | + his_ad), ' '.join(his_cat), line[1], line[2], line[ |
| 100 | + 3], line[4])) |
| 101 | + his_ad.append(line[1]) |
| 102 | + his_cat.append(line[2]) |
| 103 | + ## If click = 0 and his is not null, then add negative cases. |
| 104 | + if line[4] == '0': |
| 105 | + if len(his_ad) != 0: |
| 106 | + cnt_total += 1 |
| 107 | + fo.writelines("{};{};{};{};{};{}\n".format(' '.join( |
| 108 | + his_ad), ' '.join(his_cat), line[1], line[2], line[ |
| 109 | + 3], line[4])) |
| 110 | + fi.close() |
| 111 | + fo.close() |
| 112 | + print("Total dataset lines : ", cnt_total) |
| 113 | + print("Step 2.2 Partitioning the dataset for model DIN...") |
| 114 | + fi = open("train_din_tmp.txt", "r") |
| 115 | + fo_train = open("train_din.txt", "w") |
| 116 | + fo_test = open("test_din.txt", "w") |
| 117 | + cnt_train = 0 |
| 118 | + cnt_test = 0 |
| 119 | + random.seed(2022) |
| 120 | + train_set = [] |
| 121 | + test_set = [] |
| 122 | + for i in tqdm(range(cnt_total)): |
| 123 | + line = fi.readline() |
| 124 | + if random.random() <= 0.2: |
| 125 | + cnt_test += 1 |
| 126 | + test_set.append(line) |
| 127 | + else: |
| 128 | + cnt_train += 1 |
| 129 | + train_set.append(line) |
| 130 | + ## shuffle |
| 131 | + print("Step 2.3 Shuffling the dataset...") |
| 132 | + random.shuffle(train_set) |
| 133 | + random.shuffle(test_set) |
| 134 | + ## save to file |
| 135 | + print("Step 2.4 Saving to file...") |
| 136 | + for _, line in enumerate(tqdm(train_set)): |
| 137 | + fo_train.writelines(line) |
| 138 | + for _, line in enumerate(tqdm(test_set)): |
| 139 | + fo_test.writelines(line) |
| 140 | + print("Train dataset lines : ", cnt_train) |
| 141 | + print("Test dataset lines : ", cnt_test) |
| 142 | + print("---" * 20) |
| 143 | + print( |
| 144 | + "Please remember the result as is shown below, \nyou have to copy them to file 'config.yaml' or 'config_bigdata.yaml'" |
| 145 | + ) |
| 146 | + print("max_item", max_ad) |
| 147 | + print("max_context", max_context) |
| 148 | + print("---" * 20) |
| 149 | + |
| 150 | + |
| 151 | +def gen_DPIN(num=5000000): |
| 152 | + ## Step 3. Generate dataset for model DPIN |
| 153 | + fi = open("train_phrase1.txt", "r") |
| 154 | + fo = open("train_dpin_tmp.txt", "w") |
| 155 | + user_id = "" |
| 156 | + max_ad = 0 |
| 157 | + max_context = 0 |
| 158 | + his_ad = [] |
| 159 | + his_cat = [] |
| 160 | + his_pos = [] |
| 161 | + cnt_total = 0 |
| 162 | + print("Step 3.1 Generating dataset for model DPIN...") |
| 163 | + for i in tqdm(range(num)): |
| 164 | + line = fi.readline() |
| 165 | + if not line: |
| 166 | + break |
| 167 | + else: |
| 168 | + line = line.strip('\n') |
| 169 | + line = line.split(";") |
| 170 | + max_ad = max(max_ad, int(line[1])) |
| 171 | + max_context = max(max_context, int(line[2])) |
| 172 | + if user_id != line[0]: |
| 173 | + # reset |
| 174 | + his_ad.clear() |
| 175 | + his_cat.clear() |
| 176 | + his_pos.clear() |
| 177 | + user_id = line[0] |
| 178 | + else: |
| 179 | + ## If click = 1, then append the data to his. |
| 180 | + ## If his is not null, then add positive cases. |
| 181 | + if line[4] == '1': |
| 182 | + if len(his_ad) != 0: |
| 183 | + cnt_total += 1 |
| 184 | + fo.writelines("{};{};{};{};{};{};{}\n".format(' '.join( |
| 185 | + his_ad), ' '.join(his_cat), ' '.join( |
| 186 | + his_pos), line[1], line[2], line[3], line[4])) |
| 187 | + his_ad.append(line[1]) |
| 188 | + his_cat.append(line[2]) |
| 189 | + his_pos.append(line[3]) |
| 190 | + ## If click = 0 and his is not null, then add negative cases. |
| 191 | + if line[4] == '0': |
| 192 | + if len(his_ad) != 0: |
| 193 | + cnt_total += 1 |
| 194 | + fo.writelines("{};{};{};{};{};{};{}\n".format(' '.join( |
| 195 | + his_ad), ' '.join(his_cat), ' '.join( |
| 196 | + his_pos), line[1], line[2], line[3], line[4])) |
| 197 | + fi.close() |
| 198 | + fo.close() |
| 199 | + print("Total dataset lines : ", cnt_total) |
| 200 | + print("Step 3.2 Partitioning the dataset for model DPIN...") |
| 201 | + fi = open("train_dpin_tmp.txt", "r") |
| 202 | + fo_train = open("train_dpin.txt", "w") |
| 203 | + fo_test = open("test_dpin.txt", "w") |
| 204 | + cnt_train = 0 |
| 205 | + cnt_test = 0 |
| 206 | + random.seed(2022) |
| 207 | + train_set = [] |
| 208 | + test_set = [] |
| 209 | + for i in tqdm(range(cnt_total)): |
| 210 | + line = fi.readline() |
| 211 | + if random.random() <= 0.2: |
| 212 | + cnt_test += 1 |
| 213 | + test_set.append(line) |
| 214 | + else: |
| 215 | + cnt_train += 1 |
| 216 | + train_set.append(line) |
| 217 | + ## shuffle |
| 218 | + print("Step 3.3 Shuffling the dataset...") |
| 219 | + random.shuffle(train_set) |
| 220 | + random.shuffle(test_set) |
| 221 | + ## save to file |
| 222 | + print("Step 3.4 Saving to file...") |
| 223 | + for _, line in enumerate(tqdm(train_set)): |
| 224 | + fo_train.writelines(line) |
| 225 | + for _, line in enumerate(tqdm(test_set)): |
| 226 | + fo_test.writelines(line) |
| 227 | + print("Train dataset lines : ", cnt_train) |
| 228 | + print("Test dataset lines : ", cnt_test) |
| 229 | + print("---" * 20) |
| 230 | + print( |
| 231 | + "Please remember the result as is shown below, \nyou have to copy them to file 'config.yaml' or 'config_bigdata.yaml'" |
| 232 | + ) |
| 233 | + print("max_item", max_ad) |
| 234 | + print("max_context", max_context) |
| 235 | + print("---" * 20) |
| 236 | + |
| 237 | + |
def main():
    """Run the full data pipeline: preprocess, then build the DPIN dataset."""
    # Step 1. Preprocess the data downloaded from KDD Cup 2012, Track 2.
    # !!! Make sure you have downloaded the data from Kaggle first !!!
    # Input is read from 'training.txt'; output goes to 'train_phrase1.txt'.
    preprocess()

    # Step 2 (optional). Generate the dataset for model DIN,
    # stored at 'train_din.txt':
    # gen_DIN(50000000)

    # Step 3. Generate the dataset for model DPIN,
    # stored at 'train_dpin.txt'.
    gen_DPIN(50000000)


if __name__ == '__main__':
    main()
0 commit comments