|
| 1 | +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | +""" |
| 15 | +Data pre-processing for metapath2vec model. |
| 16 | +""" |
| 17 | + |
| 18 | +import os |
| 19 | +import sys |
| 20 | +import tqdm |
| 21 | +import time |
| 22 | +import logging |
| 23 | +import random |
| 24 | +import argparse |
| 25 | +import numpy as np |
| 26 | +import pickle as pkl |
| 27 | + |
| 28 | +from pgl.utils.logger import log |
| 29 | +from utils.config import prepare_config, make_dir |
| 30 | + |
| 31 | +# name ID g_index |
| 32 | + |
| 33 | + |
def remapping_id(file_, start_index, node_type, separator="\t"):
    """Map the ID and name of nodes to a contiguous index.

    Every non-blank line of ``file_`` is split on ``separator``; its first
    column is the node ID and, when the line has exactly two columns, the
    second one is the node name. Lines are numbered consecutively, counting
    up from ``start_index``.

    Args:
        file_: Path of the node file; it is read as ISO-8859-1.
        start_index: Index assigned to the first node of the file.
        node_type: Type tag attached to every node of this file.
        separator: Column separator between the ID and the name.

    Returns:
        A tuple ``(id2index, name2index, node_types)`` where ``id2index``
        and ``name2index`` map the ID / name to the index (as ``str``) and
        ``node_types`` is a list of ``(index, node_type)`` pairs in file
        order.
    """
    node_types = []
    id2index = {}
    name2index = {}
    index = start_index
    with open(file_, encoding="ISO-8859-1") as reader:
        for line in reader:
            line = line.strip()
            if not line:
                # A blank line (typically a trailing newline at EOF) is not a
                # node; without this guard it would be registered as a node
                # with the empty ID '' and would consume an index.
                continue
            tokens = line.split(separator)
            str_index = str(index)
            id2index[tokens[0]] = str_index
            if len(tokens) == 2:
                name2index[tokens[1]] = str_index
            node_types.append((str_index, node_type))
            index += 1

    return id2index, name2index, node_types
| 51 | + |
| 52 | + |
def load_edges(file_, src2index, dst2index, symmetry=False):
    """Load edges from file and translate both endpoints to their index.

    Every non-blank line holds whitespace-separated columns; the first one
    is looked up in ``src2index`` and the second one in ``dst2index``.

    Args:
        file_: Path of the edge file.
        src2index: Mapping from the raw source ID to its index.
        dst2index: Mapping from the raw destination ID to its index.
        symmetry: When true, the reversed edge ``(dst, src)`` is added
            right after every ``(src, dst)`` edge.

    Returns:
        The list of unique ``(src, dst)`` index pairs, in order of first
        appearance in the file.

    Raises:
        KeyError: If an endpoint is missing from its mapping.
        IndexError: If a non-blank line has fewer than two columns.
    """
    edges = []
    seen = set()

    def _add(edge):
        # De-duplicate while keeping file order, so that the result (and
        # whatever is written from it) is identical from run to run instead
        # of depending on the per-process string hash seed.
        if edge not in seen:
            seen.add(edge)
            edges.append(edge)

    # Same encoding as the node files the ID mappings are built from, so
    # that the IDs read here compare equal to the mapping keys.
    with open(file_, 'r', encoding="ISO-8859-1") as reader:
        for line in reader:
            items = line.split()
            if not items:
                # Skip blank lines (e.g. a trailing newline at EOF).
                continue
            src, dst = src2index[items[0]], dst2index[items[1]]
            _add((src, dst))
            if symmetry:
                _add((dst, src))
    return edges
| 66 | + |
| 67 | + |
def load_label(file_, name2index):
    """Load the labels of the nodes whose name is known.

    Every non-blank line of ``file_`` is split on single spaces; the first
    column is the node name and the second one an integer label. Names that
    are not in ``name2index`` are ignored.

    Args:
        file_: Path of the label file; it is read as ISO-8859-1.
        name2index: Mapping from the node name to its index.

    Returns:
        A list of ``(index, label)`` pairs of strings, in file order, where
        ``label`` is the integer from the file minus one.

    Raises:
        IndexError: If a non-blank line has no second column.
        ValueError: If the second column is not an integer.
    """
    index_label = []
    with open(file_, encoding="ISO-8859-1") as reader:
        for line in reader:
            line = line.strip()
            if not line:
                # Skip blank lines (e.g. a trailing newline at EOF) instead
                # of failing on the missing label column.
                continue
            tokens = line.split(' ')
            # NOTE(review): the "- 1" presumably turns 1-based labels into
            # 0-based ones -- confirm against the label files.
            name, label = tokens[0], int(tokens[1]) - 1
            if name in name2index:
                index_label.append((name2index[name], str(label)))

    return index_label
| 78 | + |
| 79 | + |
def _save_rows(path, rows, what):
    """Write ``rows`` (tuples of strings) to ``path`` as tab-separated lines."""
    log.info("saving %s to %s" % (what, path))
    with open(path, 'w') as writer:
        for row in tqdm.tqdm(rows):
            writer.write("\t".join(row) + "\n")


def main(config):
    """Re-index the raw graph files and save them in the processed format.

    Venues, authors and papers are read from ``config.data_path`` and are
    numbered in that order in one shared index space. The paper-author and
    paper-venue edges and the author / venue labels are translated to that
    index space, and everything is written to ``config.processed_path``.

    Args:
        config: Configuration object providing ``data_path``,
            ``processed_path``, ``author_label_file`` and
            ``venue_label_file``.
    """
    conf_id2index, conf_name2index, conf_node_type = remapping_id(
        os.path.join(config.data_path, 'id_conf.txt'),
        start_index=0,
        node_type='c')
    log.info('%d venues have been loaded.' % (len(conf_id2index)))

    # The next block of indices starts after the number of indices actually
    # handed out, i.e. the length of the node-type list (one entry per
    # line read). The size of the ID dict is smaller whenever a file
    # repeats an ID, which would make two node types share indices.
    author_start = len(conf_node_type)
    author_id2index, author_name2index, author_node_type = remapping_id(
        os.path.join(config.data_path, 'id_author.txt'),
        start_index=author_start,
        node_type='a')
    log.info('%d authors have been loaded.' % (len(author_id2index)))

    paper_start = author_start + len(author_node_type)
    paper_id2index, paper_name2index, paper_node_type = remapping_id(
        os.path.join(config.data_path, 'paper.txt'),
        start_index=paper_start,
        node_type='p',
        separator='\t')
    log.info('%d papers have been loaded.' % (len(paper_id2index)))

    node_types = conf_node_type + author_node_type + paper_node_type

    paper2author_edges = load_edges(
        os.path.join(config.data_path, 'paper_author.txt'), paper_id2index,
        author_id2index)
    log.info('%d paper2author edges have been loaded.' %
             (len(paper2author_edges)))

    paper2conf_edges = load_edges(
        os.path.join(config.data_path, 'paper_conf.txt'), paper_id2index,
        conf_id2index)
    log.info('%d paper2conf edges have been loaded.' % (len(paper2conf_edges)))

    author_label = load_label(config.author_label_file, author_name2index)
    conf_label = load_label(config.venue_label_file, conf_name2index)

    make_dir(config.processed_path)

    # node_types holds (index, type) pairs, but the file stores the type
    # first and the index second.
    _save_rows(
        os.path.join(config.processed_path, 'node_types.txt'),
        [(ntype, index) for index, ntype in node_types], "node_types")
    _save_rows(
        os.path.join(config.processed_path, 'paper2author_edges.txt'),
        paper2author_edges, "paper2author edges")
    _save_rows(
        os.path.join(config.processed_path, 'paper2conf_edges.txt'),
        paper2conf_edges, "paper2conf edges")
    _save_rows(
        os.path.join(config.processed_path, 'author_label.txt'),
        author_label, "author label")
    _save_rows(
        os.path.join(config.processed_path, 'conf_label.txt'), conf_label,
        "conf label")
| 148 | + |
| 149 | + |
if __name__ == "__main__":
    # Command line: a single optional --config argument giving the path of
    # the configuration file.
    parser = argparse.ArgumentParser(description='metapath2vec')
    parser.add_argument('--config', type=str, default="./config.yaml")
    args = parser.parse_args()

    # Build the configuration from that file and run the pre-processing.
    config = prepare_config(args.config)
    main(config)
0 commit comments