1
1
import re
2
+ import json
2
3
3
4
from bs4 import BeautifulSoup
4
5
5
6
from ..user import public
6
7
from decorators import parse_decorator
7
- from db .models import User
8
+ from db .models import (User , UserRelation )
9
+ from db .dao import UserRelationOper
8
10
9
11
10
12
@parse_decorator (0 )
@@ -90,21 +92,21 @@ def get_detail(html, uid):
90
92
user .description = description .encode ('gbk' , 'ignore' ).decode ('gbk' )
91
93
elif '注册时间:' in each_str :
92
94
user .register_time = each .find (attrs = {'class' : 'pt_detail' }).get_text ().replace ('\t ' , '' ).replace (
93
- '\r \n ' , '' )
95
+ '\r \n ' , '' ). replace ( ' ' , '' )
94
96
95
97
if '标签信息' in basic_str :
96
98
basic_info = each_module .find_all (attrs = {'class' : 'li_1 clearfix' })
97
99
for each in basic_info :
98
100
if '标签:' in each .get_text ():
99
101
user .tags = each .find (attrs = {'class' : 'pt_detail' }).get_text ().replace ('\t ' , '' ).replace (
100
- '\n \n \n ' , '' ) .strip ().replace ('\r \n ' , ';' )
102
+ '\n \n \n ' , '' ) .strip ().replace ('\r \n ' , ';' ). replace ( ' ' , '' )
101
103
102
104
if '教育信息' in basic_str :
103
105
basic_info = each_module .find_all (attrs = {'class' : 'li_1 clearfix' })
104
106
for each in basic_info :
105
107
if '大学:' in each .get_text ():
106
108
user .education_info = each .find (attrs = {'class' : 'pt_detail' }).get_text ().replace ('\r \n ' , ',' ) \
107
- .replace ('\t ' , '' ).replace ('\n ' , ';' ).lstrip (';' ).rstrip (';' )
109
+ .replace ('\t ' , '' ).replace ('\n ' , ';' ).lstrip (';' ).rstrip (';' ). replace ( ' ' , '' )
108
110
109
111
if '工作信息' in basic_str :
110
112
basic_info = each_module .find_all (attrs = {'class' : 'li_1 clearfix' })
@@ -114,7 +116,7 @@ def get_detail(html, uid):
114
116
jobs = each .find_all (attrs = {'class' : 'pt_detail' })
115
117
for job in jobs :
116
118
jobs_info .append (job .get_text ().replace ('\r \n ' , '' ).replace ('\t ' , '' ).replace ('\n ' , '' ))
117
- user .work_info = ';' .join (jobs_info )
119
+ user .work_info = ';' .join (jobs_info ). replace ( ' ' , '' )
118
120
119
121
if '联系信息' in basic_str :
120
122
basic_info = each_module .find_all (attrs = {'class' : 'li_1 clearfix' })
@@ -127,26 +129,56 @@ def get_detail(html, uid):
127
129
contact_info .append ('email:' + each .find (attrs = {'class' : 'pt_detail' }).get_text ())
128
130
if 'MSN:' in each .get_text ():
129
131
contact_info .append ('msn:' + each .find (attrs = {'class' : 'pt_detail' }).get_text ())
130
- user .contact_info = ';' .join (contact_info )
132
+ user .contact_info = ';' .join (contact_info ). replace ( ' ' , '' )
131
133
except Exception as why :
132
134
print ('解析出错,具体原因为{why}' .format (why = why ))
133
135
134
136
return user
135
137
136
138
137
139
@parse_decorator(None)
def get_isFan(html, uids, current_uid):
    """
    Find which of ``uids`` are listed on a "samefollow" page and persist them.

    The page embeds its content as ``FM.view({...})`` calls inside ``<script>``
    tags. The first script whose text mentions ``pl.content.followTab.index``
    is decoded as JSON, its ``html`` field is parsed, and the numeric ``uid=``
    values found in each ``follow_item`` are collected. The ids that also occur
    in ``uids`` are wrapped in ``UserRelation`` rows and handed to
    ``UserRelationOper.add_all``.

    :param html: samefollow page
    :param uids: list contains uids to determine this account follows or not
                 (ids scraped from the page are strings, so these are assumed
                 to be strings as well -- TODO confirm against callers)
    :param current_uid: current crawling user
    :return: 1 if at least one of ``uids`` was found on the page, 0 otherwise
    """
    page = BeautifulSoup(html, "html.parser")
    # Compile both patterns once, outside the loops that use them.
    view_pattern = re.compile(r'FM\.view\((.*)\)')
    uid_pattern = re.compile(r'uid=(.*?)&')

    user_ids = []          # uids listed on the samefollow page
    intersection_ids = []  # uids present both on the page and in ``uids``

    for script in page.find_all('script'):
        # ``.string`` is None for empty tags or tags with several children;
        # skip those instead of passing None to the regex.
        script_text = script.string
        if not script_text or 'pl.content.followTab.index' not in script_text:
            continue

        # Looking for:
        # <script>FM.view({"ns":"pl.content.followTab.index","domid":...
        view_match = view_pattern.search(script_text)
        if not view_match:
            continue

        fragment_html = json.loads(view_match.group(1)).get('html', '')
        fragment = BeautifulSoup(fragment_html, 'html.parser')

        # The follow list container may be absent (e.g. nothing in common);
        # treat that as an empty list rather than failing on None.
        follow_box = fragment.find(attrs={'class': 'follow_box'})
        if follow_box is None:
            follows = []
        else:
            follows = follow_box.find_all(attrs={'class': 'follow_item'})

        for follow in follows:
            uid_match = uid_pattern.search(str(follow))
            # filter invalid ids
            if uid_match and uid_match.group(1).isdigit():
                user_ids.append(uid_match.group(1))

        # Work out which of the requested uids appear on the page.
        intersection_ids = list(set(user_ids).intersection(uids))

        # Now store in the database. The positional arguments are passed to
        # UserRelation exactly as before; 1 is the relation type value
        # (presumably "follow" -- confirm against the UserRelation model).
        relation_type = 1
        relations = [
            UserRelation(uid, current_uid, relation_type, None, False)
            for uid in intersection_ids
        ]
        UserRelationOper.add_all(relations)
        # Only the first matching script is processed.
        break

    # legacy support: callers still expect a 1/0 flag
    return 1 if intersection_ids else 0
0 commit comments