Skip to content

Commit 76549ce

Browse files
authored
Merge pull request #96 from thekingofcity/master
samefollow is now stored in UserRelation
2 parents 2ebee1a + 3b396ea commit 76549ce

File tree

5 files changed

+54
-17
lines changed

5 files changed

+54
-17
lines changed

db/models.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -58,11 +58,15 @@ def __repr__(self):
5858
class UserRelation(Base):
5959
__table__ = user_relation
6060

61-
def __init__(self, uid, other_id, type, from_where):
61+
def __init__(self, uid, other_id, type, from_where, crawl_time=True):
6262
self.user_id = uid
6363
self.follow_or_fans_id = other_id
6464
self.type = type
6565
self.from_where = from_where
66+
if crawl_time:
67+
self.crawl_time = func.now()
68+
else:
69+
self.crawl_time = None
6670

6771
def __repr__(self):
6872
return 'user_id:{},follow_or_fans_id:{},type:{},from_where:{}'.format(self.user_id, self.follow_or_fans_id, self.type, self.from_where)

db/tables.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,6 @@
3232
Column("contact_info", String(300), default='', server_default=''),
3333
Column("education_info", String(300), default='', server_default=''),
3434
Column("head_img", String(500), default='', server_default=''),
35-
Column("isFan", INTEGER, default=0, server_default='0'),
3635
)
3736

3837
# seed ids for user crawling
@@ -120,8 +119,8 @@
120119
Column('follow_or_fans_id', String(20)),
121120
Column('type', INTEGER), # 1 stands for fans, 2 stands for follows
122121
Column('from_where', String(60)),
123-
Column('crawl_time', DateTime(3), default=func.now()) # DATETIME(6) means save 6 digits milliseconds
124-
# time is stored in UTC
122+
Column('crawl_time', DateTime(3)) # DATETIME(6) would keep 6 fractional-second digits (microseconds); the 3 here is presumably the precision, i.e. milliseconds - TODO confirm
123+
# time is stored in UTC
125124
)
126125

127126
# dialogue table

page_get/user.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,10 +63,11 @@ def get_url_from_web(user_id):
6363
elif domain == '100505':
6464
user = get_user_detail(user_id, html)
6565
samefollow_uid = get_samefollow_uid()
66-
if samefollow_uid:
66+
if samefollow_uid.strip() != '':
67+
samefollow_uid = samefollow_uid.split(',')
6768
url = SAMEFOLLOW_URL.format(user_id)
6869
isFanHtml = get_page(url, auth_level=2)
69-
user.isFan = person.get_isFan(isFanHtml, samefollow_uid)
70+
person.get_isFan(isFanHtml, samefollow_uid, user_id)
7071
# enterprise or service
7172
else:
7273
user = get_enterprise_detail(user_id, html)

page_parse/user/person.py

Lines changed: 43 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,12 @@
11
import re
2+
import json
23

34
from bs4 import BeautifulSoup
45

56
from ..user import public
67
from decorators import parse_decorator
7-
from db.models import User
8+
from db.models import (User, UserRelation)
9+
from db.dao import UserRelationOper
810

911

1012
@parse_decorator(0)
@@ -90,21 +92,21 @@ def get_detail(html, uid):
9092
user.description = description.encode('gbk', 'ignore').decode('gbk')
9193
elif '注册时间:' in each_str:
9294
user.register_time = each.find(attrs={'class': 'pt_detail'}).get_text().replace('\t', '').replace(
93-
'\r\n', '')
95+
'\r\n', '').replace(' ', '')
9496

9597
if '标签信息' in basic_str:
9698
basic_info = each_module.find_all(attrs={'class': 'li_1 clearfix'})
9799
for each in basic_info:
98100
if '标签:' in each.get_text():
99101
user.tags = each.find(attrs={'class': 'pt_detail'}).get_text().replace('\t', '').replace(
100-
'\n\n\n', '') .strip().replace('\r\n', ';')
102+
'\n\n\n', '') .strip().replace('\r\n', ';').replace(' ', '')
101103

102104
if '教育信息' in basic_str:
103105
basic_info = each_module.find_all(attrs={'class': 'li_1 clearfix'})
104106
for each in basic_info:
105107
if '大学:' in each.get_text():
106108
user.education_info = each.find(attrs={'class': 'pt_detail'}).get_text().replace('\r\n', ',') \
107-
.replace('\t', '').replace('\n', ';').lstrip(';').rstrip(';')
109+
.replace('\t', '').replace('\n', ';').lstrip(';').rstrip(';').replace(' ', '')
108110

109111
if '工作信息' in basic_str:
110112
basic_info = each_module.find_all(attrs={'class': 'li_1 clearfix'})
@@ -114,7 +116,7 @@ def get_detail(html, uid):
114116
jobs = each.find_all(attrs={'class': 'pt_detail'})
115117
for job in jobs:
116118
jobs_info.append(job.get_text().replace('\r\n', '').replace('\t', '').replace('\n', ''))
117-
user.work_info = ';'.join(jobs_info)
119+
user.work_info = ';'.join(jobs_info).replace(' ', '')
118120

119121
if '联系信息' in basic_str:
120122
basic_info = each_module.find_all(attrs={'class': 'li_1 clearfix'})
@@ -127,26 +129,56 @@ def get_detail(html, uid):
127129
contact_info.append('email:' + each.find(attrs={'class': 'pt_detail'}).get_text())
128130
if 'MSN:' in each.get_text():
129131
contact_info.append('msn:' + each.find(attrs={'class': 'pt_detail'}).get_text())
130-
user.contact_info = ';'.join(contact_info)
132+
user.contact_info = ';'.join(contact_info).replace(' ', '')
131133
except Exception as why:
132134
print('解析出错,具体原因为{why}'.format(why=why))
133135

134136
return user
135137

136138

137139
@parse_decorator(None)
138-
def get_isFan(html, uid):
140+
def get_isFan(html, uids, current_uid):
139141
"""
140142
:param html: samefollow page
141-
:param uid : whether this account follows uid
143+
:param uids: list of uids to check whether this account follows them
144+
:param current_uid: current crawling user
142145
:return: 1 if this account follows at least one of the given uids, otherwise 0
143146
"""
144147
soup = BeautifulSoup(html, "html.parser")
145148
scripts = soup.find_all('script')
146149
pattern = re.compile(r'FM.view\((.*)\)')
147150

151+
user_ids = list() # Contains uids that the user and crawler both follow
152+
intersection_ids = list() # Contains the intersection of param uids and user_ids
153+
relations = list() # Contains list to be stored in UserRelation table
148154
for script in scripts:
149155
m = pattern.search(script.string)
150-
if m and uid in script.string:
151-
return 1
152-
return 0
156+
# Find the <script>FM.view({"ns":"pl.content.followTab.index","domid":"Pl_Official_HisRelation__59",...
157+
if m and 'pl.content.followTab.index' in script.string:
158+
all_info = m.group(1)
159+
cont = json.loads(all_info).get('html', '')
160+
soup = BeautifulSoup(cont, 'html.parser')
161+
follows = soup.find(attrs={'class': 'follow_box'}).find_all(attrs={'class': 'follow_item'})
162+
patternUID = re.compile(r'uid=(.*?)&')
163+
for follow in follows:
164+
m = re.search(patternUID, str(follow))
165+
if m:
166+
r = m.group(1)
167+
# filter invalid ids
168+
if r.isdigit():
169+
user_ids.append(r)
170+
# Mostly the same as get_fans_or_follows(html, uid, type),
171+
# except that the following lines work out which of the given uids the user follows
172+
intersection_ids = list(set(user_ids).intersection(set(uids)))
173+
# Now store in the database
174+
type = 1
175+
n = None
176+
for uid in intersection_ids:
177+
relations.append(UserRelation(uid, current_uid, type, n, False))
178+
UserRelationOper.add_all(relations)
179+
break
180+
# legacy support
181+
if intersection_ids:
182+
return 1
183+
else:
184+
return 0

page_parse/user/public.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -196,6 +196,7 @@ def get_fans_or_follows(html, uid, type):
196196
n = n[2:len(n)-2]
197197
user_ids.append(r)
198198
relations.append(UserRelation(uid, r, type, n))
199+
break
199200

200201
UserRelationOper.add_all(relations)
201202
return user_ids

0 commit comments

Comments
 (0)