Skip to content

Commit 2ebee1a

Browse files
authored
Merge pull request #87 from goodbest/master
抓评论时加入表情功能
2 parents 61de71e + 1a8c0cb commit 2ebee1a

File tree

5 files changed

+65
-4
lines changed

5 files changed

+65
-4
lines changed

db/basic.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
def get_engine():
1212
args = get_db_args()
1313
password = os.getenv('DB_PASS', args['password'])
14-
connect_str = "{}+pymysql://{}:{}@{}:{}/{}?charset=utf8".format(args['db_type'], args['user'], password,
14+
connect_str = "{}+pymysql://{}:{}@{}:{}/{}?charset=utf8mb4".format(args['db_type'], args['user'], password,
1515
args['host'], args['port'], args['db_name'])
1616
engine = create_engine(connect_str, encoding='utf-8')
1717
return engine

db/tables.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -85,6 +85,7 @@
8585
Column("id", INTEGER, primary_key=True, autoincrement=True),
8686
Column("comment_id", String(50), unique=True),
8787
Column("comment_cont", Text),
88+
Column("comment_screen_name", Text),
8889
Column("weibo_id", String(200)),
8990
Column("user_id", String(20)),
9091
Column("create_time", String(200)),

page_parse/comment.py

Lines changed: 41 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
from logger import parser
66
from db.models import WeiboComment
77
from decorators import parse_decorator
8-
8+
from utils import parse_emoji
99

1010
@parse_decorator('')
1111
def get_html_cont(html):
@@ -63,14 +63,52 @@ def get_comment_list(html, wb_id):
6363
if not cont:
6464
return list()
6565

66-
soup = BeautifulSoup(cont, 'html.parser')
66+
soup = BeautifulSoup(cont, 'html5lib')
6767
comment_list = list()
6868
comments = soup.find(attrs={'node-type': 'comment_list'}).find_all(attrs={'class': 'list_li S_line1 clearfix'})
6969

7070
for comment in comments:
7171
wb_comment = WeiboComment()
7272
try:
73-
wb_comment.comment_cont = comment.find(attrs={'class': 'WB_text'}).text.strip()
73+
cont = []
74+
first_author=True
75+
first_colon=True
76+
for content in comment.find(attrs={'class': 'WB_text'}).contents:
77+
if not content:
78+
continue
79+
if content.name =='a':
80+
if first_author:
81+
first_author=False
82+
continue
83+
else:
84+
if content.text:
85+
cont.append(content.text)
86+
87+
elif content.name=='img':
88+
img_title = content.get('title', '')
89+
if img_title=='':
90+
img_title = content.get('alt', '')
91+
if img_title=='':
92+
img_src = content.get('src','')
93+
img_src = img_src.split('/')[-1].split('.',1)[0]
94+
try:
95+
img_title = parse_emoji.softband_to_utf8(img_src)
96+
except Exception as e:
97+
parser.error('解析表情失败,具体信息是{},{}'.format(e, comment))
98+
img_title = ''
99+
cont.append(img_title)
100+
101+
else:
102+
if first_colon:
103+
if content.find(':')==0:
104+
cont.append(content.replace(':','',1))
105+
first_colon=False
106+
else:
107+
cont.append(content)
108+
109+
wb_comment.comment_cont = ''.join(cont)
110+
wb_comment.comment_screen_name =comment.find(attrs={'class': 'WB_text'}).find('a').text
111+
74112
wb_comment.comment_id = comment['comment_id']
75113
# TODO 将wb_comment.user_id加入待爬队列(seed_ids)
76114
wb_comment.user_id = comment.find(attrs={'class': 'WB_text'}).find('a').get('usercard')[3:]

utils/emoji_ios6.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.

utils/parse_emoji.py

Lines changed: 21 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,21 @@
1+
import json
2+
3+
4+
5+
def load_emoji_map(fn='utils/emoji_ios6.json'):
    """Load the emoji lookup table from a JSON file.

    The file must contain a JSON array of objects, each providing an
    ``sb`` key and a ``utf8`` key.

    :param fn: path to the JSON mapping file (a relative path is resolved
        against the current working directory).
    :return: dict mapping each lower-cased ``sb`` value to its ``utf8`` value.
    :raises OSError: if the file cannot be opened.
    :raises KeyError: if an entry lacks the ``sb`` or ``utf8`` key.
    """
    # Open through a context manager so the file handle is always closed,
    # even when json.load raises on malformed content.
    with open(fn, encoding='utf-8') as fp:
        json_data = json.load(fp)
    # Keys are lower-cased so lookups can be case-insensitive.
    return {m['sb'].lower(): m['utf8'] for m in json_data}
11+
12+
13+
def softband_to_utf8(emoji):
    """Convert an emoji code to its character using the module-level ``sb_dict``.

    The lookup is case-insensitive. Returns an empty string when the code
    is not in the table or maps to an empty value.
    """
    hex_digits = sb_dict.get(emoji.lower(), '')
    if not hex_digits:
        return ''
    # The table value is a hex string of the character's UTF-8 bytes.
    return bytes.fromhex(hex_digits).decode('utf-8')
19+
20+
# Build the lookup table once at import time; softband_to_utf8 reads it.
sb_dict = load_emoji_map()
21+

0 commit comments

Comments
 (0)