Merge pull request #87 from goodbest/master

ResolveWang · web-flow · commit 2ebee1a92317 · 2018-04-20T09:55:43.000+08:00
抓评论时加入表情功能
diff --git a/db/basic.py b/db/basic.py
@@ -11,7 +11,7 @@
 def get_engine():
     args = get_db_args()
     password = os.getenv('DB_PASS', args['password'])
-    connect_str = "{}+pymysql://{}:{}@{}:{}/{}?charset=utf8".format(args['db_type'], args['user'], password,
+    connect_str = "{}+pymysql://{}:{}@{}:{}/{}?charset=utf8mb4".format(args['db_type'], args['user'], password,
                                                              args['host'], args['port'], args['db_name'])
     engine = create_engine(connect_str, encoding='utf-8')
     return engine
diff --git a/db/tables.py b/db/tables.py
@@ -85,6 +85,7 @@
                       Column("id", INTEGER, primary_key=True, autoincrement=True),
                       Column("comment_id", String(50), unique=True),
                       Column("comment_cont", Text),
+                      Column("comment_screen_name", Text),
                       Column("weibo_id", String(200)),
                       Column("user_id", String(20)),
                       Column("create_time", String(200)),
diff --git a/page_parse/comment.py b/page_parse/comment.py
@@ -5,7 +5,7 @@
 from logger import parser
 from db.models import WeiboComment
 from decorators import parse_decorator
-
+from utils import parse_emoji
 
 @parse_decorator('')
 def get_html_cont(html):
@@ -63,14 +63,52 @@ def get_comment_list(html, wb_id):
     if not cont:
         return list()
 
-    soup = BeautifulSoup(cont, 'html.parser')
+    soup = BeautifulSoup(cont, 'html5lib')
     comment_list = list()
     comments = soup.find(attrs={'node-type': 'comment_list'}).find_all(attrs={'class': 'list_li S_line1 clearfix'})
 
     for comment in comments:
         wb_comment = WeiboComment()
         try:
-            wb_comment.comment_cont = comment.find(attrs={'class': 'WB_text'}).text.strip()
+            cont = []
+            first_author=True
+            first_colon=True
+            for content in comment.find(attrs={'class': 'WB_text'}).contents:
+                if not content:
+                    continue
+                if content.name =='a':
+                    if first_author:
+                        first_author=False
+                        continue
+                    else:
+                        if content.text:
+                            cont.append(content.text)
+                    
+                elif content.name=='img':
+                    img_title = content.get('title', '')
+                    if img_title=='':
+                        img_title = content.get('alt', '')
+                    if img_title=='':
+                        img_src = content.get('src','')
+                        img_src = img_src.split('/')[-1].split('.',1)[0]
+                        try:
+                            img_title = parse_emoji.softband_to_utf8(img_src)
+                        except Exception as e:
+                            parser.error('解析表情失败，具体信息是{},{}'.format(e, comment))
+                            img_title = ''
+                    cont.append(img_title)
+
+                else:
+                    if first_colon:
+                        if content.find('：')==0:
+                            cont.append(content.replace('：','',1))
+                            first_colon=False
+                    else:            
+                        cont.append(content)
+
+            wb_comment.comment_cont = ''.join(cont)
+            wb_comment.comment_screen_name =comment.find(attrs={'class': 'WB_text'}).find('a').text
+            
             wb_comment.comment_id = comment['comment_id']
             # TODO 将wb_comment.user_id加入待爬队列（seed_ids）
             wb_comment.user_id = comment.find(attrs={'class': 'WB_text'}).find('a').get('usercard')[3:]
diff --git a/utils/emoji_ios6.json b/utils/emoji_ios6.json
diff --git a/utils/parse_emoji.py b/utils/parse_emoji.py
@@ -0,0 +1,21 @@
+import json
+
+
+
+def load_emoji_map(fn='utils/emoji_ios6.json'):
+    """Return {lower-cased 'sb' value: 'utf8' value} built from the JSON list in file *fn*."""
+    # Use a context manager so the file handle is closed instead of leaked.
+    with open(fn, encoding='utf-8') as emoji_file:
+        json_data = json.load(emoji_file)
+    return {m['sb'].lower(): m['utf8'] for m in json_data}
+
+
+def softband_to_utf8(emoji):
+    """Look up *emoji* (case-insensitively) in sb_dict and decode its hex value; '' if absent."""
+    hex_emoji = sb_dict.get(emoji.lower(), '')
+    if not hex_emoji:
+        return ''
+    return bytes.fromhex(hex_emoji).decode('utf-8')
+
+sb_dict = load_emoji_map()  # built once at import time from the default JSON path; read by softband_to_utf8
+