|
1 | 1 | # -*- coding: utf-8 -*-
|
| 2 | +import os |
2 | 3 |
|
| 4 | +from scrapy.pipelines.files import FilesPipeline |
3 | 5 | # Define your item pipelines here
|
4 | 6 | #
|
5 | 7 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
|
|
10 | 12 | from scrapy import Request
|
11 | 13 | import pymysql
|
12 | 14 | from .items import LeagueItem,ClubItem,PlayerItem
|
13 |
| -from scrapy.utils.project import get_project_settings #导入seetings配置 |
| 15 | +from scrapy.utils.project import get_project_settings #导入settings配置 |
14 | 16 | import logging
|
15 | 17 |
|
16 | 18 |
|
17 | 19 | logger = logging.getLogger(__name__)
|
18 | 20 |
|
19 |
class FileDownloadPipeline(FilesPipeline):
    """Download logo/portrait files listed in ``item['file_urls']``.

    Subclasses Scrapy's FilesPipeline so the framework performs the
    transfers; this class only decides which URLs to fetch and where each
    file is stored below FILES_STORE.
    """

    def __init__(self, store_uri, download_func=None, settings=None):
        # Fall back to the project settings when the caller supplies none,
        # mirroring how Scrapy normally constructs the pipeline.
        if settings is None:
            settings = get_project_settings()
        super().__init__(store_uri, download_func, settings)
        self.logger = logging.getLogger(__name__)

    @classmethod
    def from_settings(cls, settings):
        """Alternate constructor used by Scrapy: store root = FILES_STORE."""
        store_uri = settings.get('FILES_STORE')
        return cls(store_uri, settings=settings)

    def get_media_requests(self, item, info):
        # enumerate() yields the position directly; the original
        # list.index() lookup was O(n) per URL and returned the wrong
        # index whenever the same URL appeared twice in file_urls.
        for index, img_url in enumerate(item['file_urls']):
            self.logger.info('Start download image %s', img_url)
            yield Request(img_url, meta={'item': item, 'index': index})

    def media_failed(self, failure, request, info):
        # Log the failure with full context, then defer to the parent
        # implementation so the framework's bookkeeping still happens.
        self.logger.error(f"File (unknown-error): Error downloading file from {request.url} referred in {info.spider.name}: {failure}")
        return super().media_failed(failure, request, info)

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path (relative to FILES_STORE) for a download.

        The item type is inferred from which fields are present: players
        carry a shirt number, clubs carry a manager, everything else is
        treated as a league.
        """
        item = request.meta['item']  # item forwarded by get_media_requests
        logo_name = item['name'] + '.jpg'
        # NOTE(review): a player whose number is 0 or empty would fall
        # through to the club/league branches — confirm numbers are 1-based.
        if item.get('player_number'):
            self.logger.info('player %s info scrapy now', item['name'])
            img_path = 'players/'
        elif item.get('club_manager'):
            self.logger.info('club %s info scrapy now', item['name'])
            img_path = 'clubs/'
        else:
            self.logger.info('league %s info scrapy now', item['name'])
            img_path = 'leagues/'
        # exist_ok avoids the check-then-create race of the original
        # os.path.exists()/os.mkdir() pair.  NOTE(review): this creates the
        # directory relative to the process CWD while the returned path is
        # relative to FILES_STORE; FilesPipeline's store creates directories
        # itself, so this line is likely redundant — confirm before removal.
        os.makedirs(img_path, exist_ok=True)
        self.logger.info('the path item picture to save is %s', img_path + logo_name)
        return img_path + logo_name
67 | 66 |
|
class LeaguesItemPipeline(object):
    """Persist League/Club/Player items into MySQL.

    1. Enable this pipeline in settings.py under ITEM_PIPELINES.
    2. Scrapy then calls process_item() automatically for every item
       the spiders yield.
    """
    logger = logging.getLogger(__name__)

    def dbHandle(self):
        """Open and return a pymysql connection configured from settings.

        Reads MYSQL_HOST, MYSQL_DBNAME, MYSQL_USER and MYSQL_PASSWD from
        the project settings.  Callers are responsible for closing the
        returned connection.
        """
        settings = get_project_settings()
        conn = pymysql.connect(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',  # explicit charset avoids mojibake with Chinese text
            cursorclass=pymysql.cursors.DictCursor,
        )
        if conn:
            self.logger.info('Connect to mysql success!')
        return conn

    # Called by Scrapy for every item.
    def process_item(self, item, spider):
        """Insert the item into its table and return the item unchanged."""
        # Build sql/params per item type BEFORE touching the database, and
        # skip unknown types explicitly.  The original used
        # `global sql, params`, which leaked values between calls and
        # raised NameError on the very first unrecognized item.
        if isinstance(item, LeagueItem):
            self.logger.info('Handle league %s item now', item['name'])
            sql = "insert ignore into leagues values(%s,%s,%s)"
            # file_urls normally holds one logo URL; keep the last (or '').
            final_logo_url = item['file_urls'][-1] if item['file_urls'] else ''
            params = (item['id'], item['name'], final_logo_url)
        elif isinstance(item, ClubItem):
            self.logger.info('Handle club %s item now', item['name'])
            # 5 columns -> 5 placeholders.  The original kept 8 placeholders
            # from an older schema against 5 parameters, so pymysql raised
            # on every club insert.  NOTE(review): item['club_players'] was
            # collected but never stored — confirm the clubs table really
            # has only these 5 columns.
            sql = "insert ignore into clubs values(%s,%s,%s,%s,%s)"
            final_club_logo_url = item['file_urls'][-1] if item['file_urls'] else ''
            params = (
                item['id'],
                item['name'],
                final_club_logo_url,
                item['club_manager'],
                item['club_soccerfield'],
            )
        elif isinstance(item, PlayerItem):
            self.logger.info('Handle player %s item now', item['name'])
            player_portrait_url = item['file_urls'][-1] if item['file_urls'] else ''
            # 10 columns -> 10 placeholders (the original had only 5, which
            # made pymysql raise on every player insert).
            sql = "insert ignore into players values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            params = (
                item['id'],
                item['name'],
                player_portrait_url,
                item['player_number'],
                item['player_position'],
                item['player_nationality'],
                item['player_high'],
                item['player_weight'],
                item['player_age'],
                item['player_networth'],
            )
        else:
            self.logger.warning('Unknown item type %s, skipping DB insert',
                                type(item).__name__)
            return item
        # Lazy %-style logging: sql already contains %s placeholders.
        self.logger.info(sql, *params)
        conn = self.dbHandle()
        cursor = conn.cursor()
        try:
            cursor.execute(sql, params)
            conn.commit()
        except Exception as error:
            # Log and roll back on failure; keep the spider running.
            self.logger.info(error)
            conn.rollback()
        finally:
            conn.close()  # the original leaked one connection per item
        return item
0 commit comments