Skip to content

Commit 4a8eb2e

Browse files
sunnywaldensunnywalden
sunnywalden
authored and
sunnywalden
committed
完成五大联赛数据爬取
1 parent 03904d5 commit 4a8eb2e

File tree

10 files changed

+325
-12724
lines changed

10 files changed

+325
-12724
lines changed

.gitignore

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
.idea
2+
.venv
3+
logs/*
4+
*.pyc
5+
*.swp
6+
*.swo
7+
*.tmp
8+
*.log
9+
*.lock

bigfiveleagues/items.py

Lines changed: 23 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -10,36 +10,33 @@
1010

1111
class LeagueItem(scrapy.Item):
    """Scraped data for one of the big-five leagues."""

    id = scrapy.Field()         # primary key written to the `leagues` table
    name = scrapy.Field()       # league display name
    file_urls = scrapy.Field()  # logo URLs downloaded by the files pipeline

1917
class ClubItem(scrapy.Item):
    """Scraped data for a single club within a league."""

    id = scrapy.Field()               # primary key written to the `clubs` table
    club_league = scrapy.Field()      # name of the league this club plays in
    name = scrapy.Field()             # club display name
    file_urls = scrapy.Field()        # badge/logo URLs downloaded by the files pipeline
    club_manager = scrapy.Field()     # current manager
    club_players = scrapy.Field()     # iterable of player names
    club_ceo = scrapy.Field()         # club CEO
    club_soccerfield = scrapy.Field() # home stadium

3129
class PlayerItem(scrapy.Item):
    """Scraped data for a single player."""

    id = scrapy.Field()                  # primary key written to the `players` table
    player_league = scrapy.Field()       # league the player belongs to
    player_club = scrapy.Field()         # club the player belongs to
    name = scrapy.Field()                # player display name
    file_urls = scrapy.Field()           # portrait URLs downloaded by the files pipeline
    player_number = scrapy.Field()       # shirt number
    player_position = scrapy.Field()     # playing position
    player_nationality = scrapy.Field()  # nationality
    player_high = scrapy.Field()         # height (units per source site — TODO confirm)
    player_weight = scrapy.Field()       # weight (units per source site — TODO confirm)
    player_age = scrapy.Field()          # age
    player_networth = scrapy.Field()     # market value / net worth

bigfiveleagues/logs/leagues.log

Lines changed: 0 additions & 12311 deletions
This file was deleted.

bigfiveleagues/middlewares.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
# -*- coding: utf-8 -*-
2+
import time
23

34
# Define here the models for your spider middleware
45
#
@@ -125,7 +126,7 @@ def process_response(self, request, response, spider):
125126
def get_random_proxy(self):
126127
'''随机从文件中读取proxy'''
127128
while 1:
128-
with open('/tmp/proxies.txt', 'r') as f:
129+
with open('/tmp/proxies_leagues.txt', 'r') as f:
129130
proxies = f.readlines()
130131
if proxies:
131132
break

bigfiveleagues/pipelines.py

Lines changed: 135 additions & 129 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# -*- coding: utf-8 -*-
2+
import os
23

4+
from scrapy.pipelines.files import FilesPipeline
35
# Define your item pipelines here
46
#
57
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
@@ -10,150 +12,154 @@
1012
from scrapy import Request
1113
import pymysql
1214
from .items import LeagueItem,ClubItem,PlayerItem
13-
from scrapy.utils.project import get_project_settings #导入seetings配置
15+
from scrapy.utils.project import get_project_settings #导入settings配置
1416
import logging
1517

1618

1719
logger = logging.getLogger(__name__)
1820

19-
def dbHandle():
20-
'''1、@classmethod声明一个类方法,而对于平常我们见到的叫做实例方法。
21-
2、类方法的第一个参数cls(class的缩写,指这个类本身),而实例方法的第一个参数是self,表示该类的一个实例
22-
3、可以通过类来调用,就像C.f(),相当于java中的静态方法'''
23-
#读取settings中配置的数据库参数
24-
settings = get_project_settings()
25-
conn = pymysql.connect(
26-
host=settings['MYSQL_HOST'],
27-
db=settings['MYSQL_DBNAME'],
28-
user=settings['MYSQL_USER'],
29-
passwd=settings['MYSQL_PASSWD'],
30-
charset='utf8', # 编码要加上,否则可能出现中文乱码问题
31-
cursorclass=pymysql.cursors.DictCursor,
32-
)
33-
if conn:
34-
logger.info('Connect to mysql success!')
35-
return conn
21+
class FileDownloadPipeline(FilesPipeline):
    """Download league/club/player images through Scrapy's FilesPipeline.

    Requests every URL found in an item's ``file_urls`` field and stores
    the result as ``<kind>/<name>.jpg``, where the kind sub-directory
    (``players``, ``clubs`` or ``leagues``) is inferred from which fields
    are present on the item.
    """

    def __init__(self, store_uri, download_func=None, settings=None):
        # Fall back to the project settings when none are injected,
        # mirroring how Scrapy normally constructs the pipeline.
        if settings is None:
            settings = get_project_settings()
        super().__init__(store_uri, download_func, settings)
        self.logger = logging.getLogger(__name__)

    @classmethod
    def from_settings(cls, settings):
        """Alternate constructor used by Scrapy: FILES_STORE is the store root."""
        store_uri = settings.get('FILES_STORE')
        return cls(store_uri, settings=settings)

    def get_media_requests(self, item, info):
        """Yield one download Request per URL in ``item['file_urls']``.

        The item (and the URL's index) ride along in request.meta so
        ``file_path`` can name the stored file after the item.
        """
        for img_url in item['file_urls']:
            self.logger.info('Start download image %s', img_url)
            yield Request(img_url, meta={'item': item, 'index': item['file_urls'].index(img_url)})

    def media_failed(self, failure, request, info):
        """Log a download failure, then delegate to the default handling."""
        # Lazy %-style args instead of an f-string so formatting only
        # happens when the ERROR level is actually emitted.
        self.logger.error(
            "File (unknown-error): Error downloading file from %s referred in %s: %s",
            request.url, info.spider.name, failure)
        return super().media_failed(failure, request, info)

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the relative storage path ``<kind>/<name>.jpg`` for a file.

        The item kind is detected by probing fields unique to each item
        type (player_number -> player, club_manager -> club, else league).
        """
        item = request.meta['item']  # item forwarded from get_media_requests
        # All three kinds name the file after the item; hoisted out of the
        # branches (the original duplicated this assignment three times).
        logo_name = item['name'] + '.jpg'
        if item.get('player_number'):
            self.logger.info('player %s info scrapy now', item['name'])
            img_path = 'players/'
        elif item.get('club_manager'):
            self.logger.info('club %s info scrapy now', item['name'])
            img_path = 'clubs/'
        else:
            self.logger.info('league %s info scrapy now', item['name'])
            img_path = 'leagues/'
        # NOTE(review): FilesPipeline resolves the returned path relative to
        # FILES_STORE and creates directories itself; this mkdir creates the
        # directory in the CWD instead — kept for compatibility, but guarded
        # so a race between concurrent downloads cannot crash the pipeline.
        if not os.path.exists(img_path):
            try:
                os.mkdir(img_path)
            except OSError:
                pass
        self.logger.info('the path item picture to save is %s', img_path + logo_name)
        return img_path + logo_name
6766

6867
class LeaguesItemPipeline(object):
    """Persist League/Club/Player items into MySQL.

    1. Enable via ITEM_PIPELINES in settings.py.
    2. ``process_item`` is invoked automatically for every item the
       spiders yield.
    """

    logger = logging.getLogger(__name__)

    def dbHandle(self):
        """Open and return a pymysql connection configured from project settings."""
        settings = get_project_settings()
        conn = pymysql.connect(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWD'],
            charset='utf8',  # required, otherwise Chinese text may be garbled
            cursorclass=pymysql.cursors.DictCursor,
        )
        if conn:
            self.logger.info('Connect to mysql success!')
        return conn

    def process_item(self, item, spider):
        """Insert the item into its table (leagues / clubs / players).

        Returns the item unchanged so later pipelines still see it.

        Fixes over the previous revision:
        - `global sql, params` referenced names that never existed at
          module level (a foreign item type would have raised NameError);
          plain locals are used instead.
        - Placeholder counts now match the parameter tuples: clubs had
          8 placeholders for 5 params, players had 5 for 10 — both would
          have failed at execute time.
        - The connection/cursor are now always closed (the original
          leaked one connection per item).
        """
        sql = None
        params = None
        if isinstance(item, LeagueItem):
            self.logger.info('Handle league %s item now', item['name'])
            sql = "insert ignore into leagues values(%s,%s,%s)"
            # Keep the last URL, matching the original loop's behavior.
            final_logo_url = item['file_urls'][-1] if item['file_urls'] else ''
            params = (item['id'], item['name'], final_logo_url)
        elif isinstance(item, ClubItem):
            self.logger.info('Handle club %s item now', item['name'])
            # 5 columns: id, name, logo, manager, soccerfield.
            # NOTE(review): the original also concatenated club_players but
            # never put it in params — dead code, removed; confirm the
            # `clubs` table really has no players column.
            sql = "insert ignore into clubs values(%s,%s,%s,%s,%s)"
            final_club_logo_url = item['file_urls'][-1] if item['file_urls'] else ''
            params = (
                item['id'],
                item['name'],
                final_club_logo_url,
                item['club_manager'],
                item['club_soccerfield'],
            )
        elif isinstance(item, PlayerItem):
            self.logger.info('Handle player %s item now', item['name'])
            sql = "insert ignore into players values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            player_portrait_url = item['file_urls'][-1] if item['file_urls'] else ''
            params = (
                item['id'],
                item['name'],
                player_portrait_url,
                item['player_number'],
                item['player_position'],
                item['player_nationality'],
                item['player_high'],
                item['player_weight'],
                item['player_age'],
                item['player_networth'],
            )
        if sql is None:
            # Unknown item type: pass it through untouched.
            return item
        # sql contains %s markers, so the logger formats it with params.
        self.logger.info(sql, *params)
        conn = self.dbHandle()
        try:
            with conn.cursor() as cursor:
                cursor.execute(sql, params)
            conn.commit()
        except Exception as error:
            # Log and roll back; never let a DB error kill the crawl.
            self.logger.error(error)
            conn.rollback()
        finally:
            conn.close()  # fix leak: the original never closed the connection
        return item

0 commit comments

Comments
 (0)