Commit eb1e1e6

committed: 提交代码 ("Commit code")
1 parent c104012 commit eb1e1e6

18 files changed: +382 −1 lines changed

fans/README.md

Lines changed: 1 addition & 1 deletion
@@ -16,7 +16,7 @@ Python技术 official-account article code repository

 [PyAutoGUI, image uploads made easy!](https://github.com/JustDoPython/python-examples/tree/master/fans/imgupload): PyAutoGUI, image uploads made easy!

-
+[To buy a car, I scraped Dongchedi!](https://github.com/JustDoPython/python-examples/tree/master/fans/scrapydcd): To buy a car, I scraped Dongchedi!

fans/scrapydcd/dcd/dcd/__init__.py

Whitespace-only changes.
Binary files not shown (4).
Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
# -*- coding: utf-8 -*-
import time
from selenium import webdriver
from scrapy.http.response.html import HtmlResponse


class DcdDownloaderMiddleware(object):

    def __init__(self):
        # Set up the Chrome browser used for rendering
        options = webdriver.ChromeOptions()
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-gpu')
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--ignore-ssl-errors')

        # Path to a local chromedriver; adjust for your machine
        self.driver = webdriver.Chrome(executable_path=r"C:\drf2\drf2\chromedriver.exe", options=options)
        self.driver.maximize_window()

    # Override process_request
    def process_request(self, request, spider):
        print('request.url', request.url)
        self.driver.get(request.url)
        js = 'return document.body.scrollHeight;'
        height = 0
        if request.url != 'https://www.dongchedi.com/auto/library/x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x':
            while True:
                new_height = self.driver.execute_script(js)
                if new_height > height:
                    # Keep scrolling until the page height stops growing
                    self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                    height = new_height
                    time.sleep(1)
                else:
                    print("Scrolled to the bottom of the page!")
                    break
        source = self.driver.page_source
        # Create a Response object that wraps the rendered page source
        response = HtmlResponse(url=self.driver.current_url, body=source, request=request, encoding="utf-8")
        return response
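
Note that this middleware never quits the Chrome driver, so a browser process is left running after the crawl finishes. A minimal sketch of a cleanup hook, assuming the standard Scrapy signals API (this handler is not part of the committed code):

    @classmethod
    def from_crawler(cls, crawler):
        s = cls()
        # Hypothetical addition: quit Chrome when the spider finishes,
        # so no orphaned browser/driver processes are left behind.
        crawler.signals.connect(s.spider_closed, signal=signals.spider_closed)
        return s

    def spider_closed(self, spider):
        self.driver.quit()

This requires `from scrapy import signals`, which the standalone file above does not import (the middlewares.py version further down already does).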

fans/scrapydcd/dcd/dcd/items.py

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class DcdItem(scrapy.Item):
    # Brand
    brand = scrapy.Field()
    # Model
    name = scrapy.Field()
    # Score
    score = scrapy.Field()
    # Highlights
    title = scrapy.Field()
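
The spider module itself is among the files not shown in this diff. For orientation, a hypothetical parse method that fills a DcdItem could look like the sketch below; the start URL is taken from the middleware further down, but the spider name and every CSS selector are illustrative assumptions about dongchedi.com's markup, not taken from the commit:

import scrapy
from dcd.items import DcdItem


class DcdSpider(scrapy.Spider):
    name = 'dcd'
    allowed_domains = ['dongchedi.com']
    start_urls = ['https://www.dongchedi.com/auto/library/x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x']

    def parse(self, response):
        # Selectors below are placeholders -- the real class names depend on the site.
        for card in response.css('li.car-card'):
            item = DcdItem()
            item['brand'] = card.css('.brand::text').get()    # brand
            item['name'] = card.css('.series::text').get()    # model
            item['score'] = card.css('.score::text').get()    # score
            item['title'] = card.css('.tag::text').get()      # highlights
            yield item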

fans/scrapydcd/dcd/dcd/middlewares.py

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
# Define here the models for your spider middleware
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html

import time
from selenium import webdriver
from scrapy import signals
from scrapy.http.response.html import HtmlResponse

# useful for handling different item types with a single interface
from itemadapter import is_item, ItemAdapter


class DcdSpiderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, or item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Request or item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class DcdDownloaderMiddleware:
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    def __init__(self):
        # Set up the Chrome browser used for rendering
        options = webdriver.ChromeOptions()
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-gpu')
        options.add_argument('--ignore-certificate-errors')
        options.add_argument('--ignore-ssl-errors')

        # Path to a local chromedriver; adjust for your machine
        self.driver = webdriver.Chrome(executable_path=r"C:\drf2\drf2\chromedriver.exe", options=options)
        self.driver.maximize_window()

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        print('request.url', request.url)
        self.driver.get(request.url)
        js = 'return document.body.scrollHeight;'
        height = 0
        if request.url != 'https://www.dongchedi.com/auto/library/x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x-x':
            while True:
                new_height = self.driver.execute_script(js)
                if new_height > height:
                    # Keep scrolling until the page height stops growing
                    self.driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
                    height = new_height
                    time.sleep(1)
                else:
                    print("Scrolled to the bottom of the page!")
                    break
        source = self.driver.page_source
        # Create a Response object that wraps the rendered page source
        response = HtmlResponse(url=self.driver.current_url, body=source, request=request, encoding="utf-8")
        return response

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response

    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
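
settings.py is not shown in this diff, but the Selenium middleware only runs if it is registered there. A minimal sketch, assuming the default layout produced by scrapy startproject dcd:

# settings.py (sketch; the committed settings file is not shown above)
DOWNLOADER_MIDDLEWARES = {
    # 543 is the priority slot Scrapy's own template suggests
    # for a custom downloader middleware.
    'dcd.middlewares.DcdDownloaderMiddleware': 543,
}

With an entry like this in place, every request is routed through process_request above, so pages arrive in the spider already rendered and fully scrolled.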

fans/scrapydcd/dcd/dcd/pipelines.py

Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html


# useful for handling different item types with a single interface
from itemadapter import ItemAdapter


class DcdPipeline:
    def process_item(self, item, spider):
        return item
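
DcdPipeline is still the stub generated by scrapy startproject. If the scraped rows should end up on disk, a hedged example of a CSV-writing variant is below; the field names come from DcdItem, while the class name and output path are arbitrary choices not present in the commit:

import csv

from itemadapter import ItemAdapter


class DcdCsvPipeline:
    def open_spider(self, spider):
        # Output path is an arbitrary example choice.
        self.file = open('dcd_cars.csv', 'w', newline='', encoding='utf-8')
        self.writer = csv.DictWriter(self.file, fieldnames=['brand', 'name', 'score', 'title'])
        self.writer.writeheader()

    def process_item(self, item, spider):
        self.writer.writerow(ItemAdapter(item).asdict())
        return item

    def close_spider(self, spider):
        self.file.close()

Like any pipeline, it would also need an ITEM_PIPELINES entry in settings.py, e.g. {'dcd.pipelines.DcdCsvPipeline': 300}.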
