from __future__ import print_function
+
import datetime
import shutil
import sys
else:
    raise ImportError("Error while importing Modules!")
-import generators as gens
-import utils
-import config as cfg
-import structures
-import exceptions
-
-
+gens = __import__('generators')
+utils = __import__('utils')
+cfg = __import__('config')
+structures = __import__('structures')
+exceptions = __import__('exceptions')

-__all__ = [
-    'py3', 'py2', 'setup_config', 'get', 'now', 'save_webpage', 'wrap_up'
-]

-
-def save_webpage(url, mirrors_dir, reset_config=True, **kwargs):
+def save_webpage(url, mirrors_dir, reset_config=True, *args, **kwargs):

    """ Starts crawler, archives and writes logs etc. """

-    cfg.setup_config(url, mirrors_dir, **kwargs)
+    cfg.setup_config(url, mirrors_dir, *args, **kwargs)

    # save the page
    _crawl(cfg.config['URL'])
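Note on the rewritten imports: for a top-level module, __import__('name') returns the same module object that the plain import statement would bind, so the swap above is behaviour-preserving. A minimal sketch of that equivalence, using a stdlib module as a stand-in:

    # __import__ returns the top-level module object itself, so binding its
    # result behaves like "import shutil as sh" for a top-level module.
    sh = __import__('shutil')
    import shutil
    assert sh is shutil

The new *args in save_webpage is simply forwarded to cfg.setup_config, so any positional options setup_config accepts pass straight through.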
@@ -92,7 +87,7 @@ def wrap_up():
    )'''

    # NOTE: new method, less error prone
-    # make zip archive of all the files
+    # make zip archive of all the files and not the empty folders
    archive = zipfile.ZipFile(
        os.path.abspath(cfg.config['MIRRORS_DIR']) +
        '.zip', 'w', zipfile.ZIP_DEFLATED
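The updated comment reflects how writing individual files into a ZipFile behaves: only files are added, so empty directories never appear in the archive. A rough, self-contained sketch of that files-only pattern (the directory name is a placeholder, not the module's config value):

    import os
    import zipfile

    mirrors_dir = 'saved_pages'  # placeholder for the configured mirrors directory

    archive = zipfile.ZipFile(mirrors_dir + '.zip', 'w', zipfile.ZIP_DEFLATED)
    for dirpath, _, filenames in os.walk(mirrors_dir):
        for filename in filenames:
            # only real files are written, so empty folders are skipped entirely
            full_path = os.path.join(dirpath, filename)
            archive.write(full_path, os.path.relpath(full_path, mirrors_dir))
    archive.close()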
@@ -132,6 +127,9 @@ def wrap_up():
def _can_access(user_agent, url):
    """ Determines if user-agent is allowed to access url. """

+    if cfg.config['robots'].is_dummy:
+        return True
+
    # check if website allows bot access
    if not cfg.config['ROBOTS'].can_fetch(user_agent, url) and not cfg.config['BYPASS_ROBOTS']:
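The early return assumes the object stored under cfg.config['robots'] exposes an is_dummy flag marking the case where no usable robots.txt was found. A minimal sketch of such a stand-in parser; only the is_dummy attribute comes from the diff, the class itself is illustrative:

    class DummyRobotsParser(object):
        """Stand-in used when robots.txt could not be fetched or parsed."""

        # lets _can_access short-circuit to "allowed" instead of consulting rules
        is_dummy = True

        def can_fetch(self, user_agent, url):
            # with no rules available, everything is treated as fetchable
            return True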
@@ -167,7 +165,7 @@ def get(url):
    """

    if not _can_access("*", url):
-        raise exceptions.PermissionError("Access to %s not allowed by site." % url)
+        raise exceptions.AccessError("Access to %s not allowed by site." % url)

    headers = {
        "Accept-Language": "en-US,en;q=0.5",
@@ -192,15 +190,15 @@ def get(url):
            level=4,
            to_console=True
        )
-        raise exceptions.ConnectionError(e.message)
+        raise e

    except requests.exceptions.InvalidSchema as e:
        now(
            'error :: Invalid URL',
            level=4,
            to_console=True
        )
-        raise exceptions.InvalidUrl(e.message)
+        raise e


# -----------------------------------------------------------
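Re-raising the caught exception is also the safer choice under Python 3, where exception objects no longer carry the old .message attribute, so the wrapping calls being removed would themselves fail. A minimal sketch of the log-then-re-raise pattern (the helper name is illustrative):

    import requests

    def fetch(url):
        try:
            return requests.get(url)
        except requests.exceptions.ConnectionError:
            # log here, then a bare raise preserves the original
            # exception and its traceback for the caller
            raise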
@@ -292,7 +290,7 @@ def new_file(download_loc, content_url=None, content=None, mime_type='text/html'
        f.write(content)
        f.write(_water_mark)

-    # last check if file was successfully
+    # last check if file was successfully written to the disk
    assert os.path.isfile(download_loc)

    cfg.config['DOWNLOADED_FILES'].append(download_loc)
@@ -356,6 +354,9 @@ def now(string, level=0, unbuffered=False, to_console=False, compressed=cfg.conf
    :param compressed: reduces the string length to 80 characters
    """

+    if cfg.config['quiet']:
+        return
+
    _event_level_strings = ["info", "error", "critical", "success"]

    if level == 4:
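The quiet check turns now() into a no-op before any formatting or writing happens. A minimal sketch of the same guard pattern; only the 'quiet' key comes from the diff, the rest is illustrative:

    config = {'quiet': False}

    def log_event(message):
        # bail out first, mirroring the guard added to now()
        if config['quiet']:
            return
        print(message)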
@@ -467,61 +468,62 @@ def _save_webpage(url):
crawlable_urls = list()


-def _crawl(url):
+def _crawl(url, level=0, max_level=2):
    """ Scans pages for links to other pages to save in COPY_ALL mode.
    """

    # if single webpage is requested
-    if cfg.config['copy_all'] == False:
+    if not cfg.config['copy_all']:
+
+        _save_webpage(url)
+        crawled_urls.append(url)

-        if url not in crawled_urls:
-            _save_webpage(url)
-            crawled_urls.append(url)
+        return

+    # if max deep level is reached
+    if level == max_level:
        return

-    else:
-        # crawler to extract all the valid page links on the given page
-        now('Trying to start crawler on url %s' % url)
-
-        # make a request to the page
-        req = get(url)
-
-        # something went wrong; exit
-        if req is None:
-            now('Crawler encountered an Error while requesting web page on url %s' %
-                url, level=4)
-            now('Crawler Exiting!', level=4)
-            sys.exit(1)
-
-        # page found and working
-        # make a soup of it
-        soup = bs4.BeautifulSoup(req.content, cfg.config['parser'])
-
-        # select all the links on page
-        a_tags = soup.find_all('a', href=True)
-
-        # store absolute url of them in a separate dict
-        for a_tag in a_tags:
-            # create a absolute url
-            _abs_url = utils.join_urls(url, a_tag.get('href', ''))
-
-            if _abs_url.startswith(url) and utils.url_path(url) not in ('', '/', '\\') and _abs_url not in crawled_urls and _abs_url not in crawlable_urls:
-                crawlable_urls.append(url)
-
-        # iter through all the links of website
-        for _url in crawlable_urls:
-            # if url is already saved
-            if _url in crawled_urls:
-                # go to the next url
-                continue
-
-            # otherwise save this url and add this to saved list
-            _save_webpage(_url)
-            crawled_urls.append(_url)
-
-            # send this url again for url searching
-            _crawl(_url)
+    # crawler to extract all the valid page links on the given page
+    now('Trying to start crawler on url %s' % url, to_console=True)
+
+    # make a request to the page
+    req = get(url)
+
+    # something went wrong; exit
+    if not req.ok:
+        now('Crawler encountered an Error while requesting web page on url %s' %
+            url, level=4, to_console=True)
+        now('Crawler Exiting!', level=4, to_console=True)
+        sys.exit(1)
+
+    # page found and working
+    # make a soup of it
+    soup = bs4.BeautifulSoup(req.content, cfg.config['parser'])
+
+    # select all the links on page
+    a_tags = soup.find_all('a', href=True)
+
+    # store absolute url of them
+    global crawlable_urls
+    crawlable_urls += set([urlparse.urljoin(url, i.get('href', ''))
+                           for i in a_tags if urlparse.urljoin(url, i.get('href', '')).startswith(url)])
+
+    # every url found will be checked and sent to be saved through the
+    # save_webpage method
+    for url in crawlable_urls:
+
+        # if url is already saved
+        if url in crawled_urls:
+            # go to the next url
+            continue
+
+        # otherwise save this url and add this to saved list
+        _save_webpage(url)
+        crawled_urls.append(url)
+
+        # send this url again for url searching
+        _crawl(url, level=(level + 1))

    now("Crawled URL list : ")
    now('\n'.join(crawlable_urls))
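With the new level and max_level parameters, each recursive call increases level by one and the crawl stops descending once level reaches max_level, so the default of 2 covers the links found on the start page and the links found on those pages. A rough sketch of the same depth-limited pattern in isolation (names here are illustrative, not this module's API):

    def crawl(url, seen, extract_links, level=0, max_level=2):
        # stop descending once the maximum depth is reached
        if level == max_level:
            return
        for link in extract_links(url):
            if link in seen:
                continue
            seen.append(link)
            # each recursion step moves one level deeper
            crawl(link, seen, extract_links, level=level + 1, max_level=max_level)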