Commit 77b4eaf

committed: v2.1 major fixes
1 parent d3bba5b commit 77b4eaf

17 files changed: +271 −261 lines

build/lib/pywebcopy/__init__.py

Lines changed: 20 additions & 9 deletions
@@ -9,23 +9,34 @@
 
 """
 
-import sys
-import core
-import structures
-import config
-import utils
-import generators
-import exceptions
-
+from __future__ import absolute_import
 
-__version__ = config.config['version']
 __author__ = 'Raja Tomar'
 __copyright__ = 'Copyright Aeroson Systems & Co.'
 __license__ = 'Licensed under MIT'
 __email__ = 'rajatomar788@gmail.com'
+__package__ = 'pywebcopy'
+
+
+import os
+import sys
+
+sys.path.append(os.path.abspath(os.path.dirname(__file__)))
+
+from pywebcopy import core
+from pywebcopy import structures
+from pywebcopy import config
+from pywebcopy import utils
+from pywebcopy import generators
+from pywebcopy import exceptions
+
+
+__version__ = config.config['version']
 
 
 __all__ = [
     '__version__', '__author__', '__copyright__', '__license__', '__email__',
     'core', 'structures', 'config', 'utils', 'generators', 'exceptions'
 ]
+
+

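With the absolute-import layout above, the submodules are re-exported from the package itself instead of being imported as bare top-level names. A minimal usage sketch of the resulting public surface; the printed value simply reflects whatever config.config['version'] holds ('2.0.0' in this commit):

    import pywebcopy

    # __version__ is pulled from config.config['version'] at import time
    print(pywebcopy.__version__)

    # the names listed in __all__ are importable from the package directly
    from pywebcopy import core, config, exceptions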
build/lib/pywebcopy/config.py

Lines changed: 15 additions & 16 deletions
@@ -12,19 +12,15 @@
 
 import os
 import re
-import core
-import utils
-import structures
-import exceptions
+core = __import__('core')
+utils = __import__('utils')
+structures = __import__('structures')
+exceptions = __import__('exceptions')
 
 
-__all__ = [
-    'config', 'setup_config', 'update_config', 'reset_config'
-]
-
 
 global version
-version = '1.10'
+version = '2.0.0'
 
 
 config = structures.CaseInsensitiveDict({
@@ -46,14 +42,15 @@
     'LOAD_IMAGES': True,
     # to download js file or not
     'LOAD_JAVASCRIPT': True,
-    # to download every page available inside url tree turn this True
-    'COPY_ALL': False,
     # to overwrite the existing files if found
     'OVER_WRITE': False,
     # allowed file extensions
     'ALLOWED_FILE_EXT': ['.html', '.php', '.asp', '.htm', '.xhtml', '.css',
-                         '.json', '.js', '.xml', '.svg', '.gif', '.ico', '.jpeg',
-                         '.jpg', '.png', '.ttf', '.eot', '.otf', '.woff'],
+                         '.json', '.js', '.xml', '.svg', '.gif', '.ico', '.jpeg', '.pdf',
+                         '.jpg', '.png', '.ttf', '.eot', '.otf', '.woff', '.woff2',],
+
+    # Completely silents the script except 'trace' functions in debug mode
+    'QUIET' : False,
     # log file path
     'LOG_FILE': None,
     # reduce log produced by removing unnecessary info from log file
@@ -77,17 +74,19 @@
 
     # HANDLE WITH CARE
 
+    # to download every page available inside url tree turn this True
+    'COPY_ALL': False,
     # user-agent of this scripts requests
     'USER_AGENT' : 'Mozilla/5.0 (PywebcopyBot/{})'.format(version),
     # dummy robots.txt class
-    'ROBOTS' : structures.RobotsTxt(),
+    'ROBOTS' : structures.RobotsTxt(''),
     # bypass sites policy
     'BYPASS_ROBOTS' : False
 })
 
 
 """ This is used in to store default config as backup """
-default_config = config
+default_config = dict(config)
 
 
 def update_config(**kwargs):
@@ -98,7 +97,7 @@ def update_config(**kwargs):
 def reset_config():
     """ Resets all to configuration to default state. """
     global config
-    config = default_config
+    config = structures.CaseInsensitiveDict(default_config)
 
 

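The default_config = dict(config) change is the functional core of this hunk: the old assignment merely aliased the live CaseInsensitiveDict, so every later mutation of config also rewrote the "backup" and reset_config() restored nothing. A standalone sketch of the aliasing problem with plain dicts (illustration only, not pywebcopy code):

    # alias: one dict, two names, so the "backup" follows every edit
    live = {'OVER_WRITE': False}
    backup = live
    live['OVER_WRITE'] = True
    assert backup['OVER_WRITE'] is True   # the backup silently changed too

    # shallow copy: the backup keeps the original values, so a reset works
    live = {'OVER_WRITE': False}
    backup = dict(live)
    live['OVER_WRITE'] = True
    assert backup['OVER_WRITE'] is False  # reset_config can restore this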
build/lib/pywebcopy/core.py

Lines changed: 67 additions & 65 deletions
@@ -11,6 +11,7 @@
 
 from __future__ import print_function
 
+
 import datetime
 import shutil
 import sys
@@ -31,23 +32,17 @@
 else:
     raise ImportError("Error while importing Modules!")
 
-import generators as gens
-import utils
-import config as cfg
-import structures
-import exceptions
-
-
+gens = __import__('generators')
+utils = __import__('utils')
+cfg = __import__('config')
+structures = __import__('structures')
+exceptions = __import__('exceptions')
 
-__all__ = [
-    'py3', 'py2', 'setup_config', 'get', 'now', 'save_webpage', 'wrap_up'
-]
 
-
-def save_webpage(url, mirrors_dir, reset_config=True, **kwargs):
+def save_webpage(url, mirrors_dir, reset_config=True, *args, **kwargs):
     """ Starts crawler, archives and writes logs etc. """
 
-    cfg.setup_config(url, mirrors_dir, **kwargs)
+    cfg.setup_config(url, mirrors_dir, *args, **kwargs)
 
     # save the page
     _crawl(cfg.config['URL'])
@@ -92,7 +87,7 @@ def wrap_up():
     )'''
 
     # NOTE: new method, less error prone
-    # make zip archive of all the files
+    # make zip archive of all the files and not the empty folders
     archive = zipfile.ZipFile(
         os.path.abspath(cfg.config['MIRRORS_DIR']) +
         '.zip', 'w', zipfile.ZIP_DEFLATED
@@ -132,6 +127,9 @@ def wrap_up():
 def _can_access(user_agent, url):
     """ Determines if user-agent is allowed to access url. """
 
+    if cfg.config['robots'].is_dummy:
+        return True
+
     # check if website allows bot access
     if not cfg.config['ROBOTS'].can_fetch(user_agent, url) and not cfg.config['BYPASS_ROBOTS']:
 
@@ -167,7 +165,7 @@ def get(url):
     """
 
     if not _can_access("*", url):
-        raise exceptions.PermissionError("Access to %s not allowed by site." % url)
+        raise exceptions.AccessError("Access to %s not allowed by site." % url)
 
     headers = {
         "Accept-Language": "en-US,en;q=0.5",
@@ -192,15 +190,15 @@ def get(url):
             level=4,
             to_console=True
         )
-        raise exceptions.ConnectionError(e.message)
+        raise e
 
     except requests.exceptions.InvalidSchema as e:
         now(
             'error :: Invalid URL',
             level=4,
             to_console=True
         )
-        raise exceptions.InvalidUrl(e.message)
+        raise e
 
 
 # -----------------------------------------------------------
@@ -292,7 +290,7 @@ def new_file(download_loc, content_url=None, content=None, mime_type='text/html'
         f.write(content)
         f.write(_water_mark)
 
-    # last check if file was successfully
+    # last check if file was successfully written to the disk
    assert os.path.isfile(download_loc)
 
     cfg.config['DOWNLOADED_FILES'].append(download_loc)
@@ -356,6 +354,9 @@ def now(string, level=0, unbuffered=False, to_console=False, compressed=cfg.conf
     :param compressed: reduces the string length to 80 characters
     """
 
+    if cfg.config['quiet']:
+        return
+
     _event_level_strings = ["info", "error", "critical", "success"]
 
     if level == 4:
@@ -467,61 +468,62 @@ def _save_webpage(url):
 crawlable_urls = list()
 
 
-def _crawl(url):
+def _crawl(url, level=0, max_level=2):
     """ Scans pages for links to other pages to save in COPY_ALL mode.
     """
 
     # if single webpage is requested
-    if cfg.config['copy_all'] == False:
+    if not cfg.config['copy_all']:
+
+        _save_webpage(url)
+        crawled_urls.append(url)
 
-        if url not in crawled_urls:
-            _save_webpage(url)
-            crawled_urls.append(url)
+        return
 
+    # if max deep level is reached
+    if level == max_level:
         return
 
-    else:
-        # crawler to extract all the valid page links on the given page
-        now('Trying to start crawler on url %s' % url)
-
-        # make a request to the page
-        req = get(url)
-
-        # something went wrong; exit
-        if req is None:
-            now('Crawler encountered an Error while requesting web page on url %s' %
-                url, level=4)
-            now('Crawler Exiting!', level=4)
-            sys.exit(1)
-
-        # page found and working
-        # make a soup of it
-        soup = bs4.BeautifulSoup(req.content, cfg.config['parser'])
-
-        # select all the links on page
-        a_tags = soup.find_all('a', href=True)
-
-        # store absolute url of them in a separate dict
-        for a_tag in a_tags:
-            # create a absolute url
-            _abs_url = utils.join_urls(url, a_tag.get('href', ''))
-
-            if _abs_url.startswith(url) and utils.url_path(url) not in ('', '/', '\\') and _abs_url not in crawled_urls and _abs_url not in crawlable_urls:
-                crawlable_urls.append(url)
-
-        # iter through all the links of website
-        for _url in crawlable_urls:
-            # if url is already saved
-            if _url in crawled_urls:
-                # go to the next url
-                continue
-
-            # otherwise save this url and add this to saved list
-            _save_webpage(_url)
-            crawled_urls.append(_url)
-
-            # send this url again for url searching
-            _crawl(_url)
+    # crawler to extract all the valid page links on the given page
+    now('Trying to start crawler on url %s' % url, to_console=True)
+
+    # make a request to the page
+    req = get(url)
+
+    # something went wrong; exit
+    if not req.ok:
+        now('Crawler encountered an Error while requesting web page on url %s' %
+            url, level=4, to_console=True)
+        now('Crawler Exiting!', level=4, to_console=True)
+        sys.exit(1)
+
+    # page found and working
+    # make a soup of it
+    soup = bs4.BeautifulSoup(req.content, cfg.config['parser'])
+
+    # select all the links on page
+    a_tags = soup.find_all('a', href=True)
+
+    # store absolute url of them
+    global crawlable_urls
+    crawlable_urls += set([urlparse.urljoin(url, i.get('href', ''))
+                           for i in a_tags if urlparse.urljoin(url, i.get('href', '')).startswith(url)])
+
+    # every url found will be checked and sent to be saved through the
+    # save_webpage method
+    for url in crawlable_urls:
+
+        # if url is already saved
+        if url in crawled_urls:
+            # go to the next url
+            continue
+
+        # otherwise save this url and add this to saved list
+        _save_webpage(url)
+        crawled_urls.append(url)
+
+        # send this url again for url searching
+        _crawl(url, level=(level + 1))
 
     now("Crawled URL list : ")
     now('\n'.join(crawlable_urls))

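save_webpage stays the public entry point of this module: it forwards its arguments to cfg.setup_config and then hands cfg.config['URL'] to _crawl, which in COPY_ALL mode now stops recursing once level reaches max_level (2 by default) instead of relying only on the crawled-URL list. A minimal call sketch; the URL and output directory are placeholders, and behaviour beyond the two positional arguments is assumed to come from the config shown above:

    from pywebcopy import core

    # single-page save; options such as 'COPY_ALL', 'QUIET' and 'OVER_WRITE'
    # are read from pywebcopy.config.config at crawl time
    core.save_webpage(
        'http://example.com/',        # placeholder target URL
        '/tmp/example_mirror',        # placeholder mirrors_dir
    )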
build/lib/pywebcopy/exceptions.py

Lines changed: 9 additions & 16 deletions
@@ -4,40 +4,33 @@
 aerwebcopy.exceptions
 ~~~~~~~~~~~~~~~
 
-* DO NOT TOUCH *
-
 Exceptions which can occur in aerwebcopy engine.
 """
 
 
-class BaseError(Exception):
+class PywebcopyException(IOError):
     """ Base class for other exceptions which are defined. """
-    pass
+    def __init__(self, *args, **kwargs):
+        super(PywebcopyException, self).__init__(*args, **kwargs)
 
 
-class PermissionError(BaseError):
+class AccessError(PywebcopyException):
     """ Access to resource not allowed. """
-    pass
 
 
-class InvalidUrl(BaseError):
+class InvalidUrl(PywebcopyException):
     """ Supplied url is not a valid URL. """
-    pass
 
 
-class InvalidFilename(BaseError):
+class InvalidFilename(PywebcopyException):
     """ Filename is either too long or contains special characters
     which are not supported by filesystem. """
-    pass
 
 
-class UndefinedConfigValue(BaseError):
+class UndefinedConfigValue(PywebcopyException):
     """ If a specific configuration value is set to None """
-    pass
 
 
-class ConnectionError(BaseError):
+class ConnectError(PywebcopyException):
     """ Internet connection is not found. """
-    pass
-
-
+

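Since every class above now derives from PywebcopyException, which itself subclasses IOError, callers can trap the whole family with a single handler, and the renames stop the module from shadowing the Python 3 built-ins PermissionError and ConnectionError. A minimal sketch of catching the new hierarchy (the URL is a placeholder):

    from pywebcopy import core, exceptions

    try:
        response = core.get('http://example.com/private/')  # placeholder URL
    except exceptions.AccessError:
        # raised by core.get() when robots.txt disallows the fetch
        print('Fetch blocked by the site policy')
    except exceptions.PywebcopyException as err:
        # any other pywebcopy-defined failure; all of them subclass IOError
        print('pywebcopy error: %s' % err)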