from __future__ import print_function
+
import datetime
import shutil
import sys
else:
    raise ImportError("Error while importing Modules!")
-import generators as gens
-import utils
-import config as cfg
-import structures
-import exceptions
-
-
+gens = __import__('generators')
+utils = __import__('utils')
+cfg = __import__('config')
+structures = __import__('structures')
+exceptions = __import__('exceptions')

-__all__ = [
-    'py3', 'py2', 'setup_config', 'get', 'now', 'save_webpage', 'wrap_up'
-]

-
-def save_webpage(url, mirrors_dir, reset_config=True, **kwargs):
+def save_webpage(url, mirrors_dir, reset_config=True, *args, **kwargs):

    """ Starts crawler, archives and writes logs etc. """

-    cfg.setup_config(url, mirrors_dir, **kwargs)
+    cfg.setup_config(url, mirrors_dir, *args, **kwargs)

    # save the page
    _crawl(cfg.config['URL'])
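Note on the rewritten imports: for a top-level module, __import__('name') returns the same module object that the plain import statement would bind, so the swap above is behaviour-preserving. A minimal sketch of that equivalence, using a stdlib module as a stand-in:

    # __import__ returns the top-level module object itself, so binding its
    # result behaves like "import shutil as sh" for a top-level module.
    sh = __import__('shutil')
    import shutil
    assert sh is shutil

The new *args in save_webpage is simply forwarded to cfg.setup_config, so any positional options setup_config accepts pass straight through.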
@@ -92,7 +87,7 @@ def wrap_up():
    )'''

    # NOTE: new method, less error prone
-    # make zip archive of all the files
+    # make zip archive of all the files and not the empty folders
    archive = zipfile.ZipFile(
        os.path.abspath(cfg.config['MIRRORS_DIR']) +
        '.zip', 'w', zipfile.ZIP_DEFLATED
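The updated comment reflects how writing individual files into a ZipFile behaves: only files are added, so empty directories never appear in the archive. A rough, self-contained sketch of that files-only pattern (the directory name is a placeholder, not the module's config value):

    import os
    import zipfile

    mirrors_dir = 'saved_pages'  # placeholder for the configured mirrors directory

    archive = zipfile.ZipFile(mirrors_dir + '.zip', 'w', zipfile.ZIP_DEFLATED)
    for dirpath, _, filenames in os.walk(mirrors_dir):
        for filename in filenames:
            # only real files are written, so empty folders are skipped entirely
            full_path = os.path.join(dirpath, filename)
            archive.write(full_path, os.path.relpath(full_path, mirrors_dir))
    archive.close()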
@@ -132,6 +127,9 @@ def wrap_up():
def _can_access(user_agent, url):
    """ Determines if user-agent is allowed to access url. """

+    if cfg.config['robots'].is_dummy:
+        return True
+
    # check if website allows bot access
    if not cfg.config['ROBOTS'].can_fetch(user_agent, url) and not cfg.config['BYPASS_ROBOTS']:
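The early return assumes the object stored under cfg.config['robots'] exposes an is_dummy flag marking the case where no usable robots.txt was found. A minimal sketch of such a stand-in parser; only the is_dummy attribute comes from the diff, the class itself is illustrative:

    class DummyRobotsParser(object):
        """Stand-in used when robots.txt could not be fetched or parsed."""

        # lets _can_access short-circuit to "allowed" instead of consulting rules
        is_dummy = True

        def can_fetch(self, user_agent, url):
            # with no rules available, everything is treated as fetchable
            return True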
@@ -167,7 +165,7 @@ def get(url):
    """

    if not _can_access("*", url):
-        raise exceptions.PermissionError("Access to %s not allowed by site." % url)
+        raise exceptions.AccessError("Access to %s not allowed by site." % url)

    headers = {
        "Accept-Language": "en-US,en;q=0.5",
@@ -192,15 +190,15 @@ def get(url):
            level=4,
            to_console=True
        )
-        raise exceptions.ConnectionError(e.message)
+        raise e

    except requests.exceptions.InvalidSchema as e:
        now(
            'error :: Invalid URL',
            level=4,
            to_console=True
        )
-        raise exceptions.InvalidUrl(e.message)
+        raise e


# -----------------------------------------------------------
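Re-raising the caught exception is also the safer choice under Python 3, where exception objects no longer carry the old .message attribute, so the wrapping calls being removed would themselves fail. A minimal sketch of the log-then-re-raise pattern (the helper name is illustrative):

    import requests

    def fetch(url):
        try:
            return requests.get(url)
        except requests.exceptions.ConnectionError:
            # log here, then a bare raise preserves the original
            # exception and its traceback for the caller
            raise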
@@ -292,7 +290,7 @@ def new_file(download_loc, content_url=None, content=None, mime_type='text/html'
        f.write(content)
        f.write(_water_mark)

-    # last check if file was successfully
+    # last check if file was successfully written to the disk
    assert os.path.isfile(download_loc)

    cfg.config['DOWNLOADED_FILES'].append(download_loc)
@@ -356,6 +354,9 @@ def now(string, level=0, unbuffered=False, to_console=False, compressed=cfg.conf
    :param compressed: reduces the string length to 80 characters
    """

+    if cfg.config['quiet']:
+        return
+
    _event_level_strings = ["info", "error", "critical", "success"]

    if level == 4:
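The quiet check turns now() into a no-op before any formatting or writing happens. A minimal sketch of the same guard pattern; only the 'quiet' key comes from the diff, the rest is illustrative:

    config = {'quiet': False}

    def log_event(message):
        # bail out first, mirroring the guard added to now()
        if config['quiet']:
            return
        print(message)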
@@ -467,61 +468,62 @@ def _save_webpage(url):
crawlable_urls = list()


-def _crawl(url):
+def _crawl(url, level=0, max_level=2):
    """ Scans pages for links to other pages to save in COPY_ALL mode.
    """

    # if single webpage is requested
-    if cfg.config['copy_all'] == False:
+    if not cfg.config['copy_all']:
+
+        _save_webpage(url)
+        crawled_urls.append(url)

-        if url not in crawled_urls:
-            _save_webpage(url)
-            crawled_urls.append(url)
+        return

+    # if max deep level is reached
+    if level == max_level:
        return

-    else:
-        # crawler to extract all the valid page links on the given page
-        now('Trying to start crawler on url %s' % url)
-
-        # make a request to the page
-        req = get(url)
-
-        # something went wrong; exit
-        if req is None:
-            now('Crawler encountered an Error while requesting web page on url %s' %
-                url, level=4)
-            now('Crawler Exiting!', level=4)
-            sys.exit(1)
-
-        # page found and working
-        # make a soup of it
-        soup = bs4.BeautifulSoup(req.content, cfg.config['parser'])
-
-        # select all the links on page
-        a_tags = soup.find_all('a', href=True)
-
-        # store absolute url of them in a separate dict
-        for a_tag in a_tags:
-            # create a absolute url
-            _abs_url = utils.join_urls(url, a_tag.get('href', ''))
-
-            if _abs_url.startswith(url) and utils.url_path(url) not in ('', '/', '\\') and _abs_url not in crawled_urls and _abs_url not in crawlable_urls:
-                crawlable_urls.append(url)
-
-        # iter through all the links of website
-        for _url in crawlable_urls:
-            # if url is already saved
-            if _url in crawled_urls:
-                # go to the next url
-                continue
-
-            # otherwise save this url and add this to saved list
-            _save_webpage(_url)
-            crawled_urls.append(_url)
-
-            # send this url again for url searching
-            _crawl(_url)
+    # crawler to extract all the valid page links on the given page
+    now('Trying to start crawler on url %s' % url, to_console=True)
+
+    # make a request to the page
+    req = get(url)
+
+    # something went wrong; exit
+    if not req.ok:
+        now('Crawler encountered an Error while requesting web page on url %s' %
+            url, level=4, to_console=True)
+        now('Crawler Exiting!', level=4, to_console=True)
+        sys.exit(1)
+
+    # page found and working
+    # make a soup of it
+    soup = bs4.BeautifulSoup(req.content, cfg.config['parser'])
+
+    # select all the links on page
+    a_tags = soup.find_all('a', href=True)
+
+    # store absolute url of them
+    global crawlable_urls
+    crawlable_urls += set([urlparse.urljoin(url, i.get('href', ''))
+                           for i in a_tags if urlparse.urljoin(url, i.get('href', '')).startswith(url)])
+
+    # every url found will be checked and sent to be saved through the
+    # save_webpage method
+    for url in crawlable_urls:
+
+        # if url is already saved
+        if url in crawled_urls:
+            # go to the next url
+            continue
+
+        # otherwise save this url and add this to saved list
+        _save_webpage(url)
+        crawled_urls.append(url)
+
+        # send this url again for url searching
+        _crawl(url, level=(level + 1))

    now("Crawled URL list : ")
    now('\n'.join(crawlable_urls))
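With the new level and max_level parameters, each recursive call increases level by one and the crawl stops descending once level reaches max_level, so the default of 2 covers the links found on the start page and the links found on those pages. A rough sketch of the same depth-limited pattern in isolation (names here are illustrative, not this module's API):

    def crawl(url, seen, extract_links, level=0, max_level=2):
        # stop descending once the maximum depth is reached
        if level == max_level:
            return
        for link in extract_links(url):
            if link in seen:
                continue
            seen.append(link)
            # each recursion step moves one level deeper
            crawl(link, seen, extract_links, level=level + 1, max_level=max_level)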