Skip to content

Commit e993b1e

Browse files
committed
v2.1.0
1 parent f42a809 commit e993b1e

File tree

11 files changed

+146
-131
lines changed

11 files changed

+146
-131
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,11 @@
11
*.pyc
22
.vscode
3+
.idea
34
*.txt
45
*.cmd
56
*.bat
67

8+
dist/*
79

810
# Python egg metadata, regenerated from source files by setuptools.
911
/*.egg-info

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# PyWebCopy © 2.0.0
1+
# PyWebCopy © 2
22

33
`Created By : Raja Tomar`
44
`License : MIT`

app.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
2+
if __name__ == '__main__':
3+
import pywebcopy
4+
5+
pywebcopy.core.save_webpage('http://localhost:5000', 'e://tests/', copy_all=True, bypass_robots=True,)

dist/pywebcopy-2.0.2.tar.gz

-20.1 KB
Binary file not shown.

pywebcopy/__init__.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,3 @@
3131
'core', 'structures', 'config', 'utils', 'generators', 'exceptions'
3232
]
3333

34-
35-
if __name__ == "__main__":
36-
core.save_webpage('http://google.com', 'e://tests/', bypass_robots=True, over_write=True)

pywebcopy/config.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,9 @@
1212
import collections
1313
import os
1414
import re
15-
1615
from pywebcopy import exceptions
1716

18-
version = '2.0.2'
17+
version = '2.0.3'
1918

2019

2120
class CaseInsensitiveDict(collections.MutableMapping):
@@ -98,7 +97,7 @@ def __repr__(self):
9897
# to overwrite the existing files if found
9998
'OVER_WRITE': False,
10099
# allowed file extensions
101-
'ALLOWED_FILE_EXT': ['.html', '.php', '.asp', '.htm', '.xhtml', '.css',
100+
'ALLOWED_FILE_EXT': ['.html', '.php', '.asp', '.aspx' '.htm', '.xhtml', '.css',
102101
'.json', '.js', '.xml', '.svg', '.gif', '.ico', '.jpeg', '.pdf',
103102
'.jpg', '.png', '.ttf', '.eot', '.otf', '.woff', '.woff2',],
104103

@@ -176,12 +175,12 @@ def setup_config(url, download_loc, **kwargs):
176175
# check if the provided url works
177176
_dummy_request = core.get(url)
178177

178+
if not _dummy_request or not _dummy_request.ok:
179+
raise exceptions.ConnectError("Provided URL '%s' didn't work!" % url)
180+
179181
# new resolved url
180182
_url = _dummy_request.url
181183

182-
if not _dummy_request.ok:
183-
raise exceptions.ConnectError("Provided URL '%s' didn't work!" % url)
184-
185184
# Assign the resolved or found url so that it does not generate
186185
# error of redirection request
187186
config['URL'] = _url
@@ -203,7 +202,7 @@ def setup_config(url, download_loc, **kwargs):
203202

204203
# initialise the new robots parser so that we don't overrun websites
205204
# with copyright policies
206-
config['ROBOTS'] = create_robots_obj(_url + '/robots.txt')
205+
config['ROBOTS'] = create_robots_obj(utils.join_urls(_url, '/robots.txt'))
207206

208207
# create the work dirs if they do not exist
209208
if not os.path.exists(config['mirrors_dir']):

pywebcopy/core.py

Lines changed: 73 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,8 @@
1515
import shutil
1616
import sys
1717
import zipfile
18-
18+
import threading
19+
import logging
1920
import os
2021
import bs4
2122
import requests
@@ -33,12 +34,31 @@
3334
from pywebcopy.exceptions import AccessError, InvalidUrl, ConnectError
3435
from pywebcopy import config as cfg
3536

37+
SESSION = requests.Session()
38+
LOGGER = logging.getLogger("pyebcopy")
39+
LOGGER.setLevel(logging.DEBUG)
40+
41+
CLOGGER = logging.StreamHandler()
42+
CLOGGER.setLevel(logging.ERROR)
43+
44+
45+
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
46+
formatter.datefmt = "%d-%b-%Y %H:%M:%S"
47+
CLOGGER.setFormatter(formatter)
48+
LOGGER.addHandler(CLOGGER)
49+
3650

3751
def save_webpage(url, mirrors_dir, reset_config=True, **kwargs):
3852
""" Starts crawler, archives and writes logs etc. """
3953

4054
cfg.setup_config(url, mirrors_dir, **kwargs)
4155

56+
# Add a log file writer to Logger
57+
FLOGGER = logging.FileHandler(cfg.config['log_file'])
58+
FLOGGER.setLevel(logging.DEBUG)
59+
FLOGGER.setFormatter(formatter)
60+
LOGGER.addHandler(FLOGGER)
61+
4262
# save the page
4363
_crawl(cfg.config['URL'])
4464

@@ -66,12 +86,6 @@ def wrap_up():
6686
cfg.config['DOWNLOADED_FILES'])
6787
)
6888

69-
# writes the buffered log to external file if buffering was on
70-
if cfg.config['LOG_BUFFERING']:
71-
with open(cfg.config['LOG_FILE'], 'a+') as log_file:
72-
log_file.write(
73-
'\n\n'.join(cfg.config['LOG_BUFFER_ARRAY'])
74-
)
7589

7690
if cfg.config['MAKE_ARCHIVE']:
7791

@@ -139,7 +153,7 @@ def _can_access(user_agent, url):
139153
now(
140154
'forced :: Accessing restricted website part %s' % url,
141155
to_console=True,
142-
level=4
156+
level=2
143157
)
144158
return True
145159

@@ -171,77 +185,73 @@ def get(url):
171185
try:
172186

173187
# make request to page
174-
req = requests.get(url, headers=headers)
188+
req = SESSION.get(url, headers=headers)
175189

176190
# log downloaded file size
177191
cfg.config['DOWNLOAD_SIZE'] += int(
178192
req.headers.get('content-length', 0))
179193

180194
return req
181195

182-
except requests.exceptions.ConnectionError:
196+
except Exception as e:
183197
now(
184198
'error :: Internet Connection not Found!',
185199
level=4,
186200
to_console=True
187201
)
188-
raise ConnectError("error :: Internet Connection not Found!")
202+
return
203+
189204

190-
except requests.exceptions.InvalidSchema:
191-
now(
192-
'error :: Invalid URL',
193-
level=4,
194-
to_console=True
195-
)
196-
raise InvalidUrl("Url is invalid! %s" % url)
197205

198206

199207
# -----------------------------------------------------------
200208
# toolkit func to generate a file with filename and data
201209
# it checks various aspects before creating a file
202210
# but it is as easy as `new_file('pathtofile', 'filecontent')`
203211
# -----------------------------------------------------------
204-
def new_file(download_loc, content_url=None, content=None, mime_type='text/html'):
212+
def new_file(download_loc, content_url=None, content=None):
205213
""" Downloads any file to the disk.
206214
207215
:param download_loc: path where to save the file
208216
209217
:param content: contents or binary data of the file
210-
:param mime_type: content type of the provided content
211218
:OR:
212219
:param content_url: download the file from url
213220
214221
:returns: location of downloaded file on disk
215222
"""
216223

224+
if not download_loc or not (content or content_url):
225+
return download_loc
226+
217227
now('Saving file at %s path' % download_loc)
218228

219229
# if content of a file is to be filled through an content_url
220-
if content_url is not None:
230+
if content_url:
221231
now('Downloading file content from :: %s' % content_url)
222232

223233
try:
224234
# fetch the content_url
225235
req = get(content_url)
226236

227-
# get the file type from request
228-
mime_type = req.headers.get('Content-Type', mime_type)
237+
if not req or not req.ok:
238+
return download_loc
229239

230240
# store the content of the request
231241
content = req.content
232242

233-
except requests.exceptions.ConnectionError:
243+
except Exception as e:
234244
now(
235-
'error :: Failed to load the file from content_url %s'
236-
% content_url,
245+
'error :: Failed to load the file from content_url %s due to error %s'
246+
% (content_url, e.message),
237247
to_console=True,
238248
compressed=False,
239249
level=4
240250
)
241251
return download_loc
242252

243253
# if file of this type is allowed to be saved
244-
if not os.path.splitext(download_loc)[-1] in cfg.config['ALLOWED_FILE_EXT']:
254+
if not os.path.splitext(download_loc)[-1].lower() in cfg.config['ALLOWED_FILE_EXT']:
245255
now(
246256
'error :: file of type %s is not allowed!'
247257
% str(os.path.splitext(download_loc)[-1]),
@@ -265,27 +275,21 @@ def new_file(download_loc, content_url=None, content=None, mime_type='text/html'
265275
now('Existing file at %s removed!' % download_loc)
266276
os.remove(download_loc)
267277

268-
# Write the File
269-
with open(download_loc, 'wb') as f:
270-
271-
_water_mark = _watermark(content_url or download_loc)
278+
try:
279+
# Write the File
280+
with open(download_loc, 'wb') as f:
272281

273-
# if this is a text file write an extra watermark at top
274-
if not content_url or mime_type.split('/')[0] == 'text':
282+
_water_mark = _watermark(content_url or download_loc)
275283
f.write(_water_mark)
284+
f.write(content)
285+
f.write(_water_mark)
286+
except Exception as e:
287+
now("Exception occured during writing file %s exception %s" %(download_loc, e.message), level=4)
288+
return download_loc
276289

277-
f.write(content)
278-
f.write(_water_mark)
279-
280-
# last check if file was successfully written to the disk
281-
assert os.path.isfile(download_loc)
290+
cfg.config['downloaded_files'].append(download_loc)
282291

283-
cfg.config['DOWNLOADED_FILES'].append(download_loc)
284-
285-
now(
286-
'success :: File %s written Successfully!' % download_loc,
287-
to_console=True
288-
)
292+
now('success :: File %s written Successfully!' % download_loc, to_console=True)
289293

290294
# return the file path of the saved file
291295
return download_loc
@@ -300,17 +304,23 @@ def _watermark(file_path):
300304

301305
file_type = os.path.splitext(file_path)[-1]
302306

303-
if file_type == '.html':
307+
if file_type in ['.html', '.htm', '.xhtml', '.aspx', '.asp', '.php']:
304308
comment_style = '<!--!#-->'
305-
else:
309+
elif file_type in ['.css', '.js', '.xml']:
306310
comment_style = '/*!#*/'
307-
308-
mark = "\n* AerWebCopy [version {}]\n* Copyright Aeroson Systems & Co.\n* " \
309-
"File mirrored from {} \n* at {}\n".format(
310-
cfg.config['version'],
311-
os.path.basename(file_path),
312-
datetime.datetime.utcnow()
313-
)
311+
else:
312+
return b''
313+
314+
mark = """
315+
* AerWebCopy [version {}]
316+
* Copyright Aeroson Systems & Co.
317+
* File mirrored from {}
318+
* at {}
319+
""".format(
320+
cfg.config['version'],
321+
os.path.basename(file_path),
322+
datetime.datetime.utcnow()
323+
)
314324

315325
if py3:
316326
return bytes(comment_style.replace('#', mark), 'utf-8')
@@ -325,6 +335,7 @@ def _watermark(file_path):
325335
# of log then set cfg.config['DEBUG']=True and see what's going on inside
326336
# with ease
327337
# -----------------------------------------------------------------------
338+
328339
def now(string, level=0, unbuffered=False, to_console=False, compressed=cfg.config['LOG_FILE_COMPRESSION']):
329340
""" Writes any input string to external logfile
330341
@@ -341,20 +352,6 @@ def now(string, level=0, unbuffered=False, to_console=False, compressed=cfg.conf
341352
:param compressed: reduces the string length to 80 characters
342353
"""
343354

344-
if cfg.config['quiet']:
345-
return
346-
347-
_event_level_strings = ["info", "error", "critical", "success"]
348-
349-
if level == 4:
350-
_event_level = _event_level_strings[2]
351-
elif level == 1 or level == 2:
352-
_event_level = _event_level_strings[3]
353-
elif level == 3:
354-
_event_level = _event_level_strings[1]
355-
else:
356-
_event_level = _event_level_strings[0]
357-
358355
# shorten the string
359356
if compressed:
360357
if len(string) > 80:
@@ -367,30 +364,19 @@ def now(string, level=0, unbuffered=False, to_console=False, compressed=cfg.conf
367364
_caller = '<function {}>'.format(_caller)
368365

369366
# standardisation of the input string
370-
if compressed:
371-
_formatted_string = "[{}] [{}] {}".format(
372-
_caller, _event_level, string)
373-
else:
374-
_formatted_string = "[{}] [{}] [{}] {}".format(
375-
datetime.datetime.utcnow(), _caller, _event_level, string)
367+
_formatted_string = " [{}] {}".format(
368+
_caller, string)
369+
376370

377371
# if _debug switch is true then this will write now() instances to console
378372
# if string is requested to be printed to console also
379-
if cfg.config['DEBUG'] or to_console:
373+
if cfg.config['DEBUG'] or to_console and not cfg.config['quiet']:
380374
print(_formatted_string)
381375

382376
# if the location of log file is undefined; return
383-
if cfg.config['LOG_FILE'] is None:
384-
return
385-
386-
# append the string to log array
387-
if cfg.config['LOG_BUFFERING'] and not unbuffered:
388-
cfg.config['LOG_BUFFER_ARRAY'].append(_formatted_string)
377+
if not cfg.config['LOG_FILE']:
389378
return
390-
391-
with open(cfg.config['LOG_FILE'], 'a') as log_file:
392-
log_file.write(_formatted_string)
393-
log_file.write('\n\n')
379+
LOGGER.log(level * 10,_formatted_string)
394380

395381

396382
# main func that runs and downloads html source code of page
@@ -405,7 +391,7 @@ def _save_webpage(url):
405391
req = get(url)
406392

407393
# check if request was successful
408-
if not req.ok:
394+
if not req or not req.ok:
409395
now('Server Responded with an error!', level=4, to_console=True)
410396
now('Error code: %s' % str(req.status_code), to_console=True)
411397
return req.status_code
@@ -418,10 +404,10 @@ def _save_webpage(url):
418404
# create a path where to download this page
419405
download_path = gens.generate_path_for(url, filename_check=True, default_filename='index.html')
420406

421-
# store the file name
407+
# store the file name generated for the url
422408
file_comp = os.path.split(download_path)[-1]
423409

424-
# we have make sure the url have an filename e.g. 'http://site.com' not
410+
# the url may not have a filename, e.g. 'http://site.com' does not
425411
# have a file name, so we have to add the file name to it
426412
url = urlparse.urljoin(url, file_comp).strip('/')
427413

0 commit comments

Comments
 (0)