15
15
import shutil
16
16
import sys
17
17
import zipfile
18
-
18
+ import threading
19
+ import logging
19
20
import os
20
21
import bs4
21
22
import requests
33
34
from pywebcopy .exceptions import AccessError , InvalidUrl , ConnectError
34
35
from pywebcopy import config as cfg
35
36
37
+ SESSION = requests .Session ()
38
+ LOGGER = logging .getLogger ("pyebcopy" )
39
+ LOGGER .setLevel (logging .DEBUG )
40
+
41
+ CLOGGER = logging .StreamHandler ()
42
+ CLOGGER .setLevel (logging .ERROR )
43
+
44
+
45
+ formatter = logging .Formatter ('%(asctime)s - %(levelname)s - %(message)s' )
46
+ formatter .datefmt = "%d-%b-%Y %H:%M:%S"
47
+ CLOGGER .setFormatter (formatter )
48
+ LOGGER .addHandler (CLOGGER )
49
+
36
50
37
51
def save_webpage (url , mirrors_dir , reset_config = True , ** kwargs ):
38
52
""" Starts crawler, archives and writes logs etc. """
39
53
40
54
cfg .setup_config (url , mirrors_dir , ** kwargs )
41
55
56
+ # Add a log file writer to Logger
57
+ FLOGGER = logging .FileHandler (cfg .config ['log_file' ])
58
+ FLOGGER .setLevel (logging .DEBUG )
59
+ FLOGGER .setFormatter (formatter )
60
+ LOGGER .addHandler (FLOGGER )
61
+
42
62
# save the page
43
63
_crawl (cfg .config ['URL' ])
44
64
@@ -66,12 +86,6 @@ def wrap_up():
66
86
cfg .config ['DOWNLOADED_FILES' ])
67
87
)
68
88
69
- # writes the buffered log to external file if buffering was on
70
- if cfg .config ['LOG_BUFFERING' ]:
71
- with open (cfg .config ['LOG_FILE' ], 'a+' ) as log_file :
72
- log_file .write (
73
- '\n \n ' .join (cfg .config ['LOG_BUFFER_ARRAY' ])
74
- )
75
89
76
90
if cfg .config ['MAKE_ARCHIVE' ]:
77
91
@@ -139,7 +153,7 @@ def _can_access(user_agent, url):
139
153
now (
140
154
'forced :: Accessing restricted website part %s' % url ,
141
155
to_console = True ,
142
- level = 4
156
+ level = 2
143
157
)
144
158
return True
145
159
@@ -171,77 +185,73 @@ def get(url):
171
185
try :
172
186
173
187
# make request to page
174
- req = requests .get (url , headers = headers )
188
+ req = SESSION .get (url , headers = headers )
175
189
176
190
# log downloaded file size
177
191
cfg .config ['DOWNLOAD_SIZE' ] += int (
178
192
req .headers .get ('content-length' , 0 ))
179
193
180
194
return req
181
195
182
- except requests . exceptions . ConnectionError :
196
+ except Exception as e :
183
197
now (
184
198
'error :: Internet Connection not Found!' ,
185
199
level = 4 ,
186
200
to_console = True
187
201
)
188
- raise ConnectError ("error :: Internet Connection not Found!" )
202
+ return
203
+
189
204
190
- except requests .exceptions .InvalidSchema :
191
- now (
192
- 'error :: Invalid URL' ,
193
- level = 4 ,
194
- to_console = True
195
- )
196
- raise InvalidUrl ("Url is invalid! %s" % url )
197
205
198
206
199
207
# -----------------------------------------------------------
200
208
# toolkit func to generate a file with filename and data
201
209
# it checks various aspects before creating a file
202
210
# but it is as easy as `new_file('pathtofile', 'filecontent')`
203
211
# -----------------------------------------------------------
204
- def new_file (download_loc , content_url = None , content = None , mime_type = 'text/html' ):
212
+ def new_file (download_loc , content_url = None , content = None ):
205
213
""" Downloads any file to the disk.
206
214
207
215
:param download_loc: path where to save the file
208
216
209
217
:param content: contents or binary data of the file
210
- :param mime_type: content type of the provided content
211
218
:OR:
212
219
:param content_url: download the file from url
213
220
214
221
:returns: location of downloaded file on disk
215
222
"""
216
223
224
+ if not download_loc or not (content or content_url ):
225
+ return download_loc
226
+
217
227
now ('Saving file at %s path' % download_loc )
218
228
219
229
# if the content of the file is to be fetched from a content_url
220
- if content_url is not None :
230
+ if content_url :
221
231
now ('Downloading file content from :: %s' % content_url )
222
232
223
233
try :
224
234
# fetch the content_url
225
235
req = get (content_url )
226
236
227
- # get the file type from request
228
- mime_type = req . headers . get ( 'Content-Type' , mime_type )
237
+ if not req or not req . ok :
238
+ return download_loc
229
239
230
240
# store the content of the request
231
241
content = req .content
232
242
233
- except requests . exceptions . ConnectionError :
243
+ except Exception as e :
234
244
now (
235
- 'error :: Failed to load the file from content_url %s'
236
- % content_url ,
245
+ 'error :: Failed to load the file from content_url %s due to error %s '
246
+ % ( content_url , e . message ) ,
237
247
to_console = True ,
238
248
compressed = False ,
239
249
level = 4
240
250
)
241
251
return download_loc
242
252
243
253
# if file of this type is allowed to be saved
244
- if not os .path .splitext (download_loc )[- 1 ] in cfg .config ['ALLOWED_FILE_EXT' ]:
254
+ if not os .path .splitext (download_loc )[- 1 ]. lower () in cfg .config ['ALLOWED_FILE_EXT' ]:
245
255
now (
246
256
'error :: file of type %s is not allowed!'
247
257
% str (os .path .splitext (download_loc )[- 1 ]),
@@ -265,27 +275,21 @@ def new_file(download_loc, content_url=None, content=None, mime_type='text/html'
265
275
now ('Existing file at %s removed!' % download_loc )
266
276
os .remove (download_loc )
267
277
268
- # Write the File
269
- with open (download_loc , 'wb' ) as f :
270
-
271
- _water_mark = _watermark (content_url or download_loc )
278
+ try :
279
+ # Write the File
280
+ with open (download_loc , 'wb' ) as f :
272
281
273
- # if this is a text file write an extra watermark at top
274
- if not content_url or mime_type .split ('/' )[0 ] == 'text' :
282
+ _water_mark = _watermark (content_url or download_loc )
275
283
f .write (_water_mark )
284
+ f .write (content )
285
+ f .write (_water_mark )
286
+ except Exception as e :
287
+ now ("Exception occured during writing file %s exception %s" % (download_loc , e .message ), level = 4 )
288
+ return download_loc
276
289
277
- f .write (content )
278
- f .write (_water_mark )
279
-
280
- # last check if file was successfully written to the disk
281
- assert os .path .isfile (download_loc )
290
+ cfg .config ['downloaded_files' ].append (download_loc )
282
291
283
- cfg .config ['DOWNLOADED_FILES' ].append (download_loc )
284
-
285
- now (
286
- 'success :: File %s written Successfully!' % download_loc ,
287
- to_console = True
288
- )
292
+ now ('success :: File %s written Successfully!' % download_loc , to_console = True )
289
293
290
294
# return the file path of the saved file
291
295
return download_loc
@@ -300,17 +304,23 @@ def _watermark(file_path):
300
304
301
305
file_type = os .path .splitext (file_path )[- 1 ]
302
306
303
- if file_type == '.html' :
307
+ if file_type in [ '.html' , '.htm' , '.xhtml' , '.aspx' , '.asp' , '.php' ] :
304
308
comment_style = '<!--!#-->'
305
- else :
309
+ elif file_type in [ '.css' , '.js' , '.xml' ] :
306
310
comment_style = '/*!#*/'
307
-
308
- mark = "\n * AerWebCopy [version {}]\n * Copyright Aeroson Systems & Co.\n * " \
309
- "File mirrored from {} \n * at {}\n " .format (
310
- cfg .config ['version' ],
311
- os .path .basename (file_path ),
312
- datetime .datetime .utcnow ()
313
- )
311
+ else :
312
+ return b''
313
+
314
+ mark = """
315
+ * AerWebCopy [version {}]
316
+ * Copyright Aeroson Systems & Co.
317
+ * File mirrored from {}
318
+ * at {}
319
+ """ .format (
320
+ cfg .config ['version' ],
321
+ os .path .basename (file_path ),
322
+ datetime .datetime .utcnow ()
323
+ )
314
324
315
325
if py3 :
316
326
return bytes (comment_style .replace ('#' , mark ), 'utf-8' )
@@ -325,6 +335,7 @@ def _watermark(file_path):
325
335
# of log then set cfg.config['DEBUG']=True and see what's going on inside
326
336
# with ease
327
337
# -----------------------------------------------------------------------
338
+
328
339
def now (string , level = 0 , unbuffered = False , to_console = False , compressed = cfg .config ['LOG_FILE_COMPRESSION' ]):
329
340
""" Writes any input string to external logfile
330
341
@@ -341,20 +352,6 @@ def now(string, level=0, unbuffered=False, to_console=False, compressed=cfg.conf
341
352
:param compressed: reduces the string length to 80 characters
342
353
"""
343
354
344
- if cfg .config ['quiet' ]:
345
- return
346
-
347
- _event_level_strings = ["info" , "error" , "critical" , "success" ]
348
-
349
- if level == 4 :
350
- _event_level = _event_level_strings [2 ]
351
- elif level == 1 or level == 2 :
352
- _event_level = _event_level_strings [3 ]
353
- elif level == 3 :
354
- _event_level = _event_level_strings [1 ]
355
- else :
356
- _event_level = _event_level_strings [0 ]
357
-
358
355
# shorten the string
359
356
if compressed :
360
357
if len (string ) > 80 :
@@ -367,30 +364,19 @@ def now(string, level=0, unbuffered=False, to_console=False, compressed=cfg.conf
367
364
_caller = '<function {}>' .format (_caller )
368
365
369
366
# standardisation of the input string
370
- if compressed :
371
- _formatted_string = "[{}] [{}] {}" .format (
372
- _caller , _event_level , string )
373
- else :
374
- _formatted_string = "[{}] [{}] [{}] {}" .format (
375
- datetime .datetime .utcnow (), _caller , _event_level , string )
367
+ _formatted_string = " [{}] {}" .format (
368
+ _caller , string )
369
+
376
370
377
371
# if _debug switch is true then this will write now() instances to console
378
372
# if string is requested to be printed to console also
379
- if cfg .config ['DEBUG' ] or to_console :
373
+ if cfg .config ['DEBUG' ] or to_console and not cfg . config [ 'quiet' ] :
380
374
print (_formatted_string )
381
375
382
376
# if the location of log file is undefined; return
383
- if cfg .config ['LOG_FILE' ] is None :
384
- return
385
-
386
- # append the string to log array
387
- if cfg .config ['LOG_BUFFERING' ] and not unbuffered :
388
- cfg .config ['LOG_BUFFER_ARRAY' ].append (_formatted_string )
377
+ if not cfg .config ['LOG_FILE' ]:
389
378
return
390
-
391
- with open (cfg .config ['LOG_FILE' ], 'a' ) as log_file :
392
- log_file .write (_formatted_string )
393
- log_file .write ('\n \n ' )
379
+ LOGGER .log (level * 10 ,_formatted_string )
394
380
395
381
396
382
# main func that runs and downloads html source code of page
@@ -405,7 +391,7 @@ def _save_webpage(url):
405
391
req = get (url )
406
392
407
393
# check if request was successful
408
- if not req .ok :
394
+ if not req or not req .ok :
409
395
now ('Server Responded with an error!' , level = 4 , to_console = True )
410
396
now ('Error code: %s' % str (req .status_code ), to_console = True )
411
397
return req .status_code
@@ -418,10 +404,10 @@ def _save_webpage(url):
418
404
# create a path where to download this page
419
405
download_path = gens .generate_path_for (url , filename_check = True , default_filename = 'index.html' )
420
406
421
- # store the file name
407
+ # store the file name generated for the url
422
408
file_comp = os .path .split (download_path )[- 1 ]
423
409
424
- # we have make sure the url have an filename e.g. 'http://site.com' not
410
+ # the url may not have a filename, e.g. 'http://site.com' does not
425
411
# have a file name and we have to add file name to it
426
412
url = urlparse .urljoin (url , file_comp ).strip ('/' )
427
413
0 commit comments