Skip to content

Commit 5ab99f6

Browse files
committed
v5.0.0
1 parent 5786f13 commit 5ab99f6

39 files changed

+3933
-0
lines changed

.gitignore

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,13 @@
11
/.idea/*
22
/.vscode/*
3+
<<<<<<< HEAD
34
/.pytest_cache/*
45
/dist/*
56
*.pyc
67
/pywebcopy.egg-info/*
78
/requests-html-master/
9+
=======
10+
/dist/*
11+
*.pyc
12+
/pywebcopy.egg-info/*
13+
>>>>>>> v5.0.0

.vs/VSWorkspaceState.json

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
{
2+
"ExpandedNodes": [
3+
"",
4+
"\\pywebcopy"
5+
],
6+
"SelectedNode": "\\pywebcopy\\configs.py",
7+
"PreviewInSolutionExplorer": false
8+
}

.vs/slnx.sqlite

3.43 MB
Binary file not shown.

.vs/v5.x/v15/.suo

21 KB
Binary file not shown.

README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,8 @@
1+
<<<<<<< HEAD
12
# PyWebCopy &copy; 4
3+
=======
4+
# PyWebCopy &copy; 5
5+
>>>>>>> v5.0.0
26
37
`Created By : Raja Tomar`
48
`License : MIT`

app.py

Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
#!/usr/bin/env python3
2+
# -*- coding: utf-8 -*-
3+
4+
"""
5+
Tutorials for sample use-cases with pywebcopy.
6+
7+
This module demos some general use cases when
8+
working with pywebcopy.
9+
10+
You can uncomment the functions which you like and modify
11+
its arguments to instantly get the results.
12+
"""
13+
14+
15+
import os
16+
import pywebcopy
17+
import time
18+
t = time.time()
19+
20+
pywebcopy.DEBUG = True
21+
# page_url = 'https://google.com/'
22+
page_url = 'https://www.w3schools.com/'
23+
# page_url = 'http://providenceri.iqm2.com/Citizens/Default.aspx'
24+
download_folder = os.path.join(os.getcwd(), 'saved')
25+
26+
'''
27+
If you are getting a `pywebcopy.exceptions.AccessError` exception,
28+
then check if website allows scraping of its content.
29+
30+
or
31+
32+
Uncomment the line below.
33+
'''
34+
# pywebcopy.config['bypass_robots'] = True
35+
36+
37+
"""
38+
If you want to overwrite existing files in the directory then
39+
use the over_write config key.
40+
41+
or
42+
43+
Uncomment the line below.
44+
"""
45+
# pywebcopy.config['over_write'] = True
46+
47+
48+
"""
49+
If you want to change the project name.
50+
use the project_name config key.
51+
52+
or
53+
54+
Uncomment the line below.
55+
"""
56+
# pywebcopy.config['project_name'] = 'my_project'
57+
58+
59+
60+
"""
61+
Save Single Webpage
62+
63+
Particular webpage can be saved easily using the following
64+
methods.
65+
66+
For `pywebcopy.exceptions.AccessError` use the code provided on top sections.
67+
68+
choose and uncomment the method which you like to use.
69+
"""
70+
71+
# method_1()
72+
# pywebcopy.save_webpage(project_url='http://google.com', project_folder='c://Saved_Webpages/',)
73+
74+
# :Deprecated in version > 2.x : method 2:
75+
# pywebcopy.config.config['bypass_robots'] = True
76+
# wp = pywebcopy.generators.AssetsGenerator('https://www.bing.com/', 'e://tests/')
77+
# wp.generate_style_map()
78+
# wp.save_to_disk()
79+
80+
# method 3:
81+
# pywebcopy.WebPage(page_url, download_folder).save_complete()
82+
83+
# Advanced Features in Test Phase
84+
85+
# :New in version 4: method 4:
86+
87+
# raw html is now also accepted
88+
# HTML = open('c:/users/raja/desktop/test.html').read()
89+
90+
# pywebcopy.WebPage(url='https://google.com/', project_folder='e://tests/pwc4/', HTML=HTML, over_write=True).save_complete()
91+
92+
93+
'''
94+
Whole Websites
95+
96+
Use caution when copying websites as this can overload or damage the
97+
servers of the site and rarely could be illegal, so check everything before
98+
you proceed.
99+
100+
101+
choose method and uncomment the method which you like.
102+
'''
103+
104+
# method 1:
105+
'''
106+
pywebcopy.config.setup_config(project_url='http://localhost:5000/', project_folder='e://tests/', project_name='LocalHost')
107+
crawler = pywebcopy.Crawler('http://localhost:5000/')
108+
crawler.crawl()
109+
'''
110+
111+
# method 2:
112+
113+
pywebcopy.save_webpage(page_url, download_folder)
114+
115+
116+
print("Execution time : ", time.time() - t)
117+

build/lib/pywebcopy/__init__.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
77
Python library to copy webpages.
88
"""
9+
<<<<<<< HEAD
910
'''
1011
__all__ = [
1112
'VERSION', 'LOGGER', 'SESSION',
@@ -19,10 +20,25 @@
1920
'save_webpage', 'save_website',
2021
]
2122
'''
23+
=======
24+
25+
__all__ = [
26+
'VERSION', 'LOGGER', 'SESSION',
27+
'utils',
28+
'config', 'URLTransformer', 'filename_present', 'url2path',
29+
'FileMixin', 'LinkTag', 'ScriptTag', 'ImgTag',
30+
'parse', 'parse_content',
31+
'get', 'new_file',
32+
'WebPage', 'Crawler',
33+
'save_webpage', 'save_website',
34+
]
35+
36+
>>>>>>> v5.0.0
2237
__author__ = 'Raja Tomar'
2338
__email__ = 'rajatomar788@gmail.com'
2439
__license__ = 'MIT License'
2540

41+
<<<<<<< HEAD
2642
import requests
2743
import logging
2844

@@ -73,4 +89,29 @@ def _n_fileLogger(file_path, mode):
7389
from pywebcopy.generators import AssetsGenerator
7490
from pywebcopy.core import get, new_file
7591
from pywebcopy.workers import Crawler, save_website, save_webpage
92+
=======
93+
import sys
94+
95+
this = sys.modules[__name__]
96+
this.DEBUG = False
97+
DEBUG = this.DEBUG
98+
VERSION = 'v5.0.0'
99+
100+
from .logger import LOGGER # Global Logger instance
101+
102+
103+
import requests
104+
SESSION = requests.Session()
105+
SESSION.__doc__ = """Global `requests` session object to store cookies in subsequent http requests."""
106+
107+
108+
from . import utils
109+
from .configs import config
110+
from .urls import URLTransformer, filename_present, url2path
111+
from .elements import FileMixin, LinkTag, ScriptTag, ImgTag
112+
from .parsers import parse, parse_content, BaseParser, Element
113+
from .webpage import WebPage, ElementsHandler, save_webpage
114+
from .core import get, new_file
115+
from .crawler import Crawler, save_website
116+
>>>>>>> v5.0.0
76117

build/lib/pywebcopy/__main__.py

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
# encoding: utf8
2+
3+
from __future__ import print_function
4+
5+
import os
6+
import sys
7+
import textwrap
8+
9+
from . import save_website, save_webpage
10+
11+
12+
USAGE = textwrap.dedent("""\
13+
Usage:
14+
pywebcopy [option] value [option2] value2 ...
15+
Options:
16+
-t # runs all available tests
17+
-p http://example.com/ -d /downloads/ # Save this webPage at /downloads/ folder
18+
-c http://example.com/ -d /downloads/ # Save this webSite at /downloads/ folder
19+
""")
20+
21+
args = sys.argv[1:]
22+
23+
if not args or args[0] not in ('-p', '-c', '-t'):
24+
print(USAGE)
25+
sys.exit(1)
26+
27+
if args[0] == '-t':
28+
os.system('{} -m unittest pywebcopy.tests'.format(sys.executable))
29+
30+
if args[0] == '-p':
31+
if len(args) < 2:
32+
print(USAGE)
33+
sys.exit(1)
34+
35+
if len(args) == 2:
36+
print("Saving {!r} in {!r}".format(args[1], os.getcwd()))
37+
save_webpage(args[1], os.getcwd())
38+
39+
elif len(args) == 4 and args[2] == '-d':
40+
print("Saving {!r} in {!r}".format(args[1], args[3]))
41+
save_webpage(args[1], args[3])
42+
43+
else:
44+
print(USAGE)
45+
sys.exit(1)
46+
47+
elif args[0] == '-c':
48+
if len(args) < 2:
49+
print(USAGE)
50+
sys.exit(1)
51+
52+
if len(args) == 2:
53+
print("Saving {!r} in {!r}".format(args[1], os.getcwd()))
54+
save_website(args[1], os.getcwd())
55+
56+
elif len(args) == 4 and args[2] == '-d':
57+
print("Saving {!r} in {!r}".format(args[1], args[3]))
58+
save_website(args[1], args[3])
59+
60+
else:
61+
print(USAGE)
62+
sys.exit(1)

0 commit comments

Comments
 (0)