
Commit 1b9d0d4

Added DPULSE main files
1 parent dc228ff commit 1b9d0d4


5 files changed, +411 -0 lines changed


crawl_processor.py

Lines changed: 234 additions & 0 deletions
@@ -0,0 +1,234 @@
"""
crawl_processor module

Contains the functions that search for open-source information connected with the specified domain.
Search results are returned to the report_creation module in order to create the .pdf report.

Arguments:
    short_domain: website address which you enter in the console
    url: http://short_domain/
"""

import socket
import whois
import re
import requests
import urllib.parse
from urllib.parse import urlparse
from collections import defaultdict
from bs4 import BeautifulSoup
from time import sleep
from requests import get
from fake_useragent import UserAgent

def ip_gather(short_domain):
    """
    Function for getting the IP address of the website
    """
    print('Processing IP gathering from {}'.format(short_domain))
    ip_address = socket.gethostbyname(short_domain)
    return ip_address

def whois_gather(short_domain):
    """
    Function for getting WHOIS information of the website
    """
    w = whois.whois(short_domain)
    return w

def mail_gather(url):
    """
    Function for getting emails from website elements
    """
    r = requests.get(url)
    data = r.text
    soup = BeautifulSoup(data, "html.parser")
    mails = []
    for i in soup.find_all(href=re.compile("mailto")):
        i.encode().decode()
        mails.append(i.string)
    return mails

def subdomains_gather(url, short_domain):
    """
    Function for subdomain search
    """
    print('Processing subdomain gathering from {}'.format(url))
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    linked_domains = set()

    for link in soup.find_all('a', href=True):
        domain = urlparse(link['href']).netloc
        if domain and domain != urlparse(url).netloc:
            linked_domains.add(domain)

    finder = short_domain
    subdomains = [urllib.parse.unquote(i) for i in linked_domains if finder in i]
    return subdomains

def sm_gather(url):
    """
    Function for getting links to some basic social networks from website elements
    """
    print('Processing social media gathering from {}'.format(url))
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = [a['href'] for a in soup.find_all('a', href=True)]
    categorized_links = {'Facebook': [], 'Twitter': [], 'Instagram': [],
                         'Telegram': [], 'TikTok': [], 'LinkedIn': [],
                         'VKontakte': [], 'YouTube': []}

    for link in links:
        if 'facebook.com' in link:
            categorized_links['Facebook'].append(urllib.parse.unquote(link))
        elif 'twitter.com' in link:
            categorized_links['Twitter'].append(urllib.parse.unquote(link))
        elif 'instagram.com' in link:
            categorized_links['Instagram'].append(urllib.parse.unquote(link))
        elif 't.me' in link:
            categorized_links['Telegram'].append(urllib.parse.unquote(link))
        elif 'tiktok.com' in link:
            categorized_links['TikTok'].append(urllib.parse.unquote(link))
        elif 'linkedin.com' in link:
            categorized_links['LinkedIn'].append(urllib.parse.unquote(link))
        elif 'vk.com' in link:
            categorized_links['VKontakte'].append(urllib.parse.unquote(link))
        elif 'youtube.com' in link:
            categorized_links['YouTube'].append(urllib.parse.unquote(link))
    return categorized_links

def domains_reverse_research(subdomains):
    """
    Subdomain reverse search function which extracts social networks, emails and IP addresses
    """
    subdomain_urls = []
    subdomain_mails = []
    subdomain_socials = []
    subdomain_ip = []

    try:
        for subdomain in subdomains:
            subdomain_url = "http://" + subdomain + "/"
            subdomain_urls.append(subdomain_url)
    except (socket.gaierror, requests.exceptions.SSLError, requests.exceptions.ConnectionError):
        print('URL unreachable')
        pass

    try:
        for subdomain in subdomains:
            subdomains_ip = ip_gather(subdomain)
            subdomain_ip.append(subdomains_ip)
            subdomain_ip = list(set(subdomain_ip))
    except (socket.gaierror, requests.exceptions.SSLError, requests.exceptions.ConnectionError):
        print('URL unreachable')
        pass

    try:
        for subdomain_url in subdomain_urls:
            subdomain_mail = mail_gather(subdomain_url)
            subdomain_mails.append(subdomain_mail)
            subdomain_social = sm_gather(subdomain_url)
            subdomain_socials.append(subdomain_social)
    except (socket.gaierror, requests.exceptions.SSLError, requests.exceptions.ConnectionError):
        print('URL unreachable')
        pass

    subdomain_ip = ''.join(subdomain_ip)
    subdomain_mails = [sublist for sublist in subdomain_mails if sublist]
    subdomain_mails = [sublist for sublist in subdomain_mails if sublist != [None]]
    subdomain_mails = ', '.join([', '.join(map(str, sublist)) for sublist in subdomain_mails])
    subdomain_socials = [{k: v for k, v in d.items() if v} for d in subdomain_socials]
    subdomain_socials = [d for d in subdomain_socials if d]
    subdomain_socials_grouped = defaultdict(list)

    for d in subdomain_socials:
        for key, value in d.items():
            subdomain_socials_grouped[key].extend(value)

    subdomain_socials_grouped = list(dict(subdomain_socials_grouped).values())

    sd_socials = {'Facebook': [], 'Twitter': [], 'Instagram': [], 'Telegram': [], 'TikTok': [], 'LinkedIn': [],
                  'VKontakte': [], 'YouTube': []}

    for inner_list in subdomain_socials_grouped:
        for link in inner_list:
            if 'facebook.com' in link:
                sd_socials['Facebook'].append(urllib.parse.unquote(link))
            elif 'twitter.com' in link:
                sd_socials['Twitter'].append(urllib.parse.unquote(link))
            elif 'instagram.com' in link:
                sd_socials['Instagram'].append(urllib.parse.unquote(link))
            elif 't.me' in link:
                sd_socials['Telegram'].append(urllib.parse.unquote(link))
            elif 'tiktok.com' in link:
                sd_socials['TikTok'].append(urllib.parse.unquote(link))
            elif 'linkedin.com' in link:
                sd_socials['LinkedIn'].append(urllib.parse.unquote(link))
            elif 'vk.com' in link:
                sd_socials['VKontakte'].append(urllib.parse.unquote(link))
            elif 'youtube.com' in link:
                sd_socials['YouTube'].append(urllib.parse.unquote(link))

    return subdomain_mails, sd_socials, subdomain_ip


def preset(search_query, results, lang, start, timeout):
    """
    Preset function for Google Dorking
    """
    ua = UserAgent()
    resp = get(
        url="https://www.google.com/search",
        headers={
            "User-Agent": ua.random
        },
        params={
            "q": search_query,
            "num": results + 2,
            "hl": lang,
            "start": start,
        },
        timeout=timeout,
    )

    resp.raise_for_status()
    return resp

def dorking_processing(short_domain, num_results, lang="en", sleep_interval=0, timeout=5):
    """
    Google Dorking automation function
    """
    print('Processing Google Dorking')
    search_queries = ['"{}" filetype:pdf OR filetype:xlsx OR filetype:docx OR filetype:PPT'.format(short_domain),
                      '{} site:linkedin.com/in/'.format(short_domain),
                      'related: {}'.format(short_domain),
                      'info: {}'.format(short_domain)]
    all_results = []
    for search_query in search_queries:
        start = 0
        results = []
        while start < num_results:
            resp = preset(search_query, num_results - start,
                          lang, start, timeout)

            soup = BeautifulSoup(resp.text, "html.parser")
            result_block = soup.find_all("div", attrs={"class": "g"})
            if len(result_block) == 0:
                start += 1
            for result in result_block:
                link = result.find("a", href=True)
                title = result.find("h3")
                description_box = result.find(
                    "div", {"style": "-webkit-line-clamp:2"})
                if description_box:
                    description = description_box.text
                    if link and title and description:
                        start += 1
                        results.append(urllib.parse.unquote(link["href"]))

            sleep(sleep_interval)
        all_results.append(results)

    return (''.join(f'</p>{item}</p>' for item in all_results[0]), ''.join(f'{item}</p>' for item in all_results[1]), ''.join(f'{item}</p>' for item in all_results[2]),
            ''.join(f'{item}</p>' for item in all_results[3]))
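
Taken together, these functions are designed to be driven by report_creation with a short domain and its http:// URL. As a rough orientation, a minimal sketch of exercising the module directly could look like the following (the example.com target and the print calls are illustrative assumptions, not part of this commit):

import crawl_processor as cp

short_domain = 'example.com'                      # illustrative target, not from the commit
url = 'http://' + short_domain + '/'

print(cp.ip_gather(short_domain))                 # resolved IP address
print(cp.whois_gather(short_domain))              # WHOIS record
subdomains = cp.subdomains_gather(url, short_domain)
mails, socials, ips = cp.domains_reverse_research(subdomains)
print(mails, socials, ips)                        # aggregated subdomain findings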

dpulse.py

Lines changed: 33 additions & 0 deletions
@@ -0,0 +1,33 @@
"""
Program start point

You can call this script from your system terminal: python dpulse.py -sd "url" -ra "number"

"""

import report_creation as rc
import argparse

attributes_parser = argparse.ArgumentParser(description='OSINT methodology tool')

attributes_parser.add_argument('-sd',
                               '--url',
                               action='store',
                               type=str,
                               required=True,
                               help='Attribute which contains the website to research and investigate. Should be shortened: google.com, github.com')

attributes_parser.add_argument('-ra',
                               '--ra',
                               action='store',
                               type=int,
                               required=True,
                               help='Attribute which specifies the output amount of Google Dorking results.')

args = attributes_parser.parse_args()

print(f"Processing scan of {args.url}")

url = "http://" + str(args.url) + "/"

rc.create_report(str(args.url), url, args.ra)
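
Given the argument parser above, a scan would be started from the terminal roughly like this (the target domain and dork count are illustrative):

python dpulse.py -sd github.com -ra 10

dpulse.py then builds url = "http://github.com/" and hands both values to report_creation.create_report.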

report_creation.py

Lines changed: 83 additions & 0 deletions
@@ -0,0 +1,83 @@
"""
report_creation module

Contains the function that collects all the outputs from the crawl_processor functions and compiles them into a PDF report

Arguments:
    short_domain: website address (-sd value) which you enter in the console
    url: http://short_domain/
    n: -ra amount which you enter in the console
"""

from datetime import datetime
import jinja2
import pdfkit
import os
import crawl_processor as cp

def find_wkhtmltopdf():
    """
    Function which finds the wkhtmltopdf executable file
    """
    root_directory = os.getcwd()
    filename = 'wkhtmltopdf.exe'
    for root, dirs, files in os.walk(root_directory):
        if filename in files:
            return os.path.join(root, filename)
    return None

short_domain = ''
def report_encoding_config():
    """
    Function which sets some configuration options for the PDF report file
    """
    return {
        'encoding': 'UTF-8',
        'enable-local-file-access': True
    }

search_query = []
def create_report(short_domain, url, n):
    """
    Function which calls all the functions from the crawl_processor module and compiles their outputs into a PDF report.
    The PDF report will be saved in the main script directory
    """
    res = cp.whois_gather(short_domain)
    subdomains = cp.subdomains_gather(url, short_domain)
    social_medias = cp.sm_gather(url)
    subdomain_mails, sd_socials, subdomain_ip = cp.domains_reverse_research(subdomains)
    exp_docs, linkedin, related_pages, dinfo = cp.dorking_processing(short_domain, num_results=n, lang="en", sleep_interval=0, timeout=5)

    ctime = datetime.now().strftime('%Y-%m-%d, %Hh%Mm%Ss')
    casename = short_domain.replace(".", "") + '~' + ctime + '.pdf'

    context = {'sh_domain': short_domain, 'full_url': url, 'ip_address': cp.ip_gather(short_domain), 'registrar': res['registrar'],
               'creation_date': res['creation_date'], 'expiration_date': res['expiration_date'],
               'name_servers': ', '.join(res['name_servers']), 'org': res['org'],
               'mails': cp.mail_gather(url), 'subdomain_mails': subdomain_mails, 'subdomain_socials': social_medias,
               'subdomain_ip': subdomain_ip, 'fb_links_s': ', '.join(sd_socials['Facebook']), 'inst_links_s': ', '.join(sd_socials['Instagram']), 'tw_links_s': ', '.join(sd_socials['Twitter']),
               'tg_links_s': ', '.join(sd_socials['Telegram']), 'tt_links_s': ', '.join(sd_socials['TikTok']),
               'li_links_s': ', '.join(sd_socials['LinkedIn']), 'vk_links_s': ', '.join(sd_socials['VKontakte']), 'yt_links_s': ', '.join(sd_socials['YouTube']),
               'subdomains': ', '.join(subdomains), 'fb_links': ', '.join(social_medias['Facebook']),
               'tw_links': ', '.join(social_medias['Twitter']), 'inst_links': ', '.join(social_medias['Instagram']),
               'tg_links': ', '.join(social_medias['Telegram']), 'tt_links': ', '.join(social_medias['TikTok']),
               'li_links': ', '.join(social_medias['LinkedIn']), 'vk_links': ', '.join(social_medias['VKontakte']),
               'yt_links': ', '.join(social_medias['YouTube']), 'exp_docs': exp_docs, 'linkedin': linkedin, 'related_pages': related_pages,
               'dinfo': dinfo, 'ctime': ctime}

    print('Processing report for {} case...'.format(short_domain))

    template_loader = jinja2.FileSystemLoader('./')
    template_env = jinja2.Environment(loader=template_loader)

    template = template_env.get_template('report_template.html')
    output_text = template.render(context)

    # Locate the wkhtmltopdf binary found by find_wkhtmltopdf() and point pdfkit at it
    current_script = os.path.realpath(__file__)
    current_directory = os.path.dirname(current_script)
    print(current_directory)
    file_path = os.path.join(current_directory, find_wkhtmltopdf())
    config = pdfkit.configuration(wkhtmltopdf=file_path)
    pdfkit.from_string(output_text, casename, configuration=config, options=report_encoding_config())
    msg = "Report for {} case was created at {}".format(''.join(short_domain), ctime)
    print(msg)
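
Since create_report takes the same three values that dpulse.py derives from the command line, report generation can also be driven from a Python shell. A minimal sketch, assuming report_template.html and the wkhtmltopdf binary sit next to the script, with an illustrative target domain and dork count:

import report_creation as rc

short_domain = 'example.com'                        # illustrative target, not from the commit
rc.create_report(short_domain, 'http://' + short_domain + '/', 10)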
