"""
crawl_processor module

Contains the functions that search open sources for information connected with a specified domain.
Search results are returned to the report_creation module, which builds the .pdf report.

Arguments:
short_domain: website address entered in the console
url: http://short_domain/
"""

import socket
import re
import urllib.parse
from urllib.parse import urlparse
from collections import defaultdict
from time import sleep

import requests
import whois
from bs4 import BeautifulSoup
from requests import get
from fake_useragent import UserAgent

def ip_gather(short_domain):
    """
    Function for getting the IP address of a website
    """
    print('Processing IP gathering from {}'.format(short_domain))
    ip_address = socket.gethostbyname(short_domain)
    return ip_address
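
# Illustrative usage (hedged): the resolved address depends on live DNS at
# call time, so the value below is an example, not a guarantee.
#   ip_gather('example.com')  ->  '93.184.216.34'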

def whois_gather(short_domain):
    """
    Function for getting WHOIS information for a website
    """
    w = whois.whois(short_domain)
    return w
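
# whois.whois() returns a dict-like WHOIS record; depending on the registry
# it exposes fields such as 'domain_name', 'registrar', 'creation_date' and
# 'emails', readable either as keys or as attributes. Exact fields vary by TLD.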

def mail_gather(url):
    """
    Function for getting emails from website elements
    """
    r = requests.get(url)
    data = r.text
    soup = BeautifulSoup(data, "html.parser")
    mails = []
    # Collect the visible text of every mailto: link on the page
    for i in soup.find_all(href=re.compile("mailto")):
        mails.append(i.string)
    return mails
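
# Illustrative example (hedged): for a page containing
#   <a href="mailto:info@example.com">info@example.com</a>
# the function returns ['info@example.com']. Anchors whose visible content is
# not a single string yield None entries, which callers filter out later.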

def subdomains_gather(url, short_domain):
    """
    Function for subdomain search
    """
    print('Processing subdomain gathering from {}'.format(url))
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    linked_domains = set()

    for link in soup.find_all('a', href=True):
        domain = urlparse(link['href']).netloc
        if domain and domain != urlparse(url).netloc:
            linked_domains.add(domain)

    # Keep only the linked hosts that belong to the target domain
    subdomains = [urllib.parse.unquote(i) for i in linked_domains if short_domain in i]
    return subdomains
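
# Note: only subdomains that the start page references via <a href> are found;
# hosts that are never linked from that page stay invisible to this function.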

# URL substring -> platform name; shared by sm_gather() and
# domains_reverse_research() so the matching rules live in one place
SOCIAL_PATTERNS = {
    'facebook.com': 'Facebook',
    'twitter.com': 'Twitter',
    'instagram.com': 'Instagram',
    't.me': 'Telegram',
    'tiktok.com': 'TikTok',
    'linkedin.com': 'LinkedIn',
    'vk.com': 'VKontakte',
    'youtube.com': 'YouTube',
}

def categorize_links(links):
    """
    Function for sorting links into per-platform buckets using SOCIAL_PATTERNS
    """
    categorized_links = {name: [] for name in SOCIAL_PATTERNS.values()}
    for link in links:
        for pattern, name in SOCIAL_PATTERNS.items():
            if pattern in link:
                categorized_links[name].append(urllib.parse.unquote(link))
                break
    return categorized_links

def sm_gather(url):
    """
    Function for getting links to common social networks from website elements
    """
    print('Processing social media gathering from {}'.format(url))
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = [a['href'] for a in soup.find_all('a', href=True)]
    return categorize_links(links)
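
# Shape of the returned dict (values are illustrative):
#   {'Facebook': ['https://facebook.com/examplepage'], 'Twitter': [],
#    'Instagram': [], 'Telegram': [], 'TikTok': [], 'LinkedIn': [],
#    'VKontakte': [], 'YouTube': []}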

def domains_reverse_research(subdomains):
    """
    Subdomain reverse search function which extracts social networks, emails and IP addresses
    """
    subdomain_urls = ["http://" + subdomain + "/" for subdomain in subdomains]
    subdomain_mails = []
    subdomain_socials = []
    subdomain_ip = []

    for subdomain in subdomains:
        # Handle failures per subdomain so one dead host does not abort the batch
        try:
            subdomain_ip.append(ip_gather(subdomain))
        except (socket.gaierror, requests.exceptions.SSLError, requests.exceptions.ConnectionError):
            print('URL unreachable')
    subdomain_ip = list(set(subdomain_ip))

    for subdomain_url in subdomain_urls:
        try:
            subdomain_mails.append(mail_gather(subdomain_url))
            subdomain_socials.append(sm_gather(subdomain_url))
        except (socket.gaierror, requests.exceptions.SSLError, requests.exceptions.ConnectionError):
            print('URL unreachable')

    # Join the distinct IPs with separators so they stay readable in the report
    subdomain_ip = ', '.join(subdomain_ip)
    subdomain_mails = [sublist for sublist in subdomain_mails if sublist and sublist != [None]]
    subdomain_mails = ', '.join([', '.join(map(str, sublist)) for sublist in subdomain_mails])

    # Drop empty per-platform buckets, then merge what is left across subdomains
    subdomain_socials = [{k: v for k, v in d.items() if v} for d in subdomain_socials]
    subdomain_socials = [d for d in subdomain_socials if d]
    subdomain_socials_grouped = defaultdict(list)

    for d in subdomain_socials:
        for key, value in d.items():
            subdomain_socials_grouped[key].extend(value)

    # Run the merged links back through the shared categorizer
    all_links = [link for links in subdomain_socials_grouped.values() for link in links]
    sd_socials = categorize_links(all_links)

    return subdomain_mails, sd_socials, subdomain_ip
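
# Typical call (hedged; the list normally comes from subdomains_gather):
#   mails, socials, ips = domains_reverse_research(['blog.example.com'])
# mails and ips are comma-separated strings and socials is a per-platform
# dict, ready to be dropped into the report by report_creation.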


def preset(search_query, results, lang, start, timeout):
    """
    Preset function for Google Dorking: sends a single Google search request
    """
    ua = UserAgent()
    resp = get(
        url="https://www.google.com/search",
        headers={
            # A random User-Agent lowers the chance of the request being blocked
            "User-Agent": ua.random
        },
        params={
            "q": search_query,
            "num": results + 2,  # small buffer of extra results per page
            "hl": lang,
            "start": start,
        },
        timeout=timeout,
    )

    resp.raise_for_status()
    return resp
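
# Illustrative call (hedged; assumes Google answers without a CAPTCHA):
#   resp = preset('site:example.com filetype:pdf', 10, 'en', 0, 5)
# raise_for_status() has already run inside preset(), so a returned resp
# means resp.text holds the HTML of the results page.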

def dorking_processing(short_domain, num_results, lang="en", sleep_interval=0, timeout=5):
    """
    Google Dorking automation function
    """
    print('Processing Google Dorking')
    search_queries = ['"{}" filetype:pdf OR filetype:xlsx OR filetype:docx OR filetype:ppt'.format(short_domain),
                      '{} site:linkedin.com/in/'.format(short_domain),
                      'related:{}'.format(short_domain),
                      'info:{}'.format(short_domain)]
    all_results = []
    for search_query in search_queries:
        start = 0
        results = []
        while start < num_results:
            resp = preset(search_query, num_results - start,
                          lang, start, timeout)

            soup = BeautifulSoup(resp.text, "html.parser")
            result_block = soup.find_all("div", attrs={"class": "g"})
            if len(result_block) == 0:
                # No parsable results on this page; advance so the loop terminates
                start += 1
            for result in result_block:
                link = result.find("a", href=True)
                title = result.find("h3")
                description_box = result.find(
                    "div", {"style": "-webkit-line-clamp:2"})
                if description_box:
                    description = description_box.text
                    if link and title and description:
                        start += 1
                        results.append(urllib.parse.unquote(link["href"]))

            sleep(sleep_interval)
        all_results.append(results)

    # Wrap every found URL in <p>...</p> so report_creation can embed it as-is
    return (''.join(f'<p>{item}</p>' for item in all_results[0]),
            ''.join(f'<p>{item}</p>' for item in all_results[1]),
            ''.join(f'<p>{item}</p>' for item in all_results[2]),
            ''.join(f'<p>{item}</p>' for item in all_results[3]))
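

# Hedged smoke test, not part of the normal report pipeline: 'example.com' is
# a placeholder target, every call hits the live network, and any of them may
# raise on an unreachable host. Run the module directly to try it.
if __name__ == '__main__':
    target = 'example.com'               # hypothetical domain
    target_url = 'http://' + target + '/'
    print(ip_gather(target))
    print(whois_gather(target))
    print(mail_gather(target_url))
    found_subdomains = subdomains_gather(target_url, target)
    print(found_subdomains)
    print(sm_gather(target_url))
    print(domains_reverse_research(found_subdomains))
    # dorking_processing(target, 10) is left out: it scrapes Google and is
    # easy to rate-limit when run repeatedly.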
0 commit comments