-
Notifications
You must be signed in to change notification settings - Fork 109
/
Copy pathCloudScraper.py
197 lines (157 loc) · 6.6 KB
/
CloudScraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
from argparse import ArgumentParser
from multiprocessing import Pool
from termcolor import colored
from rfc3987 import parse
import itertools
import requests
import urllib3
import sys
import re
def print_banner():
    '''
    Print the tool description and author credits to stdout.
    '''
    # The leading and trailing empty entries reproduce the blank line
    # before and after the banner text.
    banner_lines = (
        '',
        'CloudScraper is a tool to search through the source code of websites in order to find cloud resources belonging to a target.',
        'by Jordan Potti',
        '@ok_bye_now',
        '',
    )
    print('\n'.join(banner_lines))
def checker(url):
    '''
    Check if the url is a valid one or not.

    Returns True when rfc3987's parse() accepts the string and False when
    it raises ValueError.
    '''
    try:
        parse(url)
    except ValueError:
        return False
    else:
        return True
# Compiled once at import time. The raw string keeps the `\(` and `\)`
# regex escapes from being read as (invalid) Python string escapes.
# NOTE(review): the `[!*\(\), ]` class contains a space, so a match can
# run across whitespace -- confirm this is intended before tightening it.
_URL_PATTERN = re.compile(
    r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\), ]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
)


def gather_links(html):
    '''
    Apply to the raw HTML a regular expression to gather all the urls.

    Candidates rejected by checker() are dropped. Duplicates are removed
    while keeping first-occurrence order, so the result is deterministic
    for a given input.

    html: text of the page to scan.
    Returns a list of unique url strings.
    '''
    candidates = _URL_PATTERN.findall(html)
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(url for url in candidates if checker(url)))
def start(target):
    '''
    Load the initial url and gather the first urls that will be used
    by the spider to keep looking for more links.

    target: url of the page to start from.

    On a network error the function returns without crawling; the error
    is only printed when the verbose flag (-v) is set.
    '''
    print(colored("Beginning search for cloud resources in {}".format(target), color='cyan'))
    try:
        # The timeout keeps an unresponsive server from stalling the run
        # forever; a timeout surfaces as a RequestException and is handled
        # below like any other network error.
        html = requests.get(
            target,
            allow_redirects=True,
            headers=headers,
            verify=arguments.no_verify,
            timeout=30,
        ).text
    except requests.exceptions.RequestException as e:
        if arguments.v:
            print(colored('Network error: {}'.format(e), 'red', attrs=['bold']))
        return

    links = gather_links(html)
    print(colored('Initial links: {}\n'.format(len(links)), color='cyan'))
    spider(links, target)
def worker(url):
    '''
    Function handling all the crawling action of the spider.
    It first checks the desired depth, then makes a GET request,
    parses the HTML and returns all the links found in it.

    Filtering by target domain is not done here; spider() decides which
    urls are handed to the workers.

    url: the url to fetch.
    Returns a list of urls, or an empty list when the url is too deep
    or the request fails.
    '''
    # Depth is measured by counting slashes; the +2 offsets the two
    # slashes of the "scheme://" prefix.
    if url.count("/") > arguments.depth + 2:
        return []

    try:
        # The timeout keeps one unresponsive server from blocking the
        # whole Pool.map() call indefinitely.
        html = requests.get(
            url,
            allow_redirects=True,
            headers=headers,
            verify=arguments.no_verify,
            timeout=30,
        ).text
    except requests.exceptions.RequestException as e:
        if arguments.v:
            print(colored('Network error: {}'.format(e), 'red', attrs=['bold']))
        return []

    links = gather_links(html)
    print('{} links found [{}]'.format(len(links), url))
    return links
def spider(base_urls, target):
    '''
    Loop through the initial links found in the given page. Each new link
    discovered will be added to the list if it's not already there, and
    in-scope links are crawled as well looking for more links.

    base_urls: list of urls already collected; it is extended in place
               with every newly discovered url.
    target:    the target url; its authority defines the crawl scope.

    wannabe works as the placeholder for the urls that are yet to crawl.
    When a round discovers no urls at all, the loop ends and the
    collected links are handed to parser().
    '''
    global target_
    target_ = parse(target)
    scope = target_['authority']

    def in_scope(url):
        # Substring match on the authority, same rule as before.
        return scope in parse(url)['authority']

    # Set mirror of base_urls so the "already seen" test is O(1) instead
    # of a list scan for every discovered url.
    known = set(base_urls)
    wannabe = [url for url in base_urls if in_scope(url)]

    p = Pool(arguments.process)
    try:
        while True:
            # retrieve all the urls returned by the workers
            results = p.map(worker, wannabe)
            # flatten them and remove repeated ones (first-seen order kept)
            new_urls = list(dict.fromkeys(itertools.chain.from_iterable(results)))
            # if new_urls is empty meaning no more urls are being discovered, exit the loop
            if not new_urls:
                break

            wannabe = []
            appended = 0
            for url in new_urls:
                if url in known:
                    continue
                # New url: record it, and queue it for crawling only when
                # it contains the target domain.
                known.add(url)
                base_urls.append(url)
                appended += 1
                if in_scope(url):
                    wannabe.append(url)

            print(colored('\nNew urls appended: {}\n'.format(appended), 'green', attrs=['bold']))
        p.close()
    except BaseException:
        # Do not leave worker processes running if the crawl is aborted.
        p.terminate()
        raise
    finally:
        p.join()

    # once all the links for the given depth have been analyzed, execute the parser
    parser(base_urls)
def parser(links):
    '''
    Once all the links have been gathered check how many of them
    match with the list of cloud domains we are interested in.

    links: iterable of url strings collected by the spider.

    A link matches when any of the cloud domains appears anywhere in it.
    Matches are de-duplicated and printed in sorted order so the report
    is stable between runs.
    '''
    print(colored('Parsing results...', 'cyan', attrs=['bold']))
    cloud_domains = ('amazonaws.com', 'digitaloceanspaces.com', 'windows.net', 'storage.googleapis.com', 'aliyuncs.com')
    matches = sorted(
        {link for link in links if any(domain in link for domain in cloud_domains)}
    )

    print('\nTotal links: ', len(links))
    if not matches:
        print(colored("There were no matches!", 'red', attrs=['bold']))
        return

    print(colored("There were {} matches for this search!".format(len(matches)), 'green', attrs=['bold']))
    for match in matches:
        print(match, "\n")
def args():
    '''
    Build the command line parser and parse sys.argv.

    Returns the argparse namespace with the attributes URL, depth,
    targetlist, v, process and no_verify.

    Note that --no-verify is a store_false flag: no_verify is True by
    default and becomes False when the flag is passed.

    Exits with a usage error (status 2) when no arguments are given or
    when neither -u nor -l provides a target.
    '''
    arg_parser = ArgumentParser()
    arg_parser.add_argument("-u", dest="URL", required=False, help="Target Scope")
    arg_parser.add_argument("-d", dest="depth", type=int, required=False, default=5, help="Max Depth of links Default: 5")
    arg_parser.add_argument("-l", dest="targetlist", required=False, help="Location of text file of Line Delimited targets")
    arg_parser.add_argument("-v", action="store_true", default=False, required=False, help="Verbose output")
    arg_parser.add_argument("-p", dest="process", required=False, default=2, type=int, help="Number of processes to run")
    arg_parser.add_argument("--no-verify", action="store_false", default=True, required=False, help="Skip TLS verification")

    if len(sys.argv) == 1:
        # error() prints the usage line and the message, then exits,
        # so nothing after it is needed.
        arg_parser.error("No arguments given.")

    # output parsed arguments into a usable object
    parsed = arg_parser.parse_args()
    if not (parsed.URL or parsed.targetlist):
        # Without a target there is nothing to crawl; fail here with a
        # usage message rather than later with a TypeError.
        arg_parser.error("No target given: use -u or -l.")
    return parsed
def cleaner(url):
    '''
    Normalize a target given on the command line or read from a file.

    Surrounding whitespace (including the trailing newline of a line
    read from a target list) is removed first. If the result does not
    already start with an http:// or https:// scheme, "https://" is
    prepended.

    url: the raw target string.
    Returns the cleaned url string.
    '''
    url = url.strip()
    # Test the actual scheme prefix, case-insensitively, instead of
    # looking for "http" anywhere in the string.
    if url.lower().startswith(('http://', 'https://')):
        return url
    return "https://" + url
def main():
    '''
    Run the search for every requested target.

    When a target list file was given (-l) each non-blank line of it is
    cleaned and crawled in turn; otherwise the single url given with -u
    is used. Exits with a message when neither is available.
    '''
    if arguments.targetlist:
        with open(arguments.targetlist, 'r') as target_list:
            for line in target_list:
                # Blank lines would otherwise become the bogus target "https://".
                if not line.strip():
                    continue
                start(cleaner(line))
    elif arguments.URL:
        start(cleaner(arguments.URL))
    else:
        sys.exit("No target given: use -u or -l.")
# Browser-like User-Agent sent with every request made by start() and worker().
headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.82 Safari/537.36'
}

# Parsed at import time rather than under the __main__ guard; start() and
# worker() read `arguments` and `headers` as module globals.
# NOTE(review): presumably this also lets Pool workers started with the
# "spawn" method see them -- confirm before moving this under the guard.
arguments = args()

# --no-verify is a store_false flag, so arguments.no_verify is False exactly
# when the user passed it. Only then is TLS verification off, and only then
# are the insecure request warnings silenced.
if not arguments.no_verify:
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

if __name__ == '__main__':
    print_banner()
    main()