dorkasaurus.py.bak
# trying to change the parsing because version 2 no longer passes the HTML to BeautifulSoup properly when using Selenium
########################
# to do:
#   - check whether Google has blocked us (CAPTCHA / "sorry" page)
#   - click the link to redo the search and show omitted results:
#     extend the regex match on the "omitted some entries" string below; if it matches,
#     find the element with the text 'repeat the search with the omitted results included'
#     and click it (see the commented sketch next to the regex check near the bottom)
from selenium import webdriver
from selenium.webdriver import ActionChains
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import mechanize
import urllib
from bs4 import BeautifulSoup
import re
from time import sleep
import random
from pymongo import MongoClient
import sys
# db connection
client = MongoClient('mongodb://127.0.0.1:27017')
db = client.search_engine_scraper # db name
cursor = db.search_results # collection name
mongod_path = r'C:\Users\PK\Desktop\VM_Shared_Folder\mongodata'  # raw string so the backslashes stay literal
# this string shows up on the last page of results
regex_string = re.compile(
    r'(In\sorder\sto\sshow\syou\sthe\smost\srelevant\sresults,\swe\shave\somitted\ssome\sentries\svery\ssimilar\sto\sthe\s)(\d+)')
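# the pattern above targets Google's end-of-results notice, which reads roughly:
#   "In order to show you the most relevant results, we have omitted some entries
#    very similar to the <N> already displayed."
# (wording reconstructed from the regex itself; group 2 captures the omitted count)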
# creates the browser object
# br = mechanize.Browser()
# br.addheaders = [('User-agent', 'Mozilla/5.0')]
# br.set_handle_robots(False)
binary = r'C:\Program Files (x86)\Mozilla Firefox\firefox.exe'  # firefox binary location (the geckodriver path is passed below)
fp = webdriver.FirefoxProfile()  # firefox profile
driver = webdriver.Firefox(firefox_binary=binary, firefox_profile=fp,
                           executable_path=r'C:\Users\PK\Desktop\VM_Shared_Folder\01Jan\instagram\geckodriver.exe')
# open browser in a non google property
random_start_url = ['https://twitter.com', 'http://yahoo.com', 'http://facebook.com']
random_start_url = random.choice(random_start_url)
driver.get(random_start_url)
# leftover from trying to set the results-per-page pref to 100; also gives Google a chance to set cookies
google_url = 'https://google.com'
driver.get(google_url)
driver.refresh()
# base search url ,ready for search term
base_url = 'http://google.com/search?q='
dork_list = ['filetype:xls', 'filetype:pdf']
with open(r'C:\Users\PK\Desktop\VM_Shared_Folder\01Jan\dorkasaurus\dork_list.txt') as f:
    dorks = f.readlines()
target_domain_list = ['directv.com']
with open(r'C:\Users\PK\Desktop\VM_Shared_Folder\01Jan\dorkasaurus\domain_list.txt') as f:
    domains = f.readlines()
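# note: readlines() keeps the trailing '\n' on each line; if these lists get used to build
# queries later, something like [line.strip() for line in f] is probably what's wanted instead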
# search term, url encoded, now ready to append to base_url
raw_search_term = "site:directv.com filetype:pdf"
search_term = urllib.quote_plus(raw_search_term)
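# for example, the line above turns 'site:directv.com filetype:pdf' into
# 'site%3Adirectv.com+filetype%3Apdf' (colons percent-encoded, the space becomes '+')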
hundred_results = '&num=100' # add to end of search request
page_increment = 100  # step size for the &start= parameter; should match how many results we request per page (100 here, 10 by default)
# base_url + search_term, pass to browser GET request
full_url = base_url + str(search_term) + hundred_results
page = 1 # search results page number
result_number = 0  # start at 0 instead of 1 because the count otherwise comes out off by one (cause unknown)
start_results_at = 0 # used to go to next page of results
regex_match = False # Prime the while loop below to run
newList = [] # create list to add dicts of search results to
###########################################
# start while loop
###########################################
while not regex_match: # keep running until we see the regex match = True
    driver.get(full_url)  # conduct the search
    # print 'Sleeping'
    # sleep(10)
    html = driver.page_source  # get page source
    # html = unicode(html, errors='ignore') #skip??
    soup = BeautifulSoup(html, 'html.parser')
    main_div = soup.find_all('div', {'class': 'g'})
    nav = soup.find_all('div', id='foot')
    for i in main_div:
        try:
            # print 'Page: ', str(page)
            # print 'Number: ',str(result_number)
            # title = i.find('h3', {'class': 'r'})
            # print 'Title: ',title.text
            link = i.find('h3', {'class': 'r'}).a['href']
            linkDict = {'searchTerm': raw_search_term,
                        'scraped_link': link,
                        'scraped': 0}
            newList.append(linkDict)
            print link
            result_number += 1
        except:  # result block without an h3 > a link (or the markup changed); skip it
            continue
    print 'Page: ', str(page)  # for logging, page we're on
    print str(result_number), 'results:'  # for logging, running total of results scraped
    print 'newList pre: ', newList  # just to see what we're inserting into the db from this results page
    print
    if newList:  # insert_many() raises on an empty list, e.g. when a page had no parseable results
        posts = cursor.insert_many(newList)  # insert into db
    newList = []  # re-initialize an empty list, ready for the next page's results
    returned_url = driver.current_url
    print 'returned url: ', returned_url
    regex_search = re.findall(regex_string,
                              html)  # search the html source for the 'omitted some entries' string; indicates the last page
    if regex_search:
        regex_match = True  # if we find that string, set the flag so the loop stops
        print 'Found Most relevant Results text, shutting down'
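        # untested sketch for the to-do at the top of the file: instead of stopping here,
        # click Google's 'repeat the search with the omitted results included' link and keep
        # scraping. The exact link text and the partial-link-text lookup are assumptions,
        # not something this script has exercised yet.
        # try:
        #     driver.find_element_by_partial_link_text(
        #         'repeat the search with the omitted results included').click()
        #     regex_match = False  # keep the loop going over the expanded results
        # except Exception:
        #     pass  # link not found; fall through and stop as before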
    start_results_at += page_increment  # appended to the search string to GET the next batch of results;
    # needs to be tied to whether we request 100 results per page or 10
    full_url = base_url + str(search_term) + '&num=' + str(page_increment) + '&start=' + str(
        start_results_at)  # prep to get the next page of search results
    page += 1  # for logging, lets us know what page of results we're on
    sleep_time = random.randint(20, 60)  # sleep to be kind to Google
    print 'Sleeping ' + str(sleep_time) + ' seconds'
    sleep(sleep_time)
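# note: nothing closes the browser or the Mongo connection when the loop ends;
# a driver.quit() and client.close() here are probably wanted once a run is confirmed good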