-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgetBuildsScript.py
241 lines (204 loc) · 9.48 KB
/
getBuildsScript.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
import json
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException, WebDriverException
# Shared Chrome configuration: run without a visible window and with the
# sandbox / shared-memory flags below applied.
options = webdriver.ChromeOptions()
for _flag in ("--headless", "--no-sandbox", "--disable-dev-shm-usage"):
    options.add_argument(_flag)
# A custom user agent is currently disabled; it was previously supplied as:
#   "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"

# Module-level browser session used by the scraping functions in this file.
driver = webdriver.Chrome(options=options)
def create_jsonl_file(
    price,
    name,
    description,
    cpu,
    gpu,
    ram,
    mobo,
    psu,
    storage,
    case,
    output_path="output.jsonl",
):
    """Append one build record to a JSON Lines file.

    The record is a JSON object with a single ``"text"`` key. Its value is an
    instruction-style prompt (system text wrapped in ``<<SYS>>`` markers inside
    an ``[INST]`` block) followed by the bracketed list of chosen components.

    Args:
        price: Price label placed before the word "Build" in the prompt.
        name: Build name.
        description: Free-text build description.
        cpu: CPU name.
        gpu: Video card name.
        ram: Memory name.
        mobo: Motherboard name.
        psu: Power supply name.
        storage: Storage device name.
        case: Case name.
        output_path: File to append to. Defaults to ``"output.jsonl"`` in the
            current working directory, which is what existing callers get.

    Returns:
        None. One line is appended to ``output_path``.
    """
    system_prompt = (
        "You are an AI focused on generating lists of PC components based on "
        "given user requirements. Your task is to provide a list of recommended "
        "parts in a specific order followed by explanations for each choice. "
        "Your responses should be clear, concise, and informative. Do not engage "
        "in regular conversation with the user. Always ensure that your "
        "recommendations are based on the information provided and are "
        "well-reasoned."
    )
    text = (
        f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
        f" {price} Build: {name}, {description} [/INST] "
        f"[<CPU> {cpu}; <GPU> {gpu}; <RAM> {ram}; <MOBO> {mobo}; "
        f"<PSU> {psu}; <STORAGE> {storage}; <CASE> {case}]"
    )
    # Append mode: every call adds one more line, earlier records are kept.
    with open(output_path, "a", encoding="utf-8") as file:
        file.write(json.dumps({"text": text}) + "\n")
def getBuild(urlInput):
    """Scrape one build page, print its details and record it as JSONL.

    Loads ``urlInput`` in the module-level ``driver``, adds the stored
    cookies, then waits for the page title, description, price cell and parts
    table. The part names for the seven required component categories are
    extracted; if all seven are present the build is printed and appended to
    the dataset through ``create_jsonl_file``.

    Args:
        urlInput: URL of the build page. Surrounding whitespace (for example
            the newline kept by ``readlines()``) is stripped before use.

    Returns:
        None. Returns early, after printing a message, when the page elements
        do not appear within the wait timeout or when a required component
        category is missing from the build.
    """
    # Component labels that must all be present for the build to be recorded.
    required_parts = (
        "CPU",
        "Video Card",
        "Memory",
        "Motherboard",
        "Storage",
        "Power Supply",
        "Case",
    )
    # NOTE(security): these session / Cloudflare cookie values are hard-coded
    # credentials captured from a browser session. They are kept so behaviour
    # is unchanged, but they should be loaded from configuration rather than
    # committed to source control.
    session_cookies = (
        (
            "xcsrftoken",
            "4QtJn4lfmjYdGgtEP9bsyEUsazTcEpMjYZRn8TjrLtpJvosG0kXj1b7YQN1iX7PO",
        ),
        (
            "__cf_bm",
            "CE_hQrISQEvMIg4VSPVyQjvaFAzewB1kGNDMIGePEvM-1698565788-0-AZBlI5vQc4wWlFbYEzHRPwWdYV2zaI4Y/igCgMQrypy8chKBSEERmDpbTcnpcHeLpbeC0VM6zqD6+nuahhR1tz4=",
        ),
        (
            "cf_clearance",
            "4E5S6P6_Ei7TRTdwQHg0wYzJMnHlO8VvSXAjxayUgbY-1698561407-0-1-217ba70.4b09a98d.912d3306-0.2.1698561407",
        ),
        ("xsessionid", "6mcd0kzue994cdoygqbvwal2rrvngx53"),
    )

    url = urlInput.strip()
    print("getBuild")
    print(url)

    # Pause before and after navigation, as the original script did.
    time.sleep(2)
    driver.get(url)
    for cookie_name, cookie_value in session_cookies:
        driver.add_cookie({"name": cookie_name, "value": cookie_value})
    time.sleep(2)

    wait = WebDriverWait(driver, 30)  # each wait below gives up after 30 seconds
    try:
        build_name = wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, "pageTitle"))
        ).text
        description = wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, "markdown"))
        ).text
        # First "td.td__price" cell on the page.
        # NOTE(review): confirm this cell is the build total and not a
        # single part's price.
        total_price = wait.until(
            EC.presence_of_element_located((By.CSS_SELECTOR, "td.td__price"))
        ).text
        parts_elements = wait.until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "td__component"))
        )
        parts_nameAndPrice = wait.until(
            EC.presence_of_all_elements_located((By.CLASS_NAME, "td__name"))
        )
        # Read each element's text once while still inside the try block.
        component_labels = [cell.text for cell in parts_elements]
        name_cells = [cell.text for cell in parts_nameAndPrice]
    except TimeoutException:
        print(f"Timed out waiting for elements to appear on {url}!")
        return

    print(name_cells)

    # Map component label -> part name. Only cells whose text is exactly two
    # lines are accepted, and the first line is taken as the part name. If a
    # label occurs more than once, the last matching row wins.
    partsDictionary = {}
    for label, cell_text in zip(component_labels, name_cells):
        cell_lines = cell_text.split("\n")
        if label in required_parts and len(cell_lines) == 2:
            partsDictionary[label] = cell_lines[0]
    print(partsDictionary)

    # Skip builds that lack any required category instead of raising KeyError.
    missing = [part for part in required_parts if part not in partsDictionary]
    if missing:
        print(f"Skipping {url}: missing components {missing}")
        return

    CPU = partsDictionary["CPU"]
    GPU = partsDictionary["Video Card"]
    RAM = partsDictionary["Memory"]
    MOBO = partsDictionary["Motherboard"]
    PSU = partsDictionary["Power Supply"]
    STORAGE = partsDictionary["Storage"]
    CASE = partsDictionary["Case"]

    # Print the scraped details
    print("Build Name:", build_name)
    print("\nDescription:\n", description)
    print("\nPrice:", total_price)
    print("\nParts List:")
    print("\t", "CPU: ", CPU)
    print("\t", "GPU: ", GPU)
    print("\t", "RAM: ", RAM)
    print("\t", "MOBO: ", MOBO)
    print("\t", "PSU: ", PSU)
    print("\t", "STORAGE: ", STORAGE)
    print("\t", "CASE: ", CASE)

    # Append the build to the JSONL dataset.
    create_jsonl_file(
        total_price, build_name, description, CPU, GPU, RAM, MOBO, PSU, STORAGE, CASE
    )
def getAllBuilds(first_page=1, last_page=9):
    """Collect build links from the filtered listing and append them to urls.txt.

    For each page number in ``first_page..last_page`` (inclusive) the listing
    URL is loaded in the module-level ``driver``, the elements with class
    ``logGroup__target`` are awaited, and their ``href`` values are appended
    to ``urls.txt``, one per line. Errors on one page are printed and the
    loop continues with the next page.

    Args:
        first_page: First listing page to visit. Defaults to 1.
        last_page: Last listing page to visit, inclusive. Defaults to 9, so
            the defaults cover the same pages as the original ``range(1, 10)``.

    Returns:
        None.
    """
    # Listing URL with the filter set encoded in the fragment; the page
    # number is appended to it.
    listing_url = "https://pcpartpicker.com/builds/#g=538,541,540,379,380,378,373,369,415,367,390,525,443,500,476,439,450,438,186,185,224,130,165,221,173,436,514,446,425,447,427,448,424,518,499,546,497,513,494,508,506,492,507,516,505,493,520,552,553,550,549,542,539,478,484,445,444,521,517,511,509,522,526,501,523,495,496,498,524,554,558,559,547,548,404,405,527,510,529,530,504,535,435,417,375,393,402,403&s=33,41,14,15,39,37,16,40,21,28,35&X=25000,550000&G=1&page="
    wait = WebDriverWait(driver, 30)  # wait for up to 30 seconds

    for i in range(first_page, last_page + 1):
        try:
            driver.get(listing_url + str(i))
            # Wait for the build links to appear
            build_links = wait.until(
                EC.presence_of_all_elements_located((By.CLASS_NAME, "logGroup__target"))
            )
            print(build_links[0].get_attribute("outerHTML"))
            # get_attribute returns None when the attribute is absent; drop
            # those so the write below cannot fail halfway through a page.
            hrefs = (link.get_attribute("href") for link in build_links)
            urls = [href for href in hrefs if href]
            print(urls)
            # Save urls to file
            with open("urls.txt", "a") as f:
                f.writelines(url + "\n" for url in urls)
        except TimeoutException:
            print(f"Timed out waiting for build links to appear on page {i}!")
        except WebDriverException as e:
            print(f"WebDriverException occurred on page {i}: {e}")
        except Exception as e:
            # Last-resort boundary so one bad page does not stop the crawl.
            print(f"An error occurred on page {i}: {e}")
# getAllBuilds()  # uncomment to (re)populate urls.txt before scraping

# Read the collected build URLs, dropping the trailing newline of each line
# and skipping blank lines.
with open("urls.txt", "r") as f:
    urls = [line.strip() for line in f if line.strip()]

# The browser created at import time is replaced by a fresh session per URL
# below, so close it here instead of leaving its process running.
driver.quit()

# Iterate through each URL and scrape data
for url in urls:
    # Start a new browser session; getBuild() uses the module-level `driver`.
    # NOTE(review): this session is created without the headless `options`
    # defined above, as in the original script - confirm that is intentional.
    driver = webdriver.Chrome()
    try:
        getBuild(url)
    finally:
        # Always close the browser, even if scraping this URL raised.
        driver.quit()

# Example single-build run: getBuild("https://pcpartpicker.com/b/9MK323")