scraper.py
import json
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver

from utils import account_id_to_steam_id, text_between, resource_path


class Scraper:
    """Scrapes Steam group comment pages and discussion threads for trade offer URLs."""

    def __init__(self, config_path="./config.json"):
        self.comment_urls, self.thread_urls = self.__getGroupURLs(resource_path(config_path))

    @staticmethod
    def __getGroupURLs(config_path):
        """Read the lists of comment-page and thread-page URLs from config.json."""
        with open(config_path) as file:
            data = json.load(file)
        return data['groups']['urls']['autotrader']['comments'], data['groups']['urls']['autotrader']['threads']
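
    # Illustrative sketch of the config.json structure this class expects,
    # inferred from the keys read above; the real file may contain additional
    # fields and the URLs shown are placeholders:
    #
    # {
    #     "groups": {
    #         "urls": {
    #             "autotrader": {
    #                 "comments": ["https://steamcommunity.com/groups/<group>/comments"],
    #                 "threads": ["https://steamcommunity.com/groups/<group>/discussions/"]
    #             }
    #         }
    #     }
    # }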
    def getTradeURLsComments(self):
        """Collect trade offer URLs posted in the configured group comment pages.

        Returns a dict mapping 64-bit SteamIDs to trade offer URLs.
        """
        res = {}
        for url in self.comment_urls:
            req = requests.get(url)
            soup = BeautifulSoup(req.content, "html.parser")
            comments = soup.find_all("div", class_="commentthread_comment_content")
            for comment in comments:
                profile_link = comment.find("a", class_="commentthread_author_link")["href"]
                trade_link = comment.find("a", class_="bb_link")
                # Skip comments whose body does not link to steamcommunity.com.
                if trade_link is None or "https://steamcommunity.com" not in trade_link.get("href", ""):
                    continue
                # Skip links that are not trade offer URLs.
                if '?partner=' not in trade_link['href']:
                    continue
                if "profiles/" in profile_link:
                    # Non-vanity profile URL: the 64-bit SteamID is part of the URL itself.
                    steam_id = profile_link.split("profiles/")[1]
                else:
                    # Custom (vanity) profile URL: derive the SteamID from the trade link's partner parameter.
                    try:
                        steam_id = account_id_to_steam_id(text_between(trade_link['href'], '?partner=', '&'))
                    except Exception:
                        continue
                res[steam_id] = trade_link["href"]
        return res
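
    # Worked example of the partner-id conversion above (an assumption; the
    # exact behaviour depends on the helpers in utils.py): given a trade URL
    # like "https://steamcommunity.com/tradeoffer/new/?partner=123456&token=abc",
    # text_between(url, '?partner=', '&') yields "123456", and
    # account_id_to_steam_id is expected to return the 64-bit SteamID
    # 76561197960265728 + 123456 = 76561197960389184.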
    def getTradeURLsThreads(self):
        """Collect trade offer URLs from the opening posts of threads in the configured discussion forums.

        Returns a dict mapping 64-bit SteamIDs to trade offer URLs.
        """
        res = {}
        for url in self.thread_urls:
            req = requests.get(url)
            soup = BeautifulSoup(req.content, "html.parser")
            discussions_url = soup.find_all("a", class_="forum_topic_overlay")
            for disc in discussions_url:
                # Fetch each discussion thread and inspect its opening post.
                req = requests.get(disc["href"])
                soup = BeautifulSoup(req.content, "html.parser")
                trade_info = soup.find("div", class_='forum_op')
                if trade_info:
                    trade_info = trade_info.find('div', class_="content")
                if not trade_info:
                    continue
                trade_link = trade_info.find('a', class_='bb_link')
                if trade_link and 'https://steamcommunity.com/tradeoffer/' in trade_link["href"]:
                    try:
                        steam_id = account_id_to_steam_id(text_between(trade_link["href"], '?partner=', '&'))
                        res[steam_id] = trade_link["href"]
                    except Exception:
                        continue
        return res
    def __get_trade_urls_CS_main_trade_discussion(self, url, soup):
        # Unfinished helper: renders a JavaScript-heavy discussion page with Selenium
        # and dumps its main content for inspection (the soup argument is currently
        # unused). Note that PhantomJS support and the find_element_by_* API were
        # removed from newer Selenium releases.
        driver = webdriver.PhantomJS()
        driver.get(url)
        time.sleep(10)  # crude wait for the page's scripts to finish loading
        p_element = driver.find_element_by_class_name("maincontent")
        print(p_element.get_attribute('innerHTML'))
        return None, None
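

# Minimal usage sketch (an illustration, not part of the original module; it
# assumes a config.json with the structure noted above and network access to
# the configured Steam group pages):
if __name__ == "__main__":
    scraper = Scraper("./config.json")
    trade_urls = scraper.getTradeURLsComments()
    trade_urls.update(scraper.getTradeURLsThreads())
    for steam_id, trade_url in trade_urls.items():
        print(steam_id, trade_url)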