-
Notifications
You must be signed in to change notification settings - Fork 0
/
crawling.py
70 lines (54 loc) · 2.31 KB
/
crawling.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import requests
from bs4 import BeautifulSoup
import numpy as np
import random
import time
#####################################
############# CRAWLING ##############
#####################################
# Browser-like request headers passed to every requests.get() call in this
# module. NOTE: there is no space between "x64)" and "AppleWebKit"; the value
# is kept exactly as it was originally written.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/73.0.3683.86 Safari/537.36'
    ),
}
def add_ids(page_num, user_ids: list, *, sample_len: int = 5, timeout: float = 10.0):
    """Append a random sample of user IDs from one global ranklist page.

    Fetches ``https://www.acmicpc.net/ranklist/{page_num}``, picks up to
    ``sample_len`` rows of the result table at random, and appends the text
    of the link in each picked row's second cell to ``user_ids``.

    Args:
        page_num: Page number interpolated into the ranklist URL.
        user_ids: List that is extended in place with the sampled IDs.
        sample_len: Maximum number of rows to sample. Defaults to 5, the
            value that used to be hard-coded. Clamped to the number of rows
            actually present (and to zero from below).
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises a timeout error, so a stalled connection cannot hang the
            crawl indefinitely.

    Returns:
        None. ``user_ids`` is mutated in place.
    """
    response = requests.get(
        f'https://www.acmicpc.net/ranklist/{page_num}',
        headers=headers,
        timeout=timeout,
    )
    soup = BeautifulSoup(response.text, 'html.parser')
    rows = soup.select('tbody > tr')
    # Original note: a page lists at most 100 users.
    row_count = len(rows)
    pick_count = max(0, min(sample_len, row_count))
    for row_index in random.sample(range(row_count), pick_count):
        link = rows[row_index].select_one('td:nth-child(2) > a')
        # select_one() returns None when a row has no matching link; skip
        # such rows instead of failing on `.text`.
        if link is not None:
            user_ids.append(link.text)
def add_group_ids(group_num, page_num, user_ids: list, *, timeout: float = 10.0):
    """Append every user ID listed on one school/group ranklist page.

    Fetches ``https://www.acmicpc.net/school/ranklist/{group_num}/{page_num}``
    and appends the text of the link in each table row's second cell to
    ``user_ids``.

    Args:
        group_num: Group identifier interpolated into the URL.
        page_num: Page number interpolated into the URL.
        user_ids: List that is extended in place with the IDs found.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises a timeout error, so a stalled connection cannot hang the
            crawl indefinitely.

    Returns:
        None. ``user_ids`` is mutated in place.
    """
    response = requests.get(
        f'https://www.acmicpc.net/school/ranklist/{group_num}/{page_num}',
        headers=headers,
        timeout=timeout,
    )
    soup = BeautifulSoup(response.text, 'html.parser')
    for row in soup.select('tbody > tr'):
        link = row.select_one('td:nth-child(2) > a')
        # select_one() returns None when a row has no matching link; skip
        # such rows instead of failing on `.text`.
        if link is not None:
            user_ids.append(link.text)
def gen_user_problem_mat(id, user_problem: dict, problem_num_set: set, *, timeout: float = 10.0):
    """Record the problem numbers linked on a user's profile page.

    Fetches ``https://www.acmicpc.net/user/{id}``, collects the integer text
    of every ``<a>`` inside each ``div.problem-list`` element, stores them
    as a list under ``user_problem[id]`` (replacing any previous entry), and
    adds each number to ``problem_num_set``.

    Args:
        id: User ID interpolated into the profile URL and used as the key
            in ``user_problem``.
        user_problem: Dict mutated in place; ``user_problem[id]`` is reset
            to a fresh list of the numbers found.
        problem_num_set: Set mutated in place with every number found.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises a timeout error, so a stalled connection cannot hang the
            crawl indefinitely.

    Returns:
        None. Both containers are mutated in place.
    """
    response = requests.get(
        f'https://www.acmicpc.net/user/{id}',
        headers=headers,
        timeout=timeout,
    )
    soup = BeautifulSoup(response.text, 'html.parser')
    problem_lists = soup.select('div.problem-list')
    # Short pause after each profile fetch (presumably to throttle requests
    # to the site).
    time.sleep(0.1)
    numbers = user_problem[id] = []
    for problem_list in problem_lists:
        for link in problem_list.select('a'):
            try:
                number = int(link.text)
            except ValueError:
                # A link whose text is not an integer is skipped so that one
                # unexpected anchor does not abort the crawl and leave the
                # containers half-filled.
                continue
            numbers.append(number)
            problem_num_set.add(number)
def add_to_user_problem_mat(idx, id, user_problem_mat: np.ndarray, *, timeout: float = 10.0):
    """Mark the problems linked on a user's profile page in a matrix row.

    Fetches ``https://www.acmicpc.net/user/{id}`` and, for every ``<a>``
    inside each ``div.problem-list`` element, sets
    ``user_problem_mat[idx, number - 1000]`` to 1, where ``number`` is the
    integer text of the link.

    Args:
        idx: Row index of this user in ``user_problem_mat``.
        id: User ID interpolated into the profile URL.
        user_problem_mat: Matrix mutated in place; column ``c`` corresponds
            to problem number ``c + 1000``.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises a timeout error, so a stalled connection cannot hang the
            crawl indefinitely.

    Returns:
        None. ``user_problem_mat`` is mutated in place. Columns that do not
        fit in the matrix are reported on stdout and skipped.
    """
    response = requests.get(
        f'https://www.acmicpc.net/user/{id}',
        headers=headers,
        timeout=timeout,
    )
    soup = BeautifulSoup(response.text, 'html.parser')
    for problem_list in soup.select('div.problem-list'):
        for link in problem_list.select('a'):
            try:
                column = int(link.text) - 1000
            except ValueError:
                # Link text is not an integer; nothing to mark.
                continue
            if column < 0:
                # A negative index would silently wrap around to the end of
                # the axis and mark the wrong problem, so report it instead.
                print("문제 번호 : " + str(column))
                continue
            try:
                user_problem_mat[idx, column] = 1
            except IndexError:
                # Out-of-range column: report it and keep crawling. Only
                # IndexError is caught, so interrupts and unrelated errors
                # are no longer swallowed.
                print("문제 번호 : " + str(column))