-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path: Crawling_Seoul National University Hospital.py
97 lines (67 loc) · 2.59 KB
/
Crawling_Seoul National University Hospital.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import requests
from bs4 import BeautifulSoup
import json
def get_link_list_per_page(num):
    """Return the article URLs listed on page *num* of the Naver terms category.

    Args:
        num: 1-based page index. The category (cid=51007) spans pages 1..118.

    Returns:
        List of absolute article URLs found on that list page.

    Raises:
        requests.HTTPError: if the list page responds with an error status.
    """
    url = f"https://terms.naver.com/list.naver?cid=51007&categoryId=51007&page={num}"
    # Timeout so one stalled request cannot hang the entire 118-page crawl;
    # raise_for_status so an HTTP error page is not silently parsed as empty.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.select("ul > li > div.info_area > div.subject > strong > a:nth-child(1)")
    # Hrefs are site-relative; prefix the origin to build absolute URLs.
    return ["http://terms.naver.com" + link.attrs['href'] for link in links]
# URLs whose agenda/content section counts did not match in get_elements();
# dumped to invalid_links.json at the end of the script for manual review.
invalid_links = []
def get_elements(url):
    """Crawl one encyclopedia article page and return its fields as a dict.

    Args:
        url: Absolute URL of a terms.naver.com article page.

    Returns:
        Dict with keys '용어' (term), '영문용어' (English term), '요약' (summary),
        plus one key per table-of-contents heading when the number of headings
        matches the number of body paragraphs. When the counts differ, the URL
        is appended to the module-level ``invalid_links`` list instead.

    Raises:
        requests.HTTPError: if the article page responds with an error status.
    """
    temp_dict = {}
    # Timeout + status check: see get_link_list_per_page for rationale.
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # Disease name (Korean + English). Guard against pages that lack the
    # expected markup instead of crashing on a None from select_one().
    title = soup.select_one("div.section_wrap > div.headword_title > h2")
    title_en = soup.select_one("div.section_wrap > div.headword_title > p.word > span")
    temp_dict['용어'] = title.get_text() if title else ''
    temp_dict['영문용어'] = title_en.get_text() if title_en else ''

    # Summary: the first space-separated token is a label, so drop it.
    # Guard both a missing <dl> and text with no space (original code
    # raised IndexError on the latter).
    summary_node = soup.select_one("div.section_wrap > #size_ct > dl")
    if summary_node is not None:
        parts = summary_node.get_text().split(' ', 1)
        temp_dict['요약'] = (parts[1] if len(parts) > 1 else parts[0]).rstrip()
    else:
        temp_dict['요약'] = ''

    # Table-of-contents headings and the body paragraphs they should pair with.
    agendalist = [li.get_text() for li in soup.select("#size_ct > div.tmp_agenda > ol > li")]
    contentlist = [p.get_text() for p in soup.select("#size_ct > p")]

    if len(agendalist) != len(contentlist):
        # Headings and paragraphs do not line up 1:1; record for manual review.
        invalid_links.append(url)
    else:
        for agenda, content in zip(agendalist, contentlist):
            temp_dict[agenda] = content
    return temp_dict
# Crawl every list page of the category (pages 1..118) and collect one
# dict of extracted fields per article.
medic_info = []
for page in range(1, 119):
    for article_url in get_link_list_per_page(page):
        medic_info.append(get_elements(article_url))

# Persist the crawled articles, then the pages that failed the
# heading/paragraph structural check.
with open('medic_info.json', 'w', encoding='utf-8') as file:
    json.dump(medic_info, file, ensure_ascii=False, indent='\t')
with open('invalid_links.json', 'w', encoding='utf-8') as file:
    json.dump(invalid_links, file, ensure_ascii=False, indent='\t')
print(invalid_links)
# NOTE(review): leftover debugging snippet kept as a dead string literal
# (no runtime effect). 'agendatype' is not defined anywhere in this script;
# the inline set below appears to list the section headings observed while
# crawling — TODO confirm with the author before deleting.
'''
print(agendatype)
print(set(agendatype))
#{'식이요법/생활가이드', '관련질병', '예방방법', '원인', '치료', '증상', '진단/검사', '정의', '경과/합병증', '하위질병'}
'''