# ai_crawling.py
import bs4.element
import requests
import sqlalchemy
from bs4 import BeautifulSoup
from datetime import date, datetime
from sqlalchemy import (
    ARRAY,
    CHAR,
    Column,
    DateTime,
    Integer,
    MetaData,
    Table,
    create_engine,
)
from sqlalchemy.orm import sessionmaker, declarative_base

import dev_db

AI_BASE_URL = "http://aix.ssu.ac.kr/"

Base = declarative_base()

db_url = sqlalchemy.engine.URL.create(
    drivername="postgresql+psycopg2",
    username=dev_db.dev_user_name,
    password=dev_db.dev_db_pw,
    host=dev_db.dev_host,
    database=dev_db.dev_db_name,
)
engine = create_engine(db_url)
session_maker = sessionmaker(autoflush=False, autocommit=False, bind=engine)
metadata_obj = MetaData()
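
# A minimal bootstrap sketch for a fresh dev database, assuming no migration
# tool owns the schema (if one does, leave this commented out):
#
#   with engine.connect() as conn:
#       conn.execute(sqlalchemy.text("CREATE SCHEMA IF NOT EXISTS notice"))
#       conn.commit()
#   Base.metadata.create_all(engine)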

class AiNotification(Base):
    __tablename__ = "notice"
    __table_args__ = {"schema": "notice"}

    id = Column(Integer, primary_key=True)
    title = Column(CHAR(1024))
    department_id = Column(Integer)
    content = Column(CHAR(2048))
    category = Column(CHAR(32))
    image_url = Column(ARRAY(CHAR(2048)))
    file_url = Column(ARRAY(CHAR(2048)))
    created_at = Column(DateTime)
    updated_at = Column(DateTime)

    def __init__(self, row: bs4.element.Tag):
        # Rows without <td> cells (e.g. the table header) carry no notice data.
        childrens = row.find_all("td")
        if childrens:
            href = childrens[0].find("a")["href"]
            self.__link = AI_BASE_URL + href
        else:
            return
        req = requests.get(self.__link)
        soup = BeautifulSoup(req.text, "lxml")
        contents = soup.find("table", class_="table").find_all("p")
        # Title
        self.title = childrens[0].text.strip()
        # Body: concatenate every <p> in the detail-page table.
        self.content = ""
        for content in contents:
            self.content += content.text
            # Size guard for the CHAR(2048) column (currently disabled):
            # if len(self.content.encode("utf-8")) > 2048:
            #     print("content size exceeded")
            #     break
        # Category
        self.category = "AI융합학부"
        # Images are not collected yet.
        self.image_url = []
        # Attachments: the first link in the detail table is only printed for
        # inspection; collection is not implemented yet.
        first_file_href = soup.find("table", class_="table").find("a")["href"]
        print(first_file_href)
        # Planned attachment handling (disabled until the file-container markup
        # is confirmed):
        # file_container = soup.find(class_="file")
        # file_link = []
        # if file_container is not None:
        #     for file in file_container.find_all("a"):
        #         file_link.append(AI_BASE_URL + file["href"])
        # self.file_url = file_link
        self.file_url = []
        # Creation date: the third cell of the list row holds "YYYY.MM.DD".
        created_date = list(map(int, childrens[2].text.split(".")))
        self.created_at = date(created_date[0], created_date[1], created_date[2])
        # Update timestamp: store a datetime, not a formatted string, since the
        # column type is DateTime.
        self.updated_at = datetime.now()
        with engine.connect() as connect:
            department_table = Table(
                "department", metadata_obj, schema="main", autoload_with=engine
            )
            # department_table seems to have no AI융합학부 row, so 컴퓨터학부 is
            # used for now.
            query = department_table.select().where(
                department_table.c.name == "컴퓨터학부"
            )
            results = connect.execute(query)
            for result in results:
                self.department_id = result.id

    def __str__(self):
        return (
            "title: {0}\n"
            "content: {1}\n"
            "image_url: {2}\n"
            "file_url: {3}\n"
            "department_id: {4}".format(
                self.title,
                self.content,
                self.image_url,
                self.file_url,
                self.department_id,
            )
        )

def ai_department_crawling(value):
    page = 1  # Pagination is not used yet; only the first page is fetched.
    url = AI_BASE_URL + "notice.html?searchKey=ai"
    req = requests.get(url)
    soup = BeautifulSoup(req.text, "lxml")
    content = soup.find("table", class_="table")
    rows = content.find_all("tr")
    results = []
    # Skip the header row; only the first three notices are collected for now.
    for row in rows[1:4]:
        results.append(AiNotification(row))
    with session_maker() as session:
        for result in results:
            session.add(result)
            # print(result)  # Debug: dump the row before inserting it.
        session.commit()
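
# Pagination sketch, if more pages ever need crawling. Assumption: the board
# accepts a `page` query parameter (not verified against aix.ssu.ac.kr), so
# treat this as illustrative only:
#
#   for page in range(1, last_page + 1):
#       url = AI_BASE_URL + f"notice.html?searchKey=ai&page={page}"
#       soup = BeautifulSoup(requests.get(url).text, "lxml")
#       for row in soup.find("table", class_="table").find_all("tr")[1:]:
#           results.append(AiNotification(row))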

def departments_crawling(value):
    ai_department_crawling(value)


if __name__ == "__main__":
    departments_crawling(1)
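
# Usage: running this module directly (python ai_crawling.py) crawls the first
# three notices from http://aix.ssu.ac.kr/notice.html?searchKey=ai and inserts
# them into the notice.notice table, assuming dev_db supplies valid PostgreSQL
# credentials.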