-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
executable file
·67 lines (53 loc) · 2.14 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python
import os
from argparse import ArgumentParser
from datetime import datetime
from weasyprint import HTML
from requests_html import HTMLSession
# One HTMLSession created at import time; get_page() sends every request
# through it.
SESSION = HTMLSession()
# URL template for a category listing page. process_category() fills in the
# {category} and {page_num} placeholders with str.format().
BASE_URL = "https://www.gktoday.in/current-affairs/category/{category}/page/{page_num}"
def get_page(url: str, timeout: float = 30.0):
    """Fetch *url* through the shared module-level session.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the scraper forever.

    Returns:
        The response object produced by ``SESSION.get``.
    """
    return SESSION.get(url, timeout=timeout)
def parse_date(date: str):
    """Convert an article's publication line into a ``datetime``.

    Args:
        date: Text such as ``"Published: March 3, 2021"``. The leading
            ``"Published:"`` label and surrounding whitespace are optional.

    Returns:
        The ``datetime`` parsed with the ``"%B %d, %Y"`` format.

    Raises:
        ValueError: If the text left after removing the label does not match
            ``"%B %d, %Y"``.
    """
    label = "Published:"
    text = date.strip()
    # str.lstrip() takes a *set of characters*, not a prefix, so
    # lstrip("Published: ") could also eat the first letters of the date
    # itself. Remove the label explicitly instead.
    if text.startswith(label):
        text = text[len(label):]
    return datetime.strptime(text.strip(), "%B %d, %Y")
def save_pdf(html_string, name, dated, category):
    """Render an article's HTML to ``articles/<category>/<YYYYMMDD>/<file>.pdf``.

    Args:
        html_string: HTML markup passed to ``HTML(string=...)`` for rendering.
        name: Article title used in the file name; every ``/`` is replaced
            with ``-`` so the title cannot introduce extra path components.
        dated: Publication text understood by ``parse_date``.
        category: Category name used as the first directory level.
    """
    article_date = parse_date(dated).strftime("%Y%m%d")
    path = "articles/{}/{}".format(category, article_date)
    # exist_ok=True replaces the check-then-create sequence, which could fail
    # if the directory appeared between the check and the creation.
    os.makedirs(path, exist_ok=True)
    safe_name = name.replace("/", "-")
    file_name = "{}/{}-{}.pdf".format(path, article_date, safe_name)
    HTML(string=html_string).write_pdf(file_name)
    print("written file", file_name)
def process_category(category: str, pages: int, page_start: int = 1):
    """Save every article linked from a range of category listing pages as PDF.

    For each listing page from ``page_start`` to ``pages`` (both inclusive),
    the article links are collected, each article is downloaded, and its body
    is handed to ``save_pdf``. Articles whose link, body, or date element
    cannot be found are reported and skipped so that one malformed entry does
    not abort the whole run.

    Args:
        category: Category name inserted into ``BASE_URL``.
        pages: Number of the last listing page to process.
        page_start: Number of the first listing page to process.
    """
    for page_num in range(page_start, pages + 1):
        listing_url = BASE_URL.format(category=category, page_num=page_num)
        category_page = get_page(listing_url)
        titles = category_page.html.find("div.posts-listing > h1 > a")
        for title in titles:
            # The anchor may expose no usable link; popping from an empty
            # collection would raise, so take the first one defensively.
            link = next(iter(title.links), None)
            if link is None:
                print("skipped, no link for", title.text)
                continue
            article_page = get_page(link)
            article = article_page.html.find("div.inside_post", first=True)
            if article is None:
                print("skipped, no article body at", link)
                continue
            date_element = article.find("div.postmeta-primary > p", first=True)
            if date_element is None:
                print("skipped, no publication date at", link)
                continue
            save_pdf(article.html, title.text, date_element.text, category)
        print("processed page ", page_num)
if __name__ == "__main__":
    # Command-line entry point: read the category and page range from the
    # arguments, then hand them to process_category().
    cli = ArgumentParser(
        description="Parse and save as PDF range of pages from gktoday.in for category"
    )
    cli.add_argument(
        "category",
        help="Provide category for the website such as `environment-current-affairs`",
    )
    cli.add_argument(
        "page_num",
        type=int,
        help="Page until which to parse",
    )
    cli.add_argument(
        "--start",
        "-s",
        default=1,
        type=int,
        help="Start parsing from this page",
    )
    options = cli.parse_args()
    process_category(
        category=options.category,
        pages=options.page_num,
        page_start=options.start,
    )