-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhtml_parser.py
77 lines (58 loc) · 2.46 KB
/
html_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
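'''Group the entries of the exported SIMSSA HTML bibliographies by year
and write them out as JSON content files.

Adapted by Emily Hopkins for DDMAL's needs from the parser that Evan Savage
wrote for the SIMSSA site.'''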
# Later adapted by Taz Scott-Talib to work with static website (July 2023)
from bs4 import BeautifulSoup
from urllib.parse import unquote
import json
# Ask which export(s) to convert. The answer is matched case-insensitively
# against the short codes in `input_list`.
print('Media (m,M), presentations (pr, PR), publications (pu, PU) or all (a,A)?\n')
# Strip surrounding whitespace so an answer such as "pu " is still accepted.
choice = input().strip().lower()

# `input_list[i]` is the short code for `full_list[i]`; the extra trailing
# code 'a' selects every entry of `full_list`.
input_list = ['m', 'pr', 'pu', 'a']
full_list = ['media', 'presentations', 'publications']
parse_list = []

if choice not in input_list:
    print('\nTry again, the input was not valid.\n\n')
    # Raise SystemExit directly instead of calling exit(): that helper is
    # added by the `site` module for interactive sessions and is not
    # guaranteed to exist (e.g. under `python -S`). Exit status stays 0.
    raise SystemExit

if choice == 'a':
    # Copy so that later changes to parse_list cannot alter full_list.
    parse_list = list(full_list)
else:
    parse_list = [full_list[input_list.index(choice)]]

ddmal_root_folder = './'
# Folder holding the exported `SIMSSA_<section>.html` files to read.
export_folder = 'zotero_export/'
def _year_from_openurl(metadata):
    '''Return the year stored under ``rft.date=`` in *metadata*.

    *metadata* is the ``title`` attribute of the span that accompanies a
    bibliography entry: an ``&``-separated list of ``key=value`` pairs.
    The year is whatever precedes the first ``-`` of the date value.
    Returns ``'n.d.'`` when the key is absent or its value is empty.
    '''
    _, marker, rest = metadata.partition('rft.date=')
    if not marker:
        return 'n.d.'
    # Cut at '&' (start of the next field), then at '-' (end of the year).
    year = rest.split('&')[0].split('-')[0]
    return year or 'n.d.'


# For every selected section, read its exported HTML bibliography, group the
# entries by year and store the result as JSON in the section's folder.
for section in parse_list:
    html_file_name = f'SIMSSA_{section}.html'
    path = f'activities/{section}/content.json'

    with open(export_folder + html_file_name, encoding='utf-8') as f:
        html_soup = BeautifulSoup(f, 'html.parser')

    # Keys are the years, values are lists with the inner HTML of each entry.
    content = {}
    for html_tag in html_soup.find_all('div', {'class': 'csl-entry'}):
        # The metadata lives in the `title` of a span following the entry.
        # Requiring the attribute skips any purely presentational span, and a
        # missing span no longer raises: the entry is simply filed as undated.
        # Other fields (rft.aulast, rft.title, rft.atitle, rft.btitle) can be
        # read from the same string if needed later; decode them with
        # urllib.parse.unquote.
        metadata_span = html_tag.find_next('span', title=True)
        if metadata_span is None:
            year = 'n.d.'
        else:
            year = _year_from_openurl(metadata_span['title'])
        content.setdefault(year, []).append(html_tag.decode_contents())

    # Years in descending order ('n.d.' sorts before any numeric year);
    # entries alphabetically within each year.
    content = {y: sorted(content[y]) for y in sorted(content, reverse=True)}

    with open(path, 'w', encoding='utf-8') as f:
        json.dump(content, f, indent=4)