forked from DDMAL/ddmal.github.io
-
Notifications
You must be signed in to change notification settings - Fork 0
/
html_parser.py
88 lines (72 loc) · 3.08 KB
/
html_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
'''Turn the HTML bibliographies in zotero_export/ into one Markdown file per citation.

Adapted by Emily Hopkins to DDMAL needs from the parser Evan Savage
wrote for the SIMSSA site.
'''
from bs4 import BeautifulSoup
from urllib.parse import unquote
import re
import os
import markdown
import shutil
# Menu codes accepted at the prompt; position i selects full_list[i] and the
# final code 'a' selects every collection.
input_list = ['m', 'pr', 'pu', 'po', 'a']
full_list = ['media', 'presentations', 'publications', 'posters']

# Output folders ('_<collection>') are created under this root.
ddmal_root_folder = './'
# Folder holding the 'DDMAL_<collection>.html' input files.
export_folder = 'zotero_export/'


def select_collections(choice):
    """Map a menu answer to the list of collections to parse.

    Args:
        choice: Text typed at the prompt. Case and surrounding whitespace
            are ignored.

    Returns:
        Every entry of ``full_list`` for ``'a'``, a one-element list for the
        other codes in ``input_list``, or ``None`` when the answer is not one
        of the accepted codes.
    """
    choice = choice.strip().lower()
    if choice not in input_list:
        return None
    if choice == 'a':
        return list(full_list)
    return [full_list[input_list.index(choice)]]


def coins_field(coins, key, default=None):
    """Return the raw (still percent-encoded) value of ``key`` in ``coins``.

    Args:
        coins: ``&``-separated ``key=value`` string taken from a span's
            ``title`` attribute.
        key: Field name without the trailing ``=``, e.g. ``'rft.date'``.
        default: Value returned when ``key=`` does not occur in ``coins``.

    Returns:
        The text between the first ``key=`` and the next ``&`` (or the end of
        the string), or ``default`` when the field is absent.
    """
    _, marker, rest = coins.partition(key + '=')
    if not marker:
        return default
    return rest.split('&')[0]


def citation_metadata(coins):
    """Extract ``(author, title, year)`` from a span's ``title`` string.

    Missing fields fall back to ``'no_author'``, ``''`` and ``'n.d.'``.
    The title is the first field present among ``rft.title``,
    ``rft.atitle`` and ``rft.btitle``, percent-decoded and with ``/``
    replaced by a space. The year is the part of ``rft.date`` before the
    first ``-``.
    """
    date = coins_field(coins, 'rft.date')
    year = 'n.d.' if date is None else date.split('-')[0]

    last_name = coins_field(coins, 'rft.aulast')
    author = 'no_author' if last_name is None else unquote(last_name)

    title = ''
    for key in ('rft.title', 'rft.atitle', 'rft.btitle'):
        raw_title = coins_field(coins, key)
        if raw_title is not None:
            title = unquote(raw_title).replace('/', ' ')
            break
    return author, title, year


def citation_file_name(author, title, year):
    """Build the ``<author>_<title>_<year>.md`` file name for one citation.

    ``/`` is replaced by a space in the author and the title so the name can
    never point into a sub-directory; spaces in the title become ``_``.
    """
    safe_author = author.replace('/', ' ')
    safe_title = title.replace('/', ' ').replace(' ', '_')
    return safe_author + '_' + safe_title + '_' + year + '.md'


def export_collection(collection):
    """Write one Markdown file per ``csl-entry`` of a collection's HTML file.

    Reads ``DDMAL_<collection>.html`` from ``export_folder``, deletes and
    recreates the ``_<collection>`` folder under ``ddmal_root_folder``, then
    writes each entry to ``_<collection>/<year>/<file name>`` with a small
    front-matter header followed by the entry's inner HTML.

    Raises:
        FileNotFoundError: If the HTML file for ``collection`` is missing.
    """
    html_path = os.path.join(export_folder, f'DDMAL_{collection}.html')
    # Explicit encoding so the result does not depend on the machine's locale.
    with open(html_path, encoding='utf-8') as f:
        html_soup = BeautifulSoup(f, 'html.parser')

    citation_dir = os.path.join(ddmal_root_folder, f'_{collection}')
    # Start from an empty folder; on a first run there is nothing to remove.
    if os.path.isdir(citation_dir):
        shutil.rmtree(citation_dir)
    os.makedirs(citation_dir)

    for html_tag in html_soup.find_all('div', {'class': 'csl-entry'}):
        # The metadata lives in the title attribute of a span following the
        # entry; spans without a title (e.g. inline formatting) are ignored.
        metadata_span = html_tag.find_next('span', title=True)
        if metadata_span is None:
            print('Skipping entry without metadata span:', html_tag.get_text(), '\n')
            continue
        parse_attr = metadata_span['title']

        author, final_title, year = citation_metadata(parse_attr)
        year_dir = os.path.join(citation_dir, year)
        os.makedirs(year_dir, exist_ok=True)

        entry_html = html_tag.decode_contents()
        file_name = citation_file_name(author, final_title, year)
        with open(os.path.join(year_dir, file_name), 'w', encoding='utf-8') as f:
            f.write(f'---\npresentation_year: {year}\nyear: {year}\n---\n\n{entry_html}')

        # Progress output, one group of lines per citation.
        print(entry_html, '\n')
        print(parse_attr, '\n')
        print('T', final_title, '\n\n')


def main():
    """Ask which collection(s) to parse and export each of them."""
    print('Media (m,M), presentations (pr, PR), publications (pu, PU), posters (po, PO), or all (a,A)?\n')
    parse_list = select_collections(input())
    if parse_list is None:
        print('\nTry again, the input was not valid.\n\n')
        # SystemExit is always available, unlike the site-provided exit().
        raise SystemExit
    for collection in parse_list:
        export_collection(collection)


if __name__ == '__main__':
    main()