-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.py
70 lines (56 loc) · 2.05 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
"""
A simple script that retrieves data from the hugo.team site
and stores it while preserving its formatting: bold (**) for <strong> tags,
italics (_) for <em> tags and line breaks (\n) for <br/> tags.
Jijel in 27 Jumada al Alkhir(6) 1440
04 March 2019
By Oussama Heloulou
"""
import json
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
# Site root used to build absolute template URLs.
BASE_URL = 'https://www.hugo.team'
# Listing page whose anchors point at the individual templates.
TEMPLATES_PAGE = BASE_URL + '/meeting-note-templates'
# Collects one dict per scraped template; dumped to JSON at the end.
OUTPUT = []
def get_template_urls(templates_page, limit=21, timeout=30):
    """Collect the absolute URLs of the templates linked from the listing page.

    Args:
        templates_page: URL of the page that lists the templates.
        limit: Maximum number of template links to keep, counted from the
            top of the page. ``None`` keeps every matching link.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of absolute template URLs, in page order.

    Raises:
        requests.RequestException: If the page cannot be fetched, the
            request times out, or the server answers with an error status.
    """
    response = requests.get(templates_page, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of parsing an error page into an
    # empty result.
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    anchors = soup.find_all('a', {'class': 'link-block-29'})[:limit]
    # urljoin copes with relative and absolute hrefs alike; plain string
    # concatenation would corrupt an already-absolute link.
    return [urljoin(BASE_URL, anchor['href']) for anchor in anchors]
def get_template_data(template_url, timeout=30):
    """Scrape the title, description and formatted text of one template page.

    Args:
        template_url: Absolute URL of the template page.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A dict with the keys ``'title'``, ``'description'`` and ``'text'``.

    Raises:
        requests.RequestException: If the page cannot be fetched, the
            request times out, or the server answers with an error status.
        AttributeError: If one of the expected page elements is missing.
    """
    response = requests.get(template_url, timeout=timeout)
    # Surface HTTP errors directly rather than as a confusing failure on a
    # missing element further down.
    response.raise_for_status()
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')

    title = soup.find('div', {'class': 'div-block-594'}).text
    # Separate the word "Template" from whatever precedes it. No flags are
    # passed: the fourth positional argument of re.sub is `count`, and
    # MULTILINE has no effect on a pattern without anchors.
    title = re.sub(r'(Template)', r' \1', title).strip()
    # The inserted space may double an existing one; collapse such runs.
    title = re.sub(r' {2,}', ' ', title)

    description = soup.find('h2', {'class': 'heading-40'}).text
    text_block = soup.find('div', {'class': 'rich-text-block-9'})

    # NOTE(review): only <br>, <em> and <strong> descendants contribute to
    # the text; plain text nodes are not included -- confirm this is intended.
    pieces = []
    # recursive=False restricts the outer loop to direct child *tags*; bare
    # text nodes between them have no descendants to search and are skipped.
    for element in text_block.find_all(True, recursive=False):
        for sub_element in element.find_all(True):
            pieces.append(format_tag_text(sub_element))

    return {
        'title': title,
        'description': description,
        'text': ''.join(pieces),
    }
def format_tag_text(tag):
    """Render a single tag as its plain-text markup equivalent.

    A ``br`` tag becomes a newline, ``em`` text is wrapped in underscores
    and ``strong`` text in double asterisks. Any other tag yields an empty
    string.

    Args:
        tag: Object exposing ``name`` and ``text`` attributes.

    Returns:
        The formatted string for the tag.
    """
    kind = tag.name
    if kind == 'br':
        return '\n'

    # Marker placed on both sides of the tag's text.
    markers = {'em': '_', 'strong': '**'}
    marker = markers.get(kind)
    if marker is None:
        return ''
    return f'{marker}{tag.text}{marker}'
# Scrape every listed template in page order, echoing each URL as progress.
template_urls = get_template_urls(TEMPLATES_PAGE)
for template in template_urls:
    print(template)
    template_data = get_template_data(template)
    OUTPUT += [template_data]

# Write the collected templates as indented JSON, keeping non-ASCII
# characters as-is instead of escaping them.
serialized = json.dumps(OUTPUT, ensure_ascii=False, indent=4)
with open('output.json', 'w', encoding='utf-8') as output_file:
    output_file.write(serialized)