-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhteaml_parser.py
65 lines (57 loc) · 2.41 KB
/
hteaml_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
#
# scrapes pages for Verbal Infusions
# Author: Emily Quinn Finney
#
# Fixes:
# Refactor a few functions so that their purpose is obvious and as generalizable as possible
#
from bs4 import BeautifulSoup
import re
try:
import psyco
psyco.full()
except ImportError:
pass
def read_soup(filename):
    """
    Given a text file holding one or more prettified Beautiful Soup HTML documents,
    lazily yields one Beautiful Soup object per document.

    The file contents are split on the "<!DOCTYPE html>" marker; the marker is put
    back in front of each piece before it is parsed with the 'lxml' parser.
    Pieces that are empty or whitespace-only (e.g. the text before the first
    marker when the file starts with it) are skipped, so no blank page is produced.

    :param filename: the name of the file from which to read the HTML
    :return: generator yielding a Beautiful Soup object for each document in the file
    """
    html_header = "<!DOCTYPE html>"

    # read everything up front so the file handle is released before any page is yielded
    with open(filename, 'r') as f:
        contents = f.read()

    for chunk in contents.split(html_header):
        # splitting removes the marker and leaves an empty leading chunk when the
        # file begins with it; parsing that would only give a page with no content
        if not chunk.strip():
            continue
        # turn the text of one document into a Beautiful Soup object
        yield BeautifulSoup(''.join([html_header, chunk]), 'lxml')
def locate_descriptive_text(structured_page, tag_pattern, tag_names, filename):
    """
    Appends the descriptive text found on one Beautiful Soup page to a file.

    Every <div> whose class matches tag_pattern is inspected. Only the first class
    of the div is compared against tag_names:
      * if it equals tag_names[0], the string of each direct child is written;
      * otherwise, if it equals tag_names[1] and the div contains a <ul>, the string
        of each child of that <ul> is written.
    Children without a string are skipped. Text is written back to back, with no
    separator added between pieces.

    :param structured_page: a web page structured in Beautiful Soup format
    :param tag_pattern: string, regular expression matched against the div class
    :param tag_names: tuple of strings, the two class names whose text is written
    :param filename: string, the name of the file to which the text is appended
    :return: nothing; the text is appended to filename
    """
    with open(filename, 'a') as out:
        # collect every div on the page whose class matches the requested pattern
        matches = structured_page.find_all('div', class_=re.compile(tag_pattern))

        for block in matches:
            first_class = block['class'][0]

            # first kind of block: text sits in the direct children of the div
            if first_class == tag_names[0]:
                for node in block:
                    text = node.string
                    if text:
                        out.write(text)
                continue

            # second kind of block: text sits in the items of an unordered list
            if first_class != tag_names[1] or not block.ul:
                continue
            for item in block.ul.children:
                if item and item.string:
                    out.write(item.string)