-
Notifications
You must be signed in to change notification settings - Fork 116
/
cleanhtml.py
executable file
·81 lines (65 loc) · 2.28 KB
/
cleanhtml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
#!/usr/bin/env python3
# Clean up HTML and prettyprint it.
from bs4 import BeautifulSoup
import re
import sys
def remove_empty_tags(soup):
    """Remove elements with no visible content from *soup*, in place.

    An element is dropped when its text is empty or whitespace-only,
    unless it is an HTML void element (``<br>``, ``<img>``, ``<hr>``, ...)
    or contains one.  Void elements can never hold text, so an empty
    ``.text`` says nothing about whether they matter; removing them would
    glue lines together and discard images unconditionally.

    Args:
        soup: A BeautifulSoup document or tag.  It is modified in place.
    """
    # Elements that by definition have no content and no end tag.
    void_tags = ["area", "base", "br", "col", "embed", "hr", "img",
                 "input", "link", "meta", "source", "track", "wbr"]

    # find_all() hands back a list, so extracting nodes while looping
    # does not disturb the iteration.
    for t in soup.find_all():
        if t.text.strip():
            continue    # has real text: keep
        if t.name in void_tags or t.find(void_tags) is not None:
            continue    # is, or wraps, a void element: keep
        t.extract()
def prettyprint(soup):
    """Simple prettyprinter: just add newlines around block nodes.

    No indentation is added.  Non-breaking spaces (U+00A0, bytes c2 a0 in
    UTF-8) are replaced with ordinary spaces.

    BS4's prettify() is not used because it changes the HTML, adding
    spurious rendered whitespace.
    See https://bugs.launchpad.net/beautifulsoup/+bug/1697296

    Args:
        soup: Any object whose ``str()`` is the HTML markup to format.

    Returns:
        The markup as a string with newlines inserted.
    """
    s = str(soup)

    # Replace non-breaking spaces in the unicode string
    # (note: this assumes Python3, so str is unicode):
    s = s.replace("\u00A0", " ")

    # Tail shared by every start-tag pattern.  The lookahead requires the
    # tag name to END here, so "p" cannot match <pre> and "li" cannot match
    # <link>; [^>]* then swallows any attributes up to the closing ">".
    tag_end = r"(?=[\s/>])[^>]*>"

    # Newline after start and before end:
    s = re.sub(r"(<(?:html|head|body)" + tag_end + ")", "\\1\n", s)
    for tag in ("html", "head", "body"):
        pat = "</%s>" % tag
        s = s.replace(pat, "\n" + pat)

    # Newline before start, leave end alone:
    s = re.sub(r"(<li" + tag_end + ")", "\n\\1", s)

    # Newlines both before and after tags.  "<br />" is covered by the
    # plain "br" alternative, so it is only wrapped once.
    s = re.sub(r"(<(?:p|br|table|/?(?:ul|ol|div))" + tag_end + ")",
               "\n\\1\n", s)

    # Header patterns: blank line before the start, newline after the end.
    s = re.sub(r"(<h[1-6]" + tag_end + ")", "\n\n\\1", s)
    s = re.sub("(</h[1-6]>)", "\\1\n", s)

    return s
def clean_up_html(soup, remove_images=True):
    """Strip presentational clutter from a parsed HTML document, in place.

    Steps, in order:
      1. Call remove_empty_tags() on the document.
      2. Unwrap every <font> element (keep its children, drop the tag).
      3. Unwrap every element that has a class starting with "m_"
         (presumably classes generated by a mail client -- TODO confirm).
      4. If remove_images is true, extract every <img> element.
      5. Delete the inline "style" attribute wherever it appears.

    Args:
        soup: A BeautifulSoup document; it is modified in place.
        remove_images: When true (the default), every <img> element is
            extracted by this function.

    Returns:
        The same *soup* object that was passed in.
    """
    remove_empty_tags(soup)

    for t in soup.find_all('font'):
        t.unwrap()

    # Runs after the <font> pass, so the search sees the already-unwrapped tree.
    for t in soup.find_all(class_=re.compile("^m_")):
        t.unwrap()

    if remove_images:
        for t in soup.find_all("img"):
            t.extract()

    # Remove all inline style attributes:
    for t in soup.find_all(style=True):
        del t["style"]

    return soup
if __name__ == '__main__':
    # Clean and prettyprint every file named on the command line,
    # writing the result to stdout.
    for f in sys.argv[1:]:
        # Open in binary mode so BeautifulSoup does its own encoding
        # detection instead of relying on the locale's default text
        # encoding, which can mis-decode or raise on other charsets.
        with open(f, "rb") as infp:
            soup = BeautifulSoup(infp, "lxml")
        clean = clean_up_html(soup)
        print(prettyprint(clean))