-
Notifications
You must be signed in to change notification settings - Fork 2
/
HTMLTableParser.py
103 lines (90 loc) · 2.58 KB
/
HTMLTableParser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
# A simple HTML table parser. It turns tables (including nested tables) into arrays
# Nigel Sim <[email protected]>
# http://simbot.wordpress.com
from HTMLParser import HTMLParser
import re, string, os
from string import lower
class Table(list):
    """List subclass that marks a parsed ``<table>`` element.

    It adds no behaviour of its own; the distinct type lets the parser
    recognise tables on its stack with ``isinstance``.
    """
class Row(list):
    """List subclass that marks a parsed ``<tr>`` element.

    It adds no behaviour of its own; the distinct type lets the parser
    recognise rows on its stack with ``isinstance``.
    """
class Cell(object):
    """Holder for the single non-text item placed inside a table cell.

    ``data`` is ``None`` until something is appended. A cell keeps only one
    item: a later ``append`` replaces the earlier one instead of
    accumulating.
    """

    def __init__(self):
        self.data = None

    def append(self, item):
        """Store *item* as the cell content.

        If the cell already holds something, a notice naming the value being
        replaced is printed before it is overwritten.
        """
        if self.data is not None:
            # Wrap in a 1-tuple so a stored tuple is formatted as one value
            # instead of being unpacked as multiple format arguments.
            print("Overwriting %s" % (self.data,))
        self.data = item
# Get the item on the top of a stack
def top(x):
    """Return the last element of the sequence *x* without removing it."""
    return x[-1]
class TableParser(HTMLParser):
    """Turn the tables in an HTML document into nested lists.

    After feeding, ``doc`` is a list holding one ``Table`` per top-level
    ``<table>``. Each ``<tr>`` becomes a ``Row`` appended to its enclosing
    element, and each ``<td>`` contributes one entry to its row: the nested
    item stored in the cell (for example an inner ``Table``) when there is
    one, otherwise the text collected since the most recent
    ``<table>``/``<tr>``/``<td>`` start tag.

    Only ``table``, ``tr`` and ``td`` tags are interpreted. Elements that are
    still open when one of their ancestors is closed are discarded, and an
    end tag with no matching open element is ignored.
    """

    def __init__(self, parser=None):
        """Create a parser.

        parser -- optional callable invoked as ``parser(doc)`` from
                  ``close()``. Useful when the TableParser sits in an inner
                  loop and the document should be processed automatically.
                  When omitted, ``close()`` does nothing extra.
        """
        self._tag = None
        self._buf = None
        self._attrs = None
        self.doc = None  # Where the document will be stored
        self._stack = None
        self._parser = parser
        # The base initialiser sets up its own state and then calls
        # self.reset(), which fills in doc, _stack and _buf. It must run
        # after the assignments above so they do not wipe those values.
        HTMLParser.__init__(self)

    def reset(self):
        """Discard all parsed content and start a fresh, empty document."""
        HTMLParser.reset(self)
        self.doc = []
        # The stack of open elements; the document list is always its root.
        self._stack = [self.doc]
        self._buf = ''

    def close(self):
        """Finish parsing and hand ``doc`` to the callback, if one was given."""
        HTMLParser.close(self)
        if self._parser is not None:
            self._parser(self.doc)

    def handle_starttag(self, tag, attrs):
        """Open a table, row or cell; other tags are only recorded."""
        self._tag = tag
        self._attrs = attrs
        name = tag.lower()
        if name == 'table':
            opened = Table()
        elif name == 'tr':
            opened = Row()
        elif name == 'td':
            opened = Cell()
        else:
            return
        # Text seen before this element starts does not belong to it.
        self._buf = ''
        self._stack.append(opened)

    def _pop_to(self, kind):
        """Pop the stack down to the innermost open *kind* and return it.

        Returns None and leaves the stack untouched when nothing of that
        type is open, so a stray end tag can never unwind past the root
        document. Anything opened after the match and never closed is
        dropped.
        """
        if not any(isinstance(entry, kind) for entry in self._stack):
            return None
        entry = self._stack.pop()
        while not isinstance(entry, kind):
            entry = self._stack.pop()
        return entry

    def handle_endtag(self, tag):
        """Close a table, row or cell and attach it to its parent."""
        name = tag.lower()
        if name == 'table':
            closed = self._pop_to(Table)
            if closed is not None:
                top(self._stack).append(closed)
        elif name == 'tr':
            closed = self._pop_to(Row)
            if closed is not None:
                top(self._stack).append(closed)
        elif name == 'td':
            cell = self._pop_to(Cell)
            if cell is not None:
                parent = top(self._stack)
                if isinstance(parent, Row):
                    # We can not currently have text and other table
                    # elements in the same cell. Table elements get
                    # precedence.
                    if cell.data is None:
                        parent.append(self._buf)
                    else:
                        parent.append(cell.data)
                else:
                    print("Cell not in a row, rather in a %s" % (parent,))
        self._tag = None

    def handle_data(self, data):
        """Accumulate text; it becomes a cell value when a ``<td>`` closes."""
        self._buf += data