chunker.py
"""
This module contains a class that does the tough work of chunking / unchunking html.
You may look at :py:class:`.boner.HtmlBoner` if you are looking for a more usable class.
"""
import copy
from collections import namedtuple
from lxml import html
#: a sentence, a list of semantic elts to insert inside sentence, the original parent
TextChunk = namedtuple("TextChunk", "text elts parent istail")
#: a SkipChunk is a chunk of html that does not contain sentences
SkipChunk = namedtuple("SkipChunk", "parent")
#: a semantic elt to insert in a sentence at a specific position
Elt = namedtuple("Elt", "elt start end")

def _s(s):
    """return an empty sentence for None
    """
    return "" if s is None else s


class HtmlChunker:
"""This utility chunk an html in text chunk, retaining the tags.
You can then re-inject the text in html, while eg. adding a specific class.
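
    A rough sketch of the intended round trip (assuming ``tree`` is an lxml html
    element with no parent; the names below are illustrative only)::

        chunker = HtmlChunker()
        chunks = chunker.chunk_tree(tree)
        new_tree = chunker.unchunk(tree, chunks, chunk_classes)

    where ``chunk_classes`` pairs every sentence of every chunk with the css
    classes to apply (see :py:meth:`unchunk` for its expected shape).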
"""

    #: tags that do not split text. Instead we will retain their position for later construction
    #: (cf. https://developer.mozilla.org/en-US/docs/Web/HTML/Element#Inline_text_semantics)
    #: but we removed br, as it generally means a different sentence.
    TEXT_SEMANTICS = {
        "a", "abbr", "b", "bdi", "bdo", "cite", "code", "data", "dfn", "em", "i", "kbd",
        "mark", "q", "rp", "rt", "rtc", "ruby", "s", "samp", "small", "span", "strong", "sub",
        "sup", "time", "u", "var", "wbr"}

    #: we won't search text inside those tags
    SKIPPED_TAGS = {
        "head", "script", "noscript", "style", "canvas"}

    def is_semantic(self, element):
        """:return bool: true if element is a "semantic element" (does not split text).
        """
        return (
            element.tag in self.TEXT_SEMANTICS and
            # verify children are also semantics
            all(self.is_semantic(e) for e in element))

    def shall_skip(self, element):
        """compute if element shall be skipped
        """
        return (
            element.tag in self.SKIPPED_TAGS or
            isinstance(element, html.HtmlComment) or
            not bool(element.text_content().strip()))

    def _process_semantic_child(self, element, start):
        """process the children of an element which is of semantic type.
        This uses recursion.
        :param element: the parent html element
        :param int start: the current position in the text
        :return tuple: the retrieved text,
            and a list of Elt, the semantic sub elements with their points of insertion
        """
        text = _s(element.text)
        elts = []
        for child in element:
            subtext, subelts = self._process_semantic_child(child, start + len(text))
            text += subtext
            elts.extend(subelts)
        elts = [Elt(element, start, start + len(text))] + elts
        text += _s(element.tail)
        return text, elts

    def chunk_tree(self, tree, tail=None):
        """harvest sentences in the html while keeping its structure.
        This uses recursion.
        :param tree: the html element being processed
        :param tail: a list of semantic elements that are part of the tail of this element
        :return list: a list of :py:class:`TextChunk` / :py:class:`SkipChunk`
        """
        if tail is None:
            tail = []
        chunks = []
        if self.shall_skip(tree):
            chunks.append(SkipChunk(tree))
        else:
            # get text and children with their additional tail
            text = _s(tree.text)
            semantics = []
            children = []
            tails = []
            for child in tree:
                if self.is_semantic(child):
                    if not children:
                        # text before the first child, so it's like element text
                        sub_text, sub_elts = self._process_semantic_child(child, len(text))
                        text += sub_text
                        semantics.extend(sub_elts)
                    else:
                        # add to the tail of the last seen child
                        tails[-1].append(child)
                else:
                    # process later
                    children.append(child)
                    tails.append([])
            # make my text
            chunks.append(TextChunk(text, semantics, tree, False))
            # add children sentences
            for child, child_tail in zip(children, tails):
                chunks.extend(self.chunk_tree(child, child_tail))
        # process my tail
        if tree.tail or tail:
            text = _s(tree.tail)
            semantics = []
            for child in tail:
                sub_text, sub_elts = self._process_semantic_child(child, len(text))
                text += sub_text
                semantics.extend(sub_elts)
            chunks.append(TextChunk(text, semantics, tree, True))
        return chunks

    def _crossing_semantics(self, start, end, semantics):
        """from a list of html semantic elements,
        yield the ones that are included between start and end.
        :param int start: start of the interval to consider
        :param int end: end of the interval
        :param semantics: list of Elt
        :return: the Elt to consider for the interval, shifted relative to start
        """
        for s in semantics:
            elt = None
            if s.start >= start and s.end <= end:
                elt = s  # included
            elif s.start <= start and s.end > start:
                elt = Elt(s.elt, start, min(end, s.end))  # crossing lower
            elif s.end >= end and s.start < end:
                elt = Elt(s.elt, max(start, s.start), end)  # crossing upper
            if elt is not None:
                # shift to have start, end relative to actual start
                yield Elt(elt.elt, elt.start - start, elt.end - start)

    def _introduce_semantics(self, sentence, semantics):
        """utility for unchunking: given a sentence and the list of semantic elements to insert,
        return an intro text and etree children to rebuild the original sentence.
        """
        if not semantics:
            return sentence, []
        intro = sentence[: semantics[0].start]
        children = []
        while semantics:
            child = semantics.pop(0)
            included = []
            while semantics and semantics[0].end <= child.end:
                s = semantics.pop(0)
                included.append(Elt(s.elt, s.start - child.start, s.end - child.start))
            # recursive
            child_intro, child_children = self._introduce_semantics(
                sentence[child.start:child.end], included)
            html_child = html.Element(
                child.elt.tag, attrib=child.elt.attrib)
            html_child.text = child_intro
            for c in child_children:
                html_child.append(c)
            # tail is text up to next element
            tail_end = semantics[0].start if semantics else None
            html_child.tail = sentence[child.end: tail_end]
            children.append(html_child)
        return intro, children

    def _rebuild_text(self, chunk, parent, sentence_classes,
                      span_id_prefix, no_class_no_span, chunk_num):
        """add text to parent, sentence by sentence
        """
        # sentence by sentence
        sentence_num = -1
        sentence_start = 0
        last_element = parent
        grand_parent = parent.getparent()
        for sentence, classes in sentence_classes:
            if sentence is None:
                continue
            sentence_num += 1
            sentence_end = sentence_start + len(sentence)
            # reintroduce semantic elts
            crossing = list(
                self._crossing_semantics(sentence_start, sentence_end, chunk.elts))
            text, children = self._introduce_semantics(sentence, crossing)
            if classes or not no_class_no_span:
                if isinstance(classes, str):
                    classes = [classes]
                # we enclose the text in a span of the right class
                attrib = {}
                if span_id_prefix is not None:
                    attrib["id"] = "%s%d-%d" % (span_id_prefix, chunk_num, sentence_num)
                if classes:
                    attrib["class"] = " ".join(classes)
                span = html.Element("span", attrib=attrib)
                span.text = text
                for child in children:
                    span.append(child)
                if chunk.istail:
                    # by chance we are processing in an ordered manner, so we can append
                    grand_parent.append(span)
                else:
                    parent.append(span)
                last_element = span
            else:
                # we introduce our text
                if last_element != parent or chunk.istail:
                    last_element.tail = _s(last_element.tail) + text
                else:
                    last_element.text = _s(last_element.text) + text
                # and children
                if children:
                    _parent = grand_parent if chunk.istail else parent
                    for child in children:
                        _parent.append(child)
                    last_element = children[-1]
            sentence_start = sentence_end

    def unchunk(self, tree, chunks, chunk_classes, span_id_prefix="chunk-", no_class_no_span=True):
        """given a tree, its chunks and the classifications of the sentences in those chunks,
        rebuild the html with spans and classes inside.
        :param tree: the tree the chunks were generated from
        :param chunks: a list of TextChunk / SkipChunk
        :param chunk_classes: for each chunk, a list of (sentence, classes) pairs,
            where classes may be a string, a list of class names, or None
        :param span_id_prefix: a prefix for the ids of the spans added to the html
        :param no_class_no_span: if True, sentences with no class are re-injected without a span
        :return: a new tree
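
        Example shape of ``chunk_classes`` (illustrative only; the class names are
        placeholders, and the sentences of a chunk must concatenate back to the
        chunk's text so the semantic elements land at the right offsets)::

            [
                [],                                    # entry for a SkipChunk (unused)
                [("First sentence. ", ["negative"]),
                 ("Second sentence.", None)],          # entry for a TextChunk
            ]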
"""
old_to_new = {} # track elements between old and new tree
for chunk_num, (chunk, sentence_classes) in enumerate(zip(chunks, chunk_classes)):
if isinstance(chunk, SkipChunk):
# just clone
if chunk.parent is not None and chunk.parent.getparent() is not None:
new_parent = old_to_new[chunk.parent.getparent()]
else:
# append to root
new_parent = tree
new = copy.deepcopy(chunk.parent)
# remove tail, as it was processed
new.tail = ""
new_parent.append(new)
old_to_new[chunk.parent] = new
elif not chunk.istail:
old_parent = chunk.parent
# make new parent
new_parent = html.Element(
old_parent.tag, attrib=old_parent.attrib, nsmap=old_parent.nsmap)
# map new and old
old_to_new[old_parent] = new_parent
# add to corresponding node, order is conserved by chunks so it works well
old_ancestor = old_parent.getparent()
if old_ancestor is not None:
new_ancestor = old_to_new[old_ancestor]
new_ancestor.append(new_parent)
else:
# this is tail so append text to parent
if chunk.parent is not None:
new_parent = old_to_new[chunk.parent]
else:
# append to root
new_parent = tree
if not isinstance(chunk, SkipChunk):
self._rebuild_text(
chunk, new_parent, sentence_classes,
span_id_prefix, no_class_no_span, chunk_num)
return old_to_new[tree]
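

if __name__ == "__main__":
    # A minimal round-trip sketch (illustrative only): chunk a small document,
    # tag every non-empty chunk text as a single "sentence" with a placeholder
    # "highlight" class, and rebuild the html with the corresponding spans.
    chunker = HtmlChunker()
    tree = html.fromstring(
        "<html><body><p>Hello <b>world</b>! How are you?</p></body></html>")
    chunks = chunker.chunk_tree(tree)
    chunk_classes = []
    for chunk in chunks:
        if isinstance(chunk, SkipChunk):
            # the entry is unused for a SkipChunk
            chunk_classes.append([])
        elif chunk.text.strip():
            # treat the whole chunk text as one sentence and tag it
            chunk_classes.append([(chunk.text, ["highlight"])])
        else:
            # re-inject whitespace-only text untouched (no class, no span)
            chunk_classes.append([(chunk.text, None)])
    new_tree = chunker.unchunk(tree, chunks, chunk_classes)
    print(html.tostring(new_tree, pretty_print=True).decode())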