"""
Utilities for simple text analysis: word frequencies and co-occurrence graph.
These tools can be used to analyze a plain text file treating it as a list of
newline-separated sentences (e.g. a list of paper titles).
It computes word frequencies (after doing some naive normalization by
lowercasing and throwing away a few overly common words). It also computes,
from the most common words, a weighted graph of word co-occurrences and
displays it, as well as summarizing the graph structure by ranking its nodes in
descending order of eigenvector centrality.
This is meant as an illustration of text processing in Python, using matplotlib
for visualization and NetworkX for graph-theoretical manipulation. It should
not be considered production-strength code for serious text analysis.
Author: Fernando Perez <[email protected]>
"""
#-----------------------------------------------------------------------------
# Imports
#-----------------------------------------------------------------------------
# From the standard library
import re

# Third-party libraries
import networkx as nx
import numpy as np
from matplotlib import pyplot as plt

plt.rc('figure', figsize=(8, 5))
#-----------------------------------------------------------------------------
# Function definitions
#-----------------------------------------------------------------------------
def rescale_arr(arr, amin, amax):
    """Rescale an array to a new range.

    Return a new array whose range of values is (amin, amax).

    Parameters
    ----------
    arr : array-like
    amin : float
        new minimum value
    amax : float
        new maximum value

    Examples
    --------
    >>> a = np.arange(5)
    >>> rescale_arr(a, 3, 6)
    array([ 3.  ,  3.75,  4.5 ,  5.25,  6.  ])
    """
    # old bounds
    m = arr.min()
    M = arr.max()
    # scale/offset
    s = float(amax - amin) / (M - m)
    d = amin - s * m
    # Apply clip before returning to cut off possible overflows outside the
    # intended range due to roundoff error, so that we can absolutely
    # guarantee that on output, there are no values > amax or < amin.
    return np.clip(s * arr + d, amin, amax)
def all_pairs(items):
"""Make all unique pairs (order doesn't matter)"""
pairs = []
nitems = len(items)
for i, wi in enumerate(items):
for j in range(i+1, nitems):
pairs.append((wi, items[j]))
return pairs
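
# Aside (not part of the original module): all_pairs matches the standard
# library's itertools.combinations with r=2, which could be used instead:
#
#   >>> all_pairs(['a', 'b', 'c'])
#   [('a', 'b'), ('a', 'c'), ('b', 'c')]
#   >>> import itertools
#   >>> list(itertools.combinations(['a', 'b', 'c'], 2))
#   [('a', 'b'), ('a', 'c'), ('b', 'c')]
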
def removal_set(words, query):
"""Create a set of words for removal for a given query."""
rem = set(words.split())
qw = [w.lower() for w in query.split()]
for w in qw:
rem.add(w)
rem.add('#' + w)
qq = ''.join(qw)
rem.add(qq)
rem.add('#' + qq)
return rem
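
# Illustration (hypothetical stop words and query, not from the original
# source): the removal set contains the stop words, each query word, its
# '#'-hashtag form, and the concatenated query:
#
#   >>> sorted(removal_set('the of', 'Deep Learning'))
#   ['#deep', '#deeplearning', '#learning', 'deep', 'deeplearning', 'learning', 'of', 'the']
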
def lines_cleanup(lines, min_length=4, remove=None):
"""Clean up a list of lowercase strings of text for simple analysis.
Splits on whitespace, removes all 'words' less than `min_length` characters
long, and those in the `remove` set.
Returns a list of strings.
"""
    remove = set(remove) if remove is not None else set()
filtered = []
for line in lines:
a = []
for w in line.lower().split():
wnorm = w.rstrip('.,:').replace('[', '').replace(']', '')
if len(wnorm) >= min_length and wnorm not in remove:
a.append(wnorm)
filtered.append(' '.join(a))
return filtered
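
# Example (illustrative input): short words, trailing punctuation, and words
# in the `remove` set are all dropped:
#
#   >>> lines_cleanup(['A Survey of Deep Learning.', 'Deep nets, in brief'],
#   ...               remove={'deep'})
#   ['survey learning', 'nets brief']
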
def print_vk(lst):
    """Print a list of (word, count) pairs nicely formatted, one per line."""
    # Find the longest word so the arrows line up; each pair in the list is
    # (word, count), so the word is element [0]
    longest_key = max(len(word) for word, count in lst)
    # Make a format string out of it
    fmt = '%' + str(longest_key) + 's -> %s'
    # Do actual printing
    for k, v in lst:
        print(fmt % (k, v))
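
# Sample output (illustrative data): words are right-aligned on the arrows:
#
#   >>> print_vk([('graph', 12), ('net', 7)])
#   graph -> 12
#     net -> 7
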
def word_freq(text):
    """Return a dictionary of word frequencies for the given text.

    Input text should be given as an iterable of strings."""
freqs = {}
for word in text:
freqs[word] = freqs.get(word, 0) + 1
return freqs
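
# Aside (not in the original): this hand-rolled histogram agrees with
# collections.Counter, which could serve as a drop-in replacement:
#
#   >>> from collections import Counter
#   >>> word_freq(['graph', 'word', 'graph']) == Counter(['graph', 'word', 'graph'])
#   True
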
def sort_freqs(freqs):
    """Sort a word frequency histogram represented as a dictionary.

    Parameters
    ----------
    freqs : dict
        A dict with string keys and integer values.

    Returns
    -------
    items : list
        A list of (word, count) pairs, in ascending order of count.
    """
    return sorted(freqs.items(), key=lambda wc: wc[1])
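
# For example (illustrative data):
#
#   >>> sort_freqs({'graph': 3, 'word': 1, 'node': 2})
#   [('word', 1), ('node', 2), ('graph', 3)]
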
def summarize_freq_hist(freqs, n=10):
"""Print a simple summary of a word frequencies dictionary.
    Parameters
    ----------
    freqs : dict or list
        Word frequencies, represented either as a dict of word->count, or as
        a list of (word, count) pairs.
n : int
The number of least/most frequent words to print.
"""
items = sort_freqs(freqs) if isinstance(freqs, dict) else freqs
    print('Number of unique words:', len(freqs))
print('')
print('%d least frequent words:' % n)
print_vk(items[:n])
print('')
print('%d most frequent words:' % n)
print_vk(items[-n:])
def co_occurrences(lines, words):
"""Return histogram of co-occurrences of words in a list of lines.
Parameters
----------
lines : list
A list of strings considered as 'sentences' to search for co-occurrences.
words : list
A list of words from which all unordered pairs will be constructed and
searched for co-occurrences.
"""
wpairs = all_pairs(words)
# Now build histogram of co-occurrences
co_occur = {}
    for w1, w2 in wpairs:
        # Escape the words in case any regex metacharacters survived cleanup
        rx = re.compile('%s .*%s|%s .*%s' % (re.escape(w1), re.escape(w2),
                                             re.escape(w2), re.escape(w1)))
        co_occur[w1, w2] = sum(1 for line in lines if rx.search(line))
return co_occur
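
# Example (illustrative data): each line contributes at most 1 to a pair's
# count, and a pair matches whichever order its words appear in:
#
#   >>> co_occurrences(['deep learning for graphs'], ['deep', 'graphs'])
#   {('deep', 'graphs'): 1}
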
def co_occurrences_graph(word_hist, co_occur, cutoff=0):
"""Convert a word histogram with co-occurrences to a weighted graph.
Edges are only added if the count is above cutoff.
"""
g = nx.Graph()
for word, count in word_hist:
g.add_node(word, count=count)
    for (w1, w2), count in co_occur.items():
        if count <= cutoff:
continue
g.add_edge(w1, w2, weight=count)
return g
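
# For instance (illustrative data): with one pair co-occurring twice, a
# cutoff of 1 keeps the edge while a cutoff of 2 drops it:
#
#   >>> g = co_occurrences_graph([('deep', 5), ('graphs', 3)],
#   ...                          {('deep', 'graphs'): 2}, cutoff=1)
#   >>> g.number_of_edges()
#   1
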
# Hack: offset the most central node to avoid too much overlap
rad0 = 0.3
def centrality_layout(wgraph, centrality):
"""Compute a layout based on centrality.
"""
# Create a list of centralities, sorted by centrality value
cent = sorted(centrality.items(), key=lambda x:float(x[1]), reverse=True)
nodes = [c[0] for c in cent]
cent = np.array([float(c[1]) for c in cent])
rad = (cent - cent[0])/(cent[-1]-cent[0])
rad = rescale_arr(rad, rad0, 1)
    # endpoint=False keeps the first and last nodes from landing on the same
    # angle (0 and 2*pi coincide on the circle)
    angles = np.linspace(0, 2*np.pi, len(centrality), endpoint=False)
layout = {}
for n, node in enumerate(nodes):
r = rad[n]
th = angles[n]
layout[node] = r*np.cos(th), r*np.sin(th)
return layout
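
# Typical usage (a sketch; `g` is an assumed, already-built co-occurrence
# graph, and pairing this layout with eigenvector centrality follows the
# module docstring, but these exact calls are not part of the original file):
#
#   cent = nx.eigenvector_centrality(g)
#   pos = centrality_layout(g, cent)
#   plot_graph(g, pos=pos, title='Co-occurrence graph')
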
def plot_graph(wgraph, pos=None, fig=None, title=None):
"""Conveniently summarize graph visually"""
# config parameters
    edge_min_width = 3
    edge_max_width = 12
label_font = 18
node_font = 22
node_alpha = 0.4
edge_alpha = 0.55
edge_cmap = plt.cm.Spectral
# Create figure
if fig is None:
fig, ax = plt.subplots()
else:
ax = fig.add_subplot(111)
    fig.subplots_adjust(left=0, bottom=0, right=1)
# Plot nodes with size according to count
sizes = []
degrees = []
for n, d in wgraph.nodes(data=True):
        sizes.append(d.get('count', 0))
degrees.append(wgraph.degree(n))
sizes = rescale_arr(np.array(sizes, dtype=float), 100, 1000)
# Compute layout and label edges according to weight
pos = nx.spring_layout(wgraph) if pos is None else pos
labels = {}
width = []
for n1, n2, d in wgraph.edges(data=True):
w = d['weight']
labels[n1, n2] = w
width.append(w)
width = rescale_arr(np.array(width, dtype=float), edge_min_width,
edge_max_width)
# Draw
nx.draw_networkx_nodes(wgraph, pos, node_size=sizes, node_color=degrees,
alpha=node_alpha)
nx.draw_networkx_edges(wgraph, pos, width=width, edge_color=width,
edge_cmap=edge_cmap, alpha=edge_alpha)
nx.draw_networkx_edge_labels(wgraph, pos, edge_labels=labels,
font_size=label_font)
nx.draw_networkx_labels(wgraph, pos, font_size=node_font, font_weight='bold')
if title is not None:
ax.set_title(title, fontsize=label_font)
ax.set_xticks([])
ax.set_yticks([])
# Mark centrality axes
kw = dict(color='k', linestyle='-')
cross = [ax.axhline(0, **kw), ax.axvline(rad0, **kw)]
    for line in cross:
        line.set_zorder(0)
def plot_word_histogram(freqs, show=10, title=None):
"""Plot a histogram of word frequencies, limited to the top `show` ones.
"""
sorted_f = sort_freqs(freqs) if isinstance(freqs, dict) else freqs
# Don't show the tail
if isinstance(show, int):
# interpret as number of words to show in histogram
show_f = sorted_f[-show:]
else:
# interpret as a fraction
start = -int(round(show*len(freqs)))
show_f = sorted_f[start:]
# Now, extract words and counts, plot
n_words = len(show_f)
ind = np.arange(n_words)
words = [i[0] for i in show_f]
counts = [i[1] for i in show_f]
fig = plt.figure()
ax = fig.add_subplot(111)
    if n_words <= 20:
# Only show bars and x labels for small histograms, they don't make
# sense otherwise
ax.bar(ind, counts)
ax.set_xticks(ind)
ax.set_xticklabels(words, rotation=45)
fig.subplots_adjust(bottom=0.25)
else:
# For larger ones, do a step plot
ax.step(ind, counts)
# If it spans more than two decades, use a log scale
if float(max(counts))/min(counts) > 100:
ax.set_yscale('log')
if title:
ax.set_title(title)
return ax
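
# Usage note: `show` may be an int (plot the top-N words) or a float fraction
# of the vocabulary. Illustrative calls (`freqs` is assumed to come from
# word_freq or sort_freqs):
#
#   plot_word_histogram(freqs, show=25, title='Top 25 words')
#   plot_word_histogram(freqs, show=0.1, title='Top 10% of words')
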
def summarize_centrality(centrality):
    """Print a table of nodes ranked by descending centrality."""
    c = sorted(centrality.items(), key=lambda x: x[1], reverse=True)
    print('\nGraph centrality')
    for node, cent in c:
        print("%15s: %.3g" % (node, float(cent)))