sentences.py
import fnmatch
import os
import re

# Preprocessing pattern for COW-style corpora: strips HTML/XML entities and
# whitespace-delimited tokens that are a single letter (other than 'u') or
# consist only of non-alphabetic characters.
COW_preprocess = re.compile(r"(&.*?;)|((?<=\s)([a-tv-z]|[^a-zA-Z\s]+?))(?=(\s))")

# Generic preprocessing pattern: strips single letters (other than 'u')
# surrounded by whitespace, HTML/XML entities, and punctuation or digits.
punct = re.compile(r"\s[a-tv-z]\s|(&.*?;)|[^a-z\s]+?", re.I)
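
# Illustrative sketch (not part of the original module): both patterns are
# meant to be applied as pattern.sub(" ", line.lower()) before splitting.
# For example, a made-up line such as
#     "the cat &amp; the dog , sat !"
# comes out of punct.sub(" ", ...).split() as
#     ['the', 'cat', 'the', 'dog', 'sat']
# i.e. HTML entities and punctuation are replaced by spaces; stray single
# letters other than 'u' would be dropped in the same way.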


class SentenceIter(object):
    """Lazily iterate over sentences which can be spread over multiple files."""

    def __init__(self, root, extension='*.txt', iscow=False):
        """
        A sentence iterator for use with the word2vec and SPPMI functions.
        It iterates over all files with a given extension in a folder and
        its subfolders.

        :param root: The root folder.
        :param extension: The file extension to look for.
        :param iscow: If True, the COW-specific regex is used for
            preprocessing; otherwise the generic punctuation regex is used.
        """
        self.path = root
        self.filenames = []
        # Use the COW-specific pattern for COW corpora and the generic
        # punctuation pattern otherwise.
        if iscow:
            self.regex = COW_preprocess
        else:
            self.regex = punct

        for root, dirnames, filenames in os.walk(self.path):
            for filename in fnmatch.filter(filenames, extension):
                self.filenames.append(os.path.join(root, filename))

    def __iter__(self):
        """
        Iterate over the corpus, yielding tokenized sentences which can be
        passed to the SPPMI and word2vec functions.
        """
        # Assume the corpus consists of multiple documents.
        for filename in self.filenames:
            # self.filenames already holds the full paths built in __init__,
            # so the files can be opened directly.
            with open(filename, encoding='utf-8') as corpus_file:
                for line in corpus_file:
                    line = line.lower()
                    # Skip lines with fewer than 5 tokens.
                    if len(line.split()) < 5:
                        continue
                    yield self.regex.sub(" ", line).split()
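

if __name__ == "__main__":
    # Minimal usage sketch, not part of the original module. It assumes the
    # sentences are fed to gensim's Word2Vec implementation; the corpus path
    # below is a placeholder. Because SentenceIter builds a fresh generator on
    # every __iter__ call, the same instance can be handed to trainers that
    # need to scan the corpus more than once.
    from gensim.models import Word2Vec

    sentences = SentenceIter("path/to/corpus", extension="*.txt", iscow=False)
    model = Word2Vec(sentences, min_count=5, workers=4)
    model.save("word2vec.model")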