-
Notifications
You must be signed in to change notification settings - Fork 2
/
text_util.py
119 lines (80 loc) · 2.7 KB
/
text_util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
import string
# Load the document in memory
def load_doc(filename):
    """Read an entire text file into memory and return it as one string.

    Args:
        filename: Path of the file to read. It is opened in text mode with
            the platform's default encoding.

    Returns:
        The full contents of the file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    # The context manager closes the handle even when read() raises,
    # which a manual open()/close() pair does not guarantee.
    with open(filename, 'r') as file:
        return file.read()
# Extract descriptions from loaded document
def load_descriptions(doc):
    """Group the descriptions found in a loaded document by image ID.

    Every line is split on whitespace. The first token is the image
    identifier, truncated at its first '.', and the remaining tokens are
    re-joined with single spaces to form the description.

    Args:
        doc: The document text, one "<image id> <description>" entry per
            line.

    Returns:
        A dict mapping each image ID to the list of its descriptions, in the
        order they appear in the document. A line holding only an identifier
        contributes an empty description.
    """
    description_map = dict()
    for line in doc.split('\n'):
        tokens = line.split()
        # Skip lines shorter than two characters, and also whitespace-only
        # lines: they yield no tokens, so tokens[0] would raise IndexError.
        if len(line) < 2 or not tokens:
            continue
        # Drop the file extension (and anything after it) from the image ID.
        image_id = tokens[0].split('.')[0]
        image_desc = ' '.join(tokens[1:])
        description_map.setdefault(image_id, []).append(image_desc)
    return description_map
# Clean the descriptions
def clean_descriptions(descriptions):
    """Normalize every description in place.

    Each description is split on whitespace; every word is lower-cased and
    stripped of ASCII punctuation. Words that end up one character or shorter,
    or that contain anything other than letters, are dropped. The surviving
    words are re-joined with single spaces.

    Args:
        descriptions: Dict mapping an image ID to a list of description
            strings. The lists are modified in place.

    Returns:
        None.
    """
    # Translation table that deletes every character in string.punctuation.
    table = str.maketrans('', '', string.punctuation)
    for desc_list in descriptions.values():
        for i, desc in enumerate(desc_list):
            # Lower-case first, then strip punctuation, so the length and
            # isalpha checks below see the fully normalized word.
            words = (word.lower().translate(table) for word in desc.split())
            desc_list[i] = ' '.join(
                word for word in words if len(word) > 1 and word.isalpha()
            )
# Convert Description to vocabulary
def to_vocabulary(descriptions):
    """Collect the set of distinct words used across all descriptions.

    Args:
        descriptions: Dict mapping an image ID to a list of description
            strings.

    Returns:
        A set holding every whitespace-separated word that occurs in any
        description.
    """
    all_desc = set()
    # Plain loops rather than a comprehension: the update is a side effect,
    # and a comprehension would only build a throwaway list of None values.
    for desc_list in descriptions.values():
        for desc in desc_list:
            all_desc.update(desc.split())
    return all_desc
# Save descriptions
def save_descriptions(descriptions, filename):
    """Write all descriptions to a text file, one per line.

    Each output line has the form "<image id> <description>". Lines are
    separated by a newline, with no trailing newline after the last one.

    Args:
        descriptions: Dict mapping an image ID to a list of description
            strings.
        filename: Path of the file to write. An existing file is overwritten.

    Returns:
        None.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    lines = list()
    for key, desc_list in descriptions.items():
        for desc in desc_list:
            lines.append(key + ' ' + desc)
    # The context manager closes the handle even when write() raises,
    # which a manual open()/close() pair does not guarantee.
    with open(filename, 'w') as file:
        file.write('\n'.join(lines))
def main():
    """Run the pipeline: load, parse, clean, summarize and save descriptions."""
    token_path = 'data/Flickr8k_text/Flickr8k.token.txt'

    # Read the raw token file and group its descriptions by image ID.
    captions = load_descriptions(load_doc(token_path))
    image_count = len(captions)
    print('Loaded:\t' + str(image_count))

    # Normalize the description text in place.
    clean_descriptions(captions)

    # Report how many distinct words remain after cleaning.
    vocab = to_vocabulary(captions)
    print('Vocabulary Size:\t' + str(len(vocab)))

    # Persist the cleaned descriptions.
    save_descriptions(captions, 'data/descriptions.txt')
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()