-
Notifications
You must be signed in to change notification settings - Fork 16
/
davinci.py
165 lines (128 loc) · 5.14 KB
/
davinci.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
import string
import time

# Sentinel key marking "a complete word ends here" inside a trie node dict.
TRIE_END = '__end'
# Word list of the ~10k most common English words, one word per line.
INPUT_COMMON_ENGLISH_WORDS = './google-10000-english-usa.txt'
# Common English trigrams (collocations), one "freq w1 w2 w3" record per line.
INPUT_TRIGRAMS = './trigrams.txt'
def build_trie(path=None, end_marker=None):
    """Build a letter trie from a word-list file.

    Each trie node is a plain dict mapping a single letter to a child node;
    a node containing ``end_marker`` marks the end of a complete word.

    Args:
        path: word-list file, one word per line; defaults to
            INPUT_COMMON_ENGLISH_WORDS for backward compatibility.
        end_marker: key used to flag word ends; defaults to TRIE_END.

    Returns:
        dict: the root node of the trie.
    """
    if path is None:
        path = INPUT_COMMON_ENGLISH_WORDS
    if end_marker is None:
        end_marker = TRIE_END
    with open(path, 'r') as english_words:
        # Iterate the file directly; skip blank lines.
        words = [line.strip() for line in english_words if line.strip()]
    root = {}
    for word in words:
        node = root
        for letter in word:
            # setdefault both creates and descends in one step.
            node = node.setdefault(letter, {})
        node[end_marker] = True
    print('Trie contains %s words' % len(words))
    return root
def build_english_dict(path=None):
    """Load a word list into a set for O(1) membership tests.

    Single-letter words are dropped (they add noise when validating
    anagram word breaks).

    Args:
        path: word-list file, one word per line; defaults to
            INPUT_COMMON_ENGLISH_WORDS for backward compatibility.

    Returns:
        set[str]: all words of length > 1.
    """
    if path is None:
        path = INPUT_COMMON_ENGLISH_WORDS
    with open(path, 'r') as english_words:
        # len(w) > 1 already implies the stripped line is non-empty.
        return set(w for w in (line.strip() for line in english_words)
                   if len(w) > 1)
def find_bigrams(input_list):
    """Return an iterator over consecutive element pairs of input_list."""
    return zip(input_list[:-1], input_list[1:])
def is_trie_prefix(trie, word):
    """Return True if word is a path (prefix of some word) in the trie.

    Blank or whitespace-only input is rejected outright.
    """
    if not word or not word.strip():
        return False
    node = trie
    for ch in word:
        # Missing edge means no stored word starts with this prefix.
        if ch not in node:
            return False
        node = node[ch]
    return True
# Module-level lookup structures shared by recurse(); both read the word
# list from disk once at import time.
TRIE = build_trie()
ENGLISH_DICT = build_english_dict()
# Counts branches pruned during the search; reported by the driver below.
skipped_recursion = 0
def recurse(input_letters, current_permu, solution, final_len, word_end_index):
    """Depth-first search over permutations of input_letters, pruning
    branches that cannot form valid English words.

    Args:
        input_letters: remaining characters to place (mutated in place and
            restored on backtrack).
        current_permu: characters placed so far (mutated in place and
            restored on backtrack).
        solution: set collecting completed valid sentences (mutated in place).
        final_len: length of the original anagram; a permutation is complete
            when current_permu reaches it.
        word_end_index: index into current_permu where the current
            (in-progress) word begins.
    """
    global skipped_recursion
    # Base case: permutation is complete.  Accept it only if the trailing
    # word is real English and the sentence has not been seen before.
    if len(current_permu) == final_len:
        last_word = ''.join(current_permu[word_end_index:])
        candidate_word = ''.join(current_permu)
        if last_word in ENGLISH_DICT and candidate_word not in solution:
            # set.add() mutates in place and returns None -- the original
            # rebound the local `solution` to None here, which only worked
            # because the function returned immediately afterwards.
            solution.add(candidate_word)
        return
    seen_at_start = set()  # dedupe repeated characters at this depth
    for idx in range(len(input_letters)):
        cur = input_letters.pop(idx)
        if cur in seen_at_start:
            # A branch starting with this same letter was already explored.
            skipped_recursion += 1
            input_letters.insert(idx, cur)
            continue
        seen_at_start.add(cur)
        current_permu.append(cur)
        if cur == ' ':
            # A space closes a word; slice off the trailing space itself and
            # only recurse if the finished word is real English.
            candidate_word = ''.join(current_permu[word_end_index:-1])
            if candidate_word in ENGLISH_DICT:
                # Next word starts after the space just placed.
                recurse(input_letters, current_permu, solution,
                        final_len, len(current_permu))
            else:
                skipped_recursion += 1
        else:
            # Mid-word: only recurse while the partial word is still a
            # valid prefix of some English word.
            if is_trie_prefix(TRIE, ''.join(current_permu[word_end_index:])):
                recurse(input_letters, current_permu, solution,
                        final_len, word_end_index)
            else:
                skipped_recursion += 1
        # Backtrack: restore both mutable arguments exactly.
        input_letters.insert(idx, cur)
        current_permu.pop()
def transform(s):
    """Lowercase s and remove all ASCII punctuation.

    Bug fix: the original called ``s.translate(None, string.punctuation)``,
    which is the Python-2-only str.translate signature and raises TypeError
    on Python 3 (where translate takes a single mapping table).  This
    filtering form behaves identically on both.
    """
    punctuation = set(string.punctuation)
    return ''.join(ch for ch in s if ch not in punctuation).lower()
# --- Driver: enumerate candidate anagram sentences --------------------------
start = time.time()
# orig_string = 'OH, LAME SAINT'
orig_string = 'O, DRACONIAN DEVIL'
# Normalize to lowercase letters/spaces, then explode into a mutable list of
# characters that recurse() can pop/insert while backtracking.
xformed_string = transform(orig_string)
xformed_string = list(xformed_string)
# NOTE(review): sorting was meant to aid dedupe but is disabled below; the
# per-depth `seen_at_start` set inside recurse() handles dedupe instead.
# orig = sorted(xformed_string)
final_anagram_len = len(xformed_string)
solution = set()  # recurse() adds valid sentences to this set in place
recurse(xformed_string, [], solution, final_anagram_len, 0)
print('After permuting the anagram %s has %s combinations' %
      (orig_string, len(solution)))
end = time.time()
print('Time elapsed is %s' % (end - start))
print('We skipped recursive calls %s times' % skipped_recursion)
# --- Filter: keep only sentences that are common English trigrams -----------
# Load a map of the most common English trigrams (collocations -- word
# sequences that are unusually frequent in English text).  We use it to cut
# the nonsense anagram combinations down to a sane number; the file holds
# just over 1M trigrams, about 17MB (that's fine).
common_ngrams_dict = {}
with open(INPUT_TRIGRAMS, 'r') as ngrams_data:
    for raw in ngrams_data.read().splitlines():
        freq, w1, w2, w3 = raw.strip().split()
        common_ngrams_dict[(w1, w2, w3)] = int(freq)
print('ngram dictionary contains %s words' % len(common_ngrams_dict))

# Keep only the anagram sentences whose three words form a known trigram.
filtered_solution = []
for anagram in solution:
    trigram = tuple(anagram.split())
    if trigram in common_ngrams_dict:
        filtered_solution.append(anagram)
print('After filtering anagrams without common trigrams, the anagram %s has %s combinations' %
      (orig_string, len(filtered_solution)))
for anagram in filtered_solution:
    print(anagram.upper())