-
Notifications
You must be signed in to change notification settings - Fork 0
/
classifyNL.py
110 lines (86 loc) · 3.01 KB
/
classifyNL.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import nltk
import json
from nltk.corpus import wordnet
from rake_nltk import Rake
def classify(word, categories):
#Clean up the download functions
#Should happen at a higher level
#try:
# nltk.data.find('tokenizers/punkt')
# nltk.data.find('tokenizers/wordnet')
# nltk.data.find('tokenizers/averaged_perceptron_tagger')
#except LookupError:
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')
category_list = []
mapping = {}
#Iterate keys and build synonym dictionary
#track which category each synonym is for
for k in categories.keys():
syn_k = wordnet.synsets(k, pos=wordnet.NOUN)
#category_list.append(syn_k[0])
#mapping[syn_k[0].name()] = k
for syn in syn_k:
ii = 0
while ii < 3:
category_list.append(syn)
mapping[syn.name()] = k
ii += 1
res = categories
#create new dict to store count of appeared synonyms
synCount = {}
#initialize dict to 0
for key in categories.keys():
synCount[key] = 0
#Calculate synonyms for word
wordsets = wordnet.synsets(word, pos=wordnet.NOUN)
#print(synCount)
if len(wordsets):
for synset in wordsets:
#print("currently working with synset", synset.examples())
for c in category_list:
name = mapping[c.name()]
similarity = synset.path_similarity(c, simulate_root=False)
if similarity != None and similarity > 0.15:
synCount[name] += 1.0
res[name] += similarity / synCount[name]
for key in synCount.keys():
#print( res.get(key, 0), key, synCount[key])
# res[key] = res.get(key, 0) / synCount[key]
synCount[key] = 0
return res
def normalizeScore(categories):
total = 0
for key, value in categories.items():
total += value
if total == 0:
return categories
for keyword, value in categories.items():
categories[keyword] = value / total
return categories
categories = {
'food' : 0.0,
'entertainment' : 0.0,
'society' : 0.0,
'travel' : 0.0,
'dining' : 0.0
}
score = {}
for key in categories.keys():
score[key] = 0.0
text = """
The UK is among the world’s “best-value destinations” to visit in 2018, according to travel guide publisher Lonely Planet, which has released its annual Best in Travel hotlist.
Chosen by its editors, authors and a network of travel experts around the world, the list highlights the top 10 countries, cities, regions and best-value destinations for travellers to visit in the year ahead.
Pointing to the fact that the pound has weakened against “pretty much all currencies” since the Brexit referendum, the UK has now been ranked seventh on the best-value list, along with Poland and La Paz, Bolivia. The number one spot in this category goes to Tallinn, capital of Estonia – a long-time budget city break favourite.
"""
r = Rake()
r.extract_keywords_from_text(text)
keywords = r.get_ranked_phrases_with_scores()
for value, keyword in keywords:
scores = classify(keyword, categories)
for keyword in scores.keys():
score[keyword] = scores[keyword]
print("Scores for", text)
score = normalizeScore(score)
print(score)