-
Notifications
You must be signed in to change notification settings - Fork 79
/
spacy_example.py
45 lines (37 loc) · 1.46 KB
/
spacy_example.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
# sys.argv[1:] defines Wikipedia page titles
# This example measures WMDs from the first page to all the rest
from collections import Counter
import sys
import numpy
import spacy
import requests
from wmd import WMD
# Load English tokenizer, tagger, parser, NER and word vectors
print("loading spaCy")
nlp = spacy.load("en")
# List of page names we will fetch from Wikipedia and query for similarity
titles = sys.argv[1:] or ["Germany", "Spain", "Google"]
documents = {}
for title in titles:
print("fetching", title)
pages = requests.get(
"https://en.wikipedia.org/w/api.php?action=query&format=json&titles=%s"
"&prop=extracts&explaintext" % title).json()["query"]["pages"]
print("parsing", title)
text = nlp(next(iter(pages.values()))["extract"])
tokens = [t for t in text if t.is_alpha and not t.is_stop]
words = Counter(t.text for t in tokens)
orths = {t.text: t.orth for t in tokens}
sorted_words = sorted(words)
documents[title] = (title, [orths[t] for t in sorted_words],
numpy.array([words[t] for t in sorted_words],
dtype=numpy.float32))
# Hook in WMD
class SpacyEmbeddings(object):
def __getitem__(self, item):
return nlp.vocab[item].vector
calc = WMD(SpacyEmbeddings(), documents)
print("calculating")
# Germany shall be closer to Spain than to Google
for title, relevance in calc.nearest_neighbors(titles[0]):
print("%24s\t%s" % (title, relevance))