-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgiven-names-bad-sitelinks.py
43 lines (37 loc) · 1.14 KB
/
given-names-bad-sitelinks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
from SPARQLWrapper import SPARQLWrapper, JSON
from urllib import parse
# Maximum number of mismatching sitelinks to report before the script stops.
limit = 500000

# SPARQL endpoint the query is sent to.
endpoint = "https://query.wikidata.org/bigdata/namespace/wdq/sparql"

# Selects every sitelink (?article) of each item whose P31 value is one of the
# classes listed in VALUES (presumably the given-name classes, going by the
# file name -- confirm against Wikidata), together with the item's English
# label as supplied by the label service.
query = """
SELECT ?item ?itemLabel ?article WHERE {
?item wdt:P31 ?type .
VALUES ?type {
wd:Q202444
wd:Q11879590
wd:Q18220911
wd:Q3409032
} .
?article schema:about ?item .
SERVICE wikibase:label { bd:serviceParam wikibase:language "en" }
}
"""

sparql = SPARQLWrapper(endpoint)
sparql.setReturnFormat(JSON)
sparql.setQuery(query)

# Execute the query; the rows are read from results["results"]["bindings"].
results = sparql.query().convert()
def _parse_sitelink(sitelink):
    """Split a sitelink URL into ``(lang, project, title)``.

    ``lang`` and ``project`` are the first two dot-separated labels of the
    host name.  ``title`` is the percent-decoded page name with underscores
    turned into spaces and any trailing `` (...)`` disambiguator removed, so
    ``https://en.wikipedia.org/wiki/John_(given_name)`` yields
    ``('en', 'wikipedia', 'John')``.

    Raises:
        IndexError: if the host name has fewer than two labels.
    """
    url = parse.urlsplit(sitelink)
    host_labels = url.netloc.split('.')
    lang, project = host_labels[0], host_labels[1]

    # Take everything after "/wiki/" so that titles containing "/" are not
    # truncated; fall back to the last path segment for any other URL shape.
    _, marker, page = url.path.partition('/wiki/')
    if not marker:
        page = url.path.rsplit('/', 1)[-1]

    # Sitelink URLs write spaces as "_" (older ones as "%20"); normalise both
    # to real spaces *before* stripping the " (disambiguator)" suffix,
    # otherwise the suffix is never found and multi-word titles never match
    # their labels.
    title = parse.unquote(page).replace('_', ' ').split(' (')[0]
    return lang, project, title


# Emit one wikitext bullet per Wikipedia sitelink whose page title differs
# from the item's English label, stopping once `limit` bullets were printed.
counter = 0
for result in results["results"]["bindings"]:
    # The item value is the entity URI; its last path segment is the Q-id.
    item = result['item']['value'].rsplit('/', 1)[-1]
    label = result['itemLabel']['value']
    sitelink = result['article']['value']
    lang, project, title = _parse_sitelink(sitelink)
    if project == 'wikipedia' and label != title:
        print("* {{{{Q|{}}}}} - ({}) [{} {}]".format(
            item,
            lang,
            sitelink,
            title))
        counter += 1
        # The counter only changes here, so this is the only place the
        # limit can newly be reached.
        if counter >= limit:
            break