load_dataset.py
import json
import sys
import re
import gzip
from functools import reduce

# Regular expressions for MediaWiki markup: [[internal links]],
# [external links with display text], {{template transclusions}} and HTML tags.
internal_links_re = re.compile(r"\[\[([^\]]+)\]\]")
external_links_re = re.compile(r"\[([a-z]+:[^\]\s]+)\s+([^\]]+)\]")
cross_links_re = re.compile(r"\{\{([^\}]+)\}\}")
html_tags_re = re.compile(r"<[^>]+>")

# Pages whose titles start with one of these namespace prefixes are skipped.
filter_title_re = re.compile(r"^(File:|User:|Talk:|User talk:|Template:|User blog comment:|"
                             r"Board Thread:|File talk:|Thread:|User blog:|Template talk:|"
                             r"Category:|Board:|Quiz:|QuizArticle:|Category talk:|Forum:|"
                             r"MediaWiki:|MediaWiki talk:|Kategorie:|Datei:|Wikipedia:)")

def load_wikidata(filename):
    # Dispatch to the gzip or plain-text loader based on the file extension.
    if filename.endswith(".gz"):
        return load_wikidata_gzip(filename)
    else:
        return load_wikidata_normal(filename)

def load_wikidata_normal(filename):
    # Stream JSON-lines documents from a plain-text dump, skipping filtered titles.
    with open(filename, "rt", encoding="UTF-8") as f:
        for line in f:
            doc = json.loads(line)
            if not filter_title(doc["title"]):
                doc["byteOffset"] = 0
                yield doc


def load_wikidata_gzip(filename):
    # Same as load_wikidata_normal, but reads a gzip-compressed dump.
    with gzip.open(filename, "rt", encoding="UTF-8") as f:
        for line in f:
            doc = json.loads(line)
            if not filter_title(doc["title"]):
                doc["byteOffset"] = 0
                yield doc

def filter_title(title):
    # True if the title belongs to a non-article namespace and should be skipped.
    return bool(filter_title_re.match(title))


def find_internal_links(doc_text):
    # Yield the bracketed targets of all [[internal links]] in the text.
    return map(lambda a: a.group(1), internal_links_re.finditer(doc_text))


def multi_sub(re_list, replacement, subject):
    # Apply every regex in re_list to subject in turn, substituting replacement.
    return reduce(lambda r, a: re.sub(a, replacement, r), re_list, subject)


def clean_wikidata(doc_text):
    # Strip wiki markup: drop HTML tags and {{templates}}, keep the bracketed
    # target of internal links and the display text of external links.
    return \
        re.sub(external_links_re, lambda m: m.group(2),
               re.sub(internal_links_re, lambda m: m.group(1),
                      multi_sub([html_tags_re, cross_links_re], "", doc_text)))
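# Illustrative example on assumed sample markup (hypothetical input, not taken from any dump):
#   clean_wikidata("See [[Python (language)]] and [http://example.com Example] <b>now</b>.")
#   -> "See Python (language) and Example now."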

def load_wikidata_texts(filename):
    # Yield the cleaned plain text of every retained document in the dump.
    for doc in load_wikidata(filename):
        yield clean_wikidata(doc["text"])
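

# Minimal usage sketch, assuming a JSON-lines dump at the hypothetical path
# "dump.json.gz" where each line is a JSON object with "title" and "text" fields.
if __name__ == "__main__":
    for i, text in enumerate(load_wikidata_texts("dump.json.gz")):
        print(text[:200])
        if i >= 2:  # preview only the first three documents
            break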