-
Notifications
You must be signed in to change notification settings - Fork 3
/
extract-wiktionary-translations.py
90 lines (78 loc) · 3.61 KB
/
extract-wiktionary-translations.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import gzip
import re
import time
import io
import sys
import argparse
from collections import defaultdict
from iso_639_1_codes import iso_639_1_name2code, iso_639_1_code2name
def extract_dictionary(xml_dump_filename, languages, allow_phrases, name2code=None):
    """Extract translation pairs from an uncompressed Wiktionary XML dump.

    The dump is scanned line by line with a small state machine:

    * a ``<title>...</title>`` line starts a new entry (headword);
    * a level-2 heading such as ``==English==`` sets the entry's language;
    * a ``Translations`` heading of any level starts a translation section;
    * bullet lines ``* <Language name>: {{t|<code>|<word>|...}}, ...`` inside
      that section yield translation pairs;
    * any other heading ends the translation section.

    Args:
        xml_dump_filename: Path of the UTF-8 encoded, uncompressed dump.
        languages: Language codes of interest. Accepted for interface
            compatibility; it is not currently used to filter the output.
        allow_phrases: When false, pairs in which either side contains a
            space are skipped.
        name2code: Optional mapping from a language name (as written in the
            dump) to its code. Defaults to the module-level
            ``iso_639_1_name2code``. The mapping is only read, never modified.

    Returns:
        A list of ``(u'<src code>:<headword>', u'<tgt code>:<translation>')``
        tuples in the order they appear in the dump.
    """
    if name2code is None:
        name2code = iso_639_1_name2code

    translations = []
    current_word = None
    current_language = None
    is_reading_translations = False
    lines_read = 0

    # The context manager guarantees the dump is closed even on errors.
    with io.open(xml_dump_filename, encoding='utf8') as xml_dump_file:
        for line in xml_dump_file:
            # let the user know what's going on.
            if lines_read % 100000 == 0:
                sys.stdout.write('\r{} lines read'.format(lines_read))
            lines_read += 1
            line = line.strip()

            # process new word
            if line.startswith('<title>'):
                current_word = line[len('<title>'):-len('</title>')]
                # A new page must not inherit the previous page's language or
                # an unterminated translation section.
                current_language = None
                is_reading_translations = False

            # process current language (level-2 heading, e.g. "==English==")
            elif line.endswith('==') and len(line) > 3 and line[-3] != '=':
                language_name = line[line.find('==') + 2:-2].strip()
                # Unknown names map to '' so their translations are skipped.
                current_language = name2code.get(language_name, '')

            # start reading translations; the heading may be at any level
            elif line.startswith('==') and line.strip('=').strip() == 'Translations':
                is_reading_translations = True

            # read a translation (potentially)
            elif is_reading_translations and line.startswith('* '):
                # read the target language
                colon_index = line.find(':')
                if colon_index == -1:
                    continue
                other_language = name2code.get(line[2:colon_index], '')
                if current_word is None or not current_language or not other_language:
                    continue

                # read the translations
                for translation in line[colon_index + 1:].split(','):
                    translation = translation.strip()
                    # only bare templates of the form {{...}} are considered
                    if translation[:2] != '{{' or translation[-2:] != '}}':
                        continue
                    splits = translation[2:-2].split('|')
                    # the template's language code must match the bullet's language
                    if len(splits) < 3 or splits[1] != other_language:
                        continue
                    translated_word = splits[2]
                    # skip phrases if not allowed
                    if not allow_phrases and (' ' in current_word or ' ' in translated_word):
                        continue
                    translations.append((
                        u'{}:{}'.format(current_language, current_word),
                        u'{}:{}'.format(other_language, translated_word),
                    ))

            # stop reading translations
            elif is_reading_translations and line.startswith('='):
                is_reading_translations = False

    # processed all lines in the xml file; terminate the progress line
    sys.stdout.write('\r{} lines read\n'.format(lines_read))
    return translations
def main(argv):
    """Command-line entry point: extract translation pairs and write them out.

    Parses the options in ``argv`` (``argv[0]`` is the program name), runs
    ``extract_dictionary`` on the input dump, writes one ``src ||| tgt`` line
    per pair to the output file (UTF-8), and prints how many pair sides were
    seen for each language code.

    Args:
        argv: Full argument vector, e.g. ``sys.argv``.
    """
    # parse/validate arguments
    argparser = argparse.ArgumentParser()
    argparser.add_argument("-i", "--input_dump", required=True,
                           help='English wiktionary articles dump file, uncompressed')
    argparser.add_argument("-o", "--output_dictionary", required=True,
                           help='A multilingual dictionary of word pairs')
    argparser.add_argument("-p", "--allow_phrases", action='store_true', help="Allow phrases")
    argparser.add_argument("-l", "--languages",
                           help='Comma-separated list of ISO 639-1 codes of languages of interest')
    # Parse the vector we were handed rather than implicitly reading sys.argv.
    args = argparser.parse_args(argv[1:])

    # -l is optional; without it there is no list to split.
    languages = args.languages.split(',') if args.languages else []
    multilingual_dictionary = extract_dictionary(args.input_dump, languages, args.allow_phrases)

    language2count = defaultdict(int)
    with io.open(args.output_dictionary, encoding='utf8', mode='w') as multilingual_dictionary_file:
        for (word1, word2) in multilingual_dictionary:
            # Each side looks like "<code>:<word>"; tally by the code prefix.
            language2count[word1.partition(':')[0]] += 1
            language2count[word2.partition(':')[0]] += 1
            multilingual_dictionary_file.write(u'{} ||| {}\n'.format(word1, word2))

    # Sorted so the report is deterministic; the single-argument print call
    # behaves the same on Python 2 and Python 3.
    for language, count in sorted(language2count.items()):
        print(u'{} {}'.format(language, count))
# Run the extractor only when executed as a script, not when imported.
if __name__ == "__main__":
    main(sys.argv)