-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathadd_sample_sentences.rb
141 lines (118 loc) · 5.17 KB
/
add_sample_sentences.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# This script adds a sample sentence and its translation to each of your cards.
#
# Script is part of Anki Utils by Judith Meyer aka Sprachprofi
# https://github.com/Sprachprofi/anki_utils
require File.join(File.dirname(__FILE__), 'inc_tatoeba_cutdown')
# --- Collect the user's input -------------------------------------------------
# Sets: filename (existing card file), new_filename (output path),
# target_lang / source_lang (Tatoeba language codes; source_lang may be "").
puts "Welcome! This utility will let you add lots of sample sentences to your exported tab-separated flashcards."
puts "Which file contains your cards? The first column must be in your target language."
# gets returns nil at end-of-input; to_s turns that into "" instead of a NoMethodError.
filename = gets.to_s.chomp
# File.exists? was removed in Ruby 3.2; File.file? also rejects directories,
# which could not be read as a card list anyway.
raise "Could not find your file" unless File.file?(filename)
# Insert the suffix in front of the extension. Built from File.extname so that a
# name without an extension still yields a *different* output path (otherwise the
# result file would overwrite the input file).
extension = File.extname(filename)
new_filename = filename[0, filename.length - extension.length] + "_with_sample_sentences" + extension
puts "What is the Tatoeba code for your target language? E. g. 'eng', 'fra', 'deu', 'spa' ..."
target_lang = gets.to_s.chomp
puts "Do you want the translations of the sample sentences as well? If so, enter the appropriate Tatoeba code for the language to translate them to (e. g. 'eng' for English). Leave blank if you don't want a translation."
source_lang = gets.to_s.chomp
# Check for trimmed down versions of the Tatoeba database, one file per language.
# cutdown comes from inc_tatoeba_cutdown (not visible here); presumably it writes
# tatoeba_sentences_<code>.txt and returns the number of sentences found -- confirm.
# The two languages are checked independently: with an elsif, a missing target
# file would prevent the translation-language file from ever being generated.
unless File.exist?("tatoeba_sentences_#{target_lang}.txt")
  number = cutdown(target_lang)
  raise "Are you sure this is the right code for your target language? There are only #{number} sentences for it. If it is correct, try downloading a more recent Tatoeba export into this folder." if number < 100
end
if source_lang != "" && !File.exist?("tatoeba_sentences_#{source_lang}.txt")
  number = cutdown(source_lang)
  raise "Are you sure this is the right code for your translation language? There are only #{number} sentences for it. If it is correct, try downloading a more recent Tatoeba export into this folder." if number < 100
end
puts "Preparing the sentence data..."
# read in Tatoeba's sentence links and put into memory as a Hash:
#   key   = first sentence id, kept as a String exactly as read from the file
#   value = Array of linked sentence ids as Integers
# Missing keys keep returning nil (no default block), so lookups never grow the Hash.
links = {}
# Block form guarantees the file is closed even if reading raises.
File.open('tatoeba_links.csv', 'r:ascii') do |f_links|
  f_links.each_line do |line|
    line.strip!
    next if line.empty?
    id1, id2 = line.split("\t")
    next unless id2 # malformed line without a second column; would otherwise link to id 0
    (links[id1] ||= []) << id2.to_i
  end
end
# read in Tatoeba's target-language sentences and save as Array
# (index = sentence id, value = sentence text; unused ids stay nil)
finnish_sentences = []
# Text of the first sentence in the file, or nil if the file has none.
# It is captured inside the loop: calling f.first before each_line would consume
# the first line and that sentence would never be stored in the Array.
first_sentence = nil
# Block form guarantees the file is closed even if reading raises.
File.open("tatoeba_sentences_#{target_lang}.txt", "r:utf-8") do |f_fi|
  f_fi.each_line do |line|
    line.strip!
    next if line.empty?
    id, _lang, sentence = line.split("\t") # columns: id, language code, sentence
    finnish_sentences[id.to_i] = sentence
    first_sentence ||= sentence
  end
end
# read in Tatoeba's source-language sentences and save as Array
# (index = sentence id, value = sentence text). Stays empty when the user
# did not ask for translations (blank source_lang).
english_sentences = []
if source_lang != ""
  # Block form guarantees the file is closed even if reading raises.
  File.open("tatoeba_sentences_#{source_lang}.txt", "r:utf-8") do |f_en|
    f_en.each_line do |line|
      line.strip!
      next if line.empty?
      id, _lang, sentence = line.split("\t") # columns: id, language code, sentence
      english_sentences[id.to_i] = sentence
    end
  end
end
# Detect whether the target language puts spaces between words.
# space is " " for such languages (used below to match whole words and to enable
# the looser fallback searches) and "" for languages written without spaces.
# The sample is the first target-language sentence; to_s covers an empty file (nil).
space = first_sentence.to_s.include?(" ") ? " " : ""
puts "Finding sample sentences for your words..."
# Source file with the words needing sample sentences. It is read completely
# before the target file is opened, and as UTF-8 so its lines can be combined
# with the UTF-8 sentence data regardless of the platform's default encoding.
f_original = File.read(filename, encoding: 'utf-8')
# Block form guarantees the target file is flushed and closed even on errors.
File.open(new_filename, 'w:utf-8') do |f_result|
  f_original.each_line do |line| # go over each line of the original word list
    line.strip!
    next if line.empty? # blank lines are not copied to the result
    word, _rest = line.split("\t", 2) # the first column is the word to look up
    next if word.nil? || word.empty?

    # Collect up to 10 ids of sentences containing the word. Matching is a plain
    # substring test: the word is card data, not a regular expression, so
    # characters like "(", "?" or "*" must be taken literally.
    finnish_ids = []
    collect_matches = lambda do |needle|
      finnish_sentences.each_with_index do |sentence, i|
        break if finnish_ids.size > 9
        next unless sentence && sentence.include?(needle)
        # later, looser passes also hit sentences found earlier; keep each id once
        finnish_ids << i unless finnish_ids.include?(i)
      end
    end

    collect_matches.call(space + word + space) # try full word
    if space == " "
      # otherwise try the word followed by any ending (important for European languages)
      collect_matches.call(space + word) if finnish_ids.size < 9
      # otherwise try with a word that may have a prefix and suffix
      # (important for African languages, Indonesian etc.)
      collect_matches.call(word) if finnish_ids.size < 9
    end

    # now check for translations: pick the first candidate that has one
    chosen_fi_sentence = nil
    chosen_en_sentence = nil
    if source_lang != ""
      finnish_ids.each do |fi_id|
        # these are possible translations, but not necessarily to our source
        # language; many won't exist (links is keyed by the id as a String)
        possible_translation_ids = links[fi_id.to_s]
        next unless possible_translation_ids
        en_id = possible_translation_ids.find { |id| english_sentences[id] }
        next unless en_id
        chosen_fi_sentence = finnish_sentences[fi_id]
        chosen_en_sentence = english_sentences[en_id]
        break
      end
    end
    # otherwise use a target-language sentence without translation, or nothing at all
    chosen_fi_sentence ||= finnish_sentences[finnish_ids.first] unless finnish_ids.empty?
    chosen_fi_sentence ||= ""
    chosen_en_sentence ||= ""
    puts "Example for #{word}: #{chosen_fi_sentence}"
    # save all previous information plus the new sample sentence and translation
    f_result.puts("#{line}\t#{chosen_fi_sentence}\t#{chosen_en_sentence}")
  end
end
puts "Done! Your flashcards with sample sentences have been saved in #{new_filename}."