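"""Download and process Zohar articles from kabbalahmedia.info.

Pipeline: download each article in the source and target languages,
preprocess the documents into chunks, optionally split long letters (Ot)
with a heuristic, validate the alignment, and write an HTML summary per
article.

Typical invocation (all arguments have defaults):
    python zohar_process_root.py --source he --target en
"""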
import requests
import re
import argparse
import sys
import os
import shutil
from datetime import datetime
from p_tqdm import p_map
from zohar_download_article import download, MissingLanguage
from zohar_preprocess_file import process, Options
from zohar_split_heuristic import split_and_save
from zohar_post_processing import combine_discard_non_matching
from zohar_create_summary import save_summary
SAMPLE_URL = 'https://kabbalahmedia.info/he/sources/yUcfylRm'
LINK_REGEX = re.compile(r'div id="title-[A-Za-z0-9]+"')
SAMPLES_DEST = {'yUcfylRm': 'zohar'}
def sources_list(base=SAMPLE_URL):
    """Extract the list of article ids from a kabbalahmedia.info sources page."""
    src = requests.get(base).text
    links = LINK_REGEX.findall(src)
    return list({link[len('div id="title-'):-1] for link in links})
def utctime():
    """Return the current UTC time as a filename-safe string."""
    return datetime.utcnow().strftime("%Y_%m_%d_%H_%M_%S_%f")
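# Worker for p_map: returns (paths, filename, base, src) on success, None on failure.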
def download_function(src_and_args):
    src, args = src_and_args
    dest_folder = f'{args.dest}_{args.source}_{args.target}'
    langs = (args.source, args.target)
    try:
        paths, filename, base = download(src, dest_folder, langs)
        return paths, filename, base, src
    except MissingLanguage:
        return
    except Exception as e:
        print('Failed downloading', src, e, file=sys.stderr)
        return
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--root", default=SAMPLE_URL, help="root tree url")
    parser.add_argument("--dest", default='zohar', help="destination directory")
    parser.add_argument("--source", default='he', help="source language")
    parser.add_argument("--target", default='en', help="target language")
    parser.add_argument("--skip-process", dest='skip', action='store_true', help="skip processing (only download)")
    parser.add_argument("--no-skip-process", dest='skip', action='store_false', help="do not skip processing (default)")
    parser.add_argument("--chunk", choices=['paragraphs', 'sentences', 'chars', 'joined'], default='paragraphs')
    parser.add_argument("--n_chars_tgt", type=int, default=6000, help='number of chars in the target phrase')
    parser.add_argument("--n_chars_src", type=int, default=6000, help='number of chars in the source phrase')
    parser.add_argument("--split_ratio", type=float, default=3,
                        help='only keep sentences that abide by this ratio; pass 0 to disable')
    parser.add_argument("--discard-non-matching",
                        help='discard letters (Ot) whose number of chunks differs between Hebrew and English in the split heuristic',
                        action='store_true', dest='strict')
    parser.add_argument("--no-discard-non-matching",
                        help='do not discard letters (Ot) whose number of chunks differs between Hebrew and English in the split heuristic',
                        action='store_false', dest='strict')
    parser.add_argument("--no-combine-letters", help='do not combine letters (Ot) up to the words threshold',
                        action='store_false', dest='combine_letters')
    parser.add_argument("--words_threshold", type=int, default=1500,
                        help="number of words below which the Ot is not split (pass 0 to skip the split heuristic)")
    parser.add_argument("--split_extension", help="extension of split files", default='.split.txt')
    parser.add_argument("--min_ratio", type=float, default=0.33, help="minimum target/source ratio")
    parser.add_argument("--max_ratio", type=float, default=3, help="maximum target/source ratio")
    parser.add_argument("--summary_name", help="html summary file name", default="summary.html")
    for root, dest in SAMPLES_DEST.items():
        parser.set_defaults(skip=False)
        parser.set_defaults(strict=True)
        parser.set_defaults(combine_letters=False)
        parser.set_defaults(root='https://kabbalahmedia.info/he/sources/' + root, dest=dest)
    total_discarded, total_kept, total_letters_processed, total_letters = 0, 0, 0, 0
    args = parser.parse_args()
    sources = sources_list(args.root)
    langs = (args.source, args.target)
    block = 'en' in langs
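    # Article ids excluded whenever English is one of the requested languages.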
    block_list = ['F2LYqFgK', 'lgUtBujx']
    block_list += ['RgsXf1yr', 'l0iKNgat', 'ejxH6XIB', '0Tw5KgrU', 'F8yCmcC4', 'oFDvZarW']  # volume 5
    block_list += ['ltB6B56p', '6fnqGEAs', 'k61MXHA7', 'YT80k4ZD', 'UeIJ3Zum', 'xMgjmMW9',
                   'Ct6vwpOH', 'sL6hDYEe', 'Zz9SZ56R', 'kSvUHrJi']  # volume 6
    block_list += ['rZ0spEg7', 'NAGDvDUF', 'bVA7qjnZ', 'fccexLnt', '2hXS7TB6', 'OgUwF0Cw',
                   'Ot5gt7tC', 'f72ESoE9']  # volume 7
    # total 308
    dest_folder = f'{args.dest}_{args.source}_{args.target}'
    shutil.rmtree(dest_folder, ignore_errors=True)
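    # Merge in the six locally supplied dad_book_5 articles (he.docx / en.docx pairs).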
    dad = os.listdir('dad_book_5/')
    assert len(dad) == 6
    for f in dad:
        shutil.copytree(f'dad_book_5/{f}', f'{dest_folder}/{f}')
    if block:
        sources = [s for s in sources if s not in block_list]
    all_res = p_map(download_function, [(s, args) for s in sources])
    all_res = [a for a in all_res if a is not None]
    dad_res = []
    for f in dad:
        paths = [('he', f'{dest_folder}/{f}/he.docx'), ('en', f'{dest_folder}/{f}/en.docx')]
        filename = f'eng_{f}.docx'
        base = f'{dest_folder}/{f}'
        src = f
        dad_res.append((paths, filename, base, src))
    all_res += dad_res
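    # Process each article: preprocess, optionally split, validate, and summarize.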
    for paths, filename, base, src in all_res:
        if args.skip:
            continue
        lang_paths = dict(paths)
        tgt_path = lang_paths[args.target]
        src_path = lang_paths[args.source]
        ts = utctime()
        postfix = '.' + ts + '.txt'
        opts = [Options(tgt_path, args.chunk, args.target, args.n_chars_tgt),
                Options(src_path, args.chunk, args.source, args.n_chars_src)]
        process(opts, postfix)
        tgt_split = tgt_path + '.' + src + args.split_extension
        src_split = src_path + '.' + src + args.split_extension
        tgt_path += postfix
        src_path += postfix
        sep = '\n'
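        # Split long letters (Ot) into aligned chunks when a words threshold is set.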
        if args.words_threshold:
            atomic_line = args.chunk != 'joined'
            letters_processed, n_letters = split_and_save(tgt_path, src_path, langs,
                                                          args.words_threshold, atomic_line,
                                                          tgt_split, src_split, args.split_ratio)
            total_letters_processed += letters_processed
            total_letters += n_letters
            tgt_path = tgt_split
            src_path = src_split
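        # In strict mode, drop letters whose chunk counts differ between the two languages.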
        if args.strict:
            discarded, kept = combine_discard_non_matching(tgt_path, src_path, langs, sep,
                                                           args.words_threshold, args.combine_letters)
            total_discarded += discarded
            total_kept += kept
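        # Write the per-article HTML summary.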
        summary = os.path.join(base, args.summary_name)
        with open(summary, 'w', encoding='utf-8') as f:
            save_summary(tgt_path, src_path, langs,
                         sep, args.min_ratio, args.max_ratio, filename, ts, f)
    if args.strict:
        print('# Letters (Ot) processed, total letters:',
              total_letters_processed, total_letters,
              total_letters_processed / max(total_letters, 1))
        print('Total chunks kept, discarded during validation:', total_kept, total_discarded)
if __name__ == "__main__":
    main()