import argparse
from pathlib import Path
import csv
import json
from typing import List, Dict, Iterable
import re

from test_collection import check_file


def read_comment_txt_file(txt_file_path: str) -> Iterable[str]:
    '''
    Given a text file path it will yield all lines in the text file that are
    not comment lines.

    A comment is any line that starts with a `#` and contains no tab (`\t`)
    characters within the comment line.

    # Parameters

    txt_file_path : `str`
        File path to the text file.

    # Returns

    `Iterable[str]`

    # Raises

    ValueError
        If a comment line contains a tab (`\t`).
    '''
    comment_pattern = re.compile(r'#.*')
    tab_pattern = re.compile(r'\t')
    with Path(txt_file_path).open('r', newline='', encoding='utf-8-sig') as text_file:
        for line_number, line in enumerate(text_file, start=1):
            # Comment lines are those that start with a `#` and contain no tab.
            if comment_pattern.match(line):
                if tab_pattern.search(line) is None:
                    continue
                comment_error = ('\n\nA comment line cannot contain a tab.\n\n'
                                 f'This occurred on line number: {line_number}\n\n'
                                 f'In the following file: {txt_file_path}\n\n'
                                 f'The line contains the following: {line}\n\n')
                comment_error = '\n' + ('-' * 50) + comment_error + ('-' * 50)
                raise ValueError(comment_error)
            # Strip any Byte Order Mark that survived the `utf-8-sig` decoding
            # and skip blank lines.
            line = line.replace('\ufeff', '')
            if line.strip():
                yield line
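
# Illustrative usage sketch (not executed by this script; the lexicon file
# name below is an assumption for the example only):
#
#     for data_line in read_comment_txt_file('pt/semantic_lexicon_pt.txt'):
#         print(data_line, end='')
#
# Each yielded line keeps its trailing newline, hence `end=''` when printing.
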
def txt_to_tsv(txt_file_path: str, tsv_file_path: str) -> None:
    '''
    Converts the text file into a TSV file. This conversion will remove all
    comments.

    A comment is any line that starts with a `#` and contains no tab (`\t`)
    characters within the comment line.

    # Parameters

    txt_file_path : `str`
        File path to the text file.
    tsv_file_path : `str`
        File path to the TSV file.
    '''
    # The first non-comment line of the text file provides the field names.
    txt_tsv_reader = csv.DictReader(read_comment_txt_file(txt_file_path),
                                    delimiter='\t')
    with Path(tsv_file_path).open('w', newline='', encoding='utf-8') as tsv_file:
        tsv_writer = csv.DictWriter(tsv_file,
                                    fieldnames=txt_tsv_reader.fieldnames,
                                    delimiter='\t')
        tsv_writer.writeheader()
        for row in txt_tsv_reader:
            tsv_writer.writerow(row)
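
# Illustrative usage sketch (file names are assumptions for the example only):
#
#     txt_to_tsv('pt/semantic_lexicon_pt.txt', 'pt/semantic_lexicon_pt.tsv')
#
# The resulting TSV file contains the header row followed by every
# non-comment data row from the text file.
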
if __name__ == '__main__':
    description = """
    This script does the following:

    1. Converts all lexicon files (single and MWE) from text file format to
       TSV. The lexicon files are found through the metadata file
       (language_resources.json).
    2. Checks that the TSV files are formatted correctly:
        1. The minimum header names exist,
        2. All fields/columns have a header name,
        3. All lines contain the minimum information, e.g. no comment lines
           exist in the middle of the file.
    """
    arguments = argparse.ArgumentParser(description=description,
                                        formatter_class=argparse.RawDescriptionHelpFormatter)
    arguments.parse_args()

    # The metadata file is expected to be in the same directory as this script.
    json_data = Path(__file__, '..', 'language_resources.json').resolve()
    valid_resource_types = {'single', 'mwe'}
    with json_data.open('r', encoding='utf-8') as json_fp:
        data = json.load(json_fp)
        for language_code, meta_data in data.items():
            resources: List[Dict[str, str]] = meta_data['resources']
            for resource in resources:
                resource_type = resource['data type']
                # Only the single word and MWE lexicon resources are converted.
                if resource_type not in valid_resource_types:
                    continue
                tsv_resource_file_path = Path(resource['file path']).resolve()
                txt_resource_file_path = tsv_resource_file_path.with_suffix('.txt')
                txt_to_tsv(txt_resource_file_path, tsv_resource_file_path)
                check_file(resource_type, tsv_resource_file_path)
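
# Sketch of the `language_resources.json` structure the block above relies on,
# inferred only from the keys it reads; the language code, file names and any
# additional fields are assumptions for illustration:
#
#     {
#         "pt": {
#             "resources": [
#                 {"data type": "single", "file path": "pt/semantic_lexicon_pt.tsv"},
#                 {"data type": "mwe", "file path": "pt/mwe_lexicon_pt.tsv"}
#             ]
#         }
#     }
#
# For each `single` or `mwe` resource, the matching `.txt` file is converted to
# the listed `.tsv` file and then validated with `check_file`.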