-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSOCC_cleanup.py
29 lines (26 loc) · 1.09 KB
/
SOCC_cleanup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
# ----------------------------------------------------------------
# Clean SOCC files.
#
# (C) 2020 Laurens Bosman, Discourse Processing Lab, SFU
# Released under GNU General Public License (GPL)
# email [email protected]
# ----------------------------------------------------------------
import os
import re
def main(corpusFolder):
    """Write a cleaned copy of every file in corpusFolder to ./SOCC_clean/.

    Each input file is read line by line and the following artifacts are
    deleted from every line, in this order:

    * angle-bracket tags enclosing one to three characters (e.g. ``<p>``,
      ``<br>``, ``</p>``),
    * runs of 4 to 100 underscores,
    * runs of 4 to 100 hyphens,
    * runs of 4 to 100 tildes.

    The cleaned text is written to a file of the same name inside
    ``./SOCC_clean/`` (relative to the current working directory).

    Args:
        corpusFolder: Path to the folder holding the corpus files. A
            trailing path separator is optional.

    Raises:
        OSError: If corpusFolder cannot be listed, or a file cannot be
            read or written.
    """
    outputFolder = './SOCC_clean/'
    # exist_ok: a second run must not abort just because the output
    # folder is already there.
    os.makedirs(outputFolder, exist_ok=True)

    # Compiled once up front; applied sequentially (not as one alternation)
    # because removing one artifact can join the pieces of another.
    artifactPatterns = (
        re.compile(r'<.{1,3}>'),
        re.compile(r'_{4,100}'),
        re.compile(r'-{4,100}'),
        re.compile(r'~{4,100}'),
    )

    for fileName in os.listdir(corpusFolder):
        # os.path.join works whether or not corpusFolder ends in a separator.
        sourcePath = os.path.join(corpusFolder, fileName)
        if not os.path.isfile(sourcePath):
            # Sub-directories (including the output folder itself when the
            # corpus is the working directory) cannot be opened as text.
            continue
        # Open the input first so an unreadable source does not leave an
        # empty output file behind.
        with open(sourcePath, 'r') as inputFile, \
                open(os.path.join(outputFolder, fileName), 'w') as output:
            for line in inputFile:
                for pattern in artifactPatterns:
                    line = pattern.sub('', line)
                output.write(line)
if __name__ == '__main__':
    # Command-line entry point: takes the corpus folder as its single
    # positional argument and hands it to main().
    import argparse

    cli = argparse.ArgumentParser(
        description='remove artifacts from data collection process',
    )
    cli.add_argument(
        'corpusPath',
        type=str,
        help='the path to the corpus folder',
    )
    parsed = cli.parse_args()
    main(parsed.corpusPath)