-
Notifications
You must be signed in to change notification settings - Fork 3
/
prune-long-lines.py
27 lines (24 loc) · 958 Bytes
/
prune-long-lines.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
import re
import time
import io
import sys
import argparse
from collections import defaultdict
def prune_long_lines(input_filename, output_filename, max_tokens,
                     input_encoding='utf8', output_encoding='utf8'):
    """Copy lines with at most ``max_tokens`` tokens to the output file.

    A token is a whitespace-delimited chunk as produced by ``str.split()``,
    so an empty or blank line has zero tokens and is always kept.

    Args:
        input_filename: path of the text file to read.
        output_filename: path of the text file to (over)write.
        max_tokens: a line is kept when its token count is <= this value.
        input_encoding: encoding used to decode the input file.
        output_encoding: encoding used to encode the output file.

    Returns:
        The number of lines that were pruned (not written to the output).
    """
    pruned = 0
    # Open the input first so that a missing/unreadable input does not
    # truncate an existing output file; both handles are closed on any exit.
    with io.open(input_filename, encoding=input_encoding, mode='r') as src, \
            io.open(output_filename, encoding=output_encoding, mode='w') as dst:
        for line in src:
            if len(line.split()) <= max_tokens:
                dst.write(line)
            else:
                pruned += 1
    return pruned


def main(argv=None):
    """Parse command-line arguments, prune the file and report the count.

    Args:
        argv: argument list to parse; ``None`` means use ``sys.argv[1:]``.
    """
    # parse/validate arguments
    argParser = argparse.ArgumentParser()
    # type=int lets argparse reject non-numeric values with a usage error
    # before any file is touched; required=True replaces the TypeError that
    # int(None) / io.open(None) used to raise when an option was omitted.
    argParser.add_argument("-tokens", type=int, required=True, help="prune line if it has more than this many tokens")
    argParser.add_argument("-in", "--input_filename", type=str, required=True, help="input filename")
    argParser.add_argument("-out", "--output_filename", type=str, required=True, help="output filename")
    argParser.add_argument("-ie", "--input_encoding", type=str, default='utf8')
    argParser.add_argument("-oe", "--output_encoding", type=str, default='utf8')
    args = argParser.parse_args(argv)

    counter = prune_long_lines(
        args.input_filename,
        args.output_filename,
        args.tokens,
        input_encoding=args.input_encoding,
        output_encoding=args.output_encoding,
    )
    # Single-argument parenthesised form prints identically on Python 2 and 3.
    print('{0} lines pruned out'.format(counter))


if __name__ == '__main__':
    main()