Skip to content
This repository has been archived by the owner on Jul 7, 2021. It is now read-only.

Commit

Permalink
v0.69
Browse files Browse the repository at this point in the history
Sentences now keep their punctuation. Splitting is done with a lookbehind regex. Added
`default_seperator_class`
  • Loading branch information
0xferit committed Oct 20, 2015
1 parent 504c59e commit 5af09d7
Showing 1 changed file with 12 additions and 4 deletions.
16 changes: 12 additions & 4 deletions pipeline_caller.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

author_copyright = "\nCopyright 2015 Ferit Tunçer [email protected]"

version = 0.68
version = 0.69

import sys
import urllib.request
Expand All @@ -29,12 +29,14 @@
import re
import locale


#++ DEFAULTS
# File the API token is read from.
token_path = "pipeline.token"
# Endpoint every pipeline request is sent to.
api_url = "http://tools.nlp.itu.edu.tr/SimpleApi"
# Encoding preferred by the current locale; False means the user's locale
# settings are queried without being modified.
default_encoding = locale.getpreferredencoding(False)
# Encoding used for the request payload sent to the pipeline API.
pipeline_encoding = 'UTF-8'
default_output_dir = "pipeline_caller_output"
# Regex character class of sentence-ending punctuation (. ? : ; !).
# Raw string so that "\." and "\?" are not treated as (invalid) string escape
# sequences; the resulting value is identical to the non-raw spelling.
default_seperator_class = r"[\.\?:;!]"
#-- DEFAULTS

invalid_token_message = ""
Expand All @@ -56,7 +58,7 @@ def conditional_info(to_be_printed):
#-- Functions
def request(params):
try:
result = urllib.request.urlopen(api_url, params)
result = urllib.request.urlopen(api_url, params)
readed_result = result.read().decode("UTF-8")
if readed_result == invalid_token_message:
sys.exit(invalid_token_message)
Expand All @@ -69,6 +71,7 @@ def request(params):
return readed_result
except KeyboardInterrupt:
sys.exit("[FATAL]Terminated by keyboard interrupt.")
warning("{0}1".format(sys.exc_info()))
except:
raise

Expand Down Expand Up @@ -101,7 +104,8 @@ def readInput(path):
full_text = ""
for line in input_file:
full_text += line
sentences = full_text.split('.')
r = re.compile(r'(?<=(?:{}))\s+'.format(default_seperator_class))
sentences = r.split(full_text)
sentence_count = len(sentences)
if re.match("^\s*$", sentences[sentence_count-1]):
sentences.pop(sentence_count-1)
Expand Down Expand Up @@ -131,13 +135,17 @@ def process():
with open(output_path, 'w', encoding=args.encoding) as output_file:
if args.seperate == 0:
conditional_info("[INFO] Processing type: Batch")

params = urllib.parse.urlencode({'tool': args.tool, 'input': full_text, 'token': token}).encode(pipeline_encoding)

output_file.write("{0}\n".format(request(params)))
print("[DONE] It took {0} seconds to process {1} sentences".format(str(time.time()-start_time).split('.')[0], sentence_count))
else:
conditional_info("[INFO] Processing type: Sentence-by-sentence")
for sentence in sentences:

params = urllib.parse.urlencode({'tool': args.tool, 'input': sentence, 'token': token}).encode(pipeline_encoding)

output_file.write("{0}\n".format(request(params)))
conditional_info("[INFO] Processing {0}".format(sentence))
print("[DONE] It took {0} seconds to process all {1} sentences.".format(str(time.time()-start_time).split('.')[0], sentence_count))
Expand All @@ -155,7 +163,7 @@ def process():
conditional_info("[INFO] Pipeline tool: {}".format(args.tool))
process()
except:
warning("{0}".format(sys.exc_info()[1]))
warning("{0}".format(sys.exc_info()))
sys.exit("[FATAL] Terminating.")

#-- Main Block

0 comments on commit 5af09d7

Please sign in to comment.