
Commit

Merge pull request #106 from AmyOlex/timex
Major Chrono Update to output TimeML annotations.
AmyOlex authored Feb 21, 2021
2 parents c099f2e + 582279f commit 2aa5615
Showing 28 changed files with 1,383 additions and 284 deletions.
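For orientation, TimeML marks each normalized temporal expression with a TIMEX3 element. The snippet below is a minimal, hypothetical illustration of that target format (the tid, type, and value are invented for the example, not taken from this commit), written as a Python string in keeping with the rest of the code:

# Hypothetical TIMEX3 annotation in the TimeML style this update targets.
example_timex3 = '<TIMEX3 tid="t1" type="DATE" value="2021-02-21">February 21, 2021</TIMEX3>'
print(example_timex3)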
71 changes: 53 additions & 18 deletions Chrono.py
@@ -35,6 +35,7 @@
import argparse
import os
import pickle
import inspect

from chronoML import DecisionTree as DTree
from chronoML import RF_classifier as RandomForest
@@ -46,6 +47,9 @@
from keras.models import load_model

debug=False



## This is the driver method to run all of Chrono.
# @param INDIR The location of the directory with all the files in it.
# @param OUTDIR The location of the directory where you want all the output written.
@@ -56,32 +60,54 @@

## Parse input arguments
parser = argparse.ArgumentParser(description='Parse a directory of files to identify and normalize temporal information.')
parser.add_argument('-i', metavar='inputdir', type=str, help='path to the input directory.', required=True)
parser.add_argument('-i', metavar='inputdir', type=str, help='path to the input directory.', required=False, default=None)
parser.add_argument('-I', metavar='i2b2inputdir', type=str, help='path to the i2b2 input directory.', required=False, default=None)
parser.add_argument('-x', metavar='fileExt', type=str, help='input file extension if it exists. Default is an empty string', required=False, default="")
parser.add_argument('-o', metavar='outputdir', type=str, help='path to the output directory.', required=True)
parser.add_argument('-o', metavar='outputdir', type=str, help='path to the output directory.', required=False, default=None)
parser.add_argument('-O', metavar='i2b2outdir', type=str, help='The path to the i2b2 XML output directory.', required=False, default=None)
parser.add_argument('-m', metavar='MLmethod', type=str, help='The machine learning method to use. Must be one of NN (neural network), DT (decision tree), SVM (support vector machine), NB (naive bayes, default).', required=False, default='NB')
parser.add_argument('-w', metavar='windowSize', type=str, help='An integer representing the window size for context feature extraction. Default is 3.', required=False, default=3)
parser.add_argument('-d', metavar='MLTrainData', type=str, help='A string representing the file name that contains the CSV file with the training data matrix.', required=False, default=False)
parser.add_argument('-c', metavar='MLTrainClass', type=str, help='A string representing the file name that contains the known classes for the training data matrix.', required=False, default=False)
parser.add_argument('-M', metavar='MLmodel', type=str, help='The path and file name of a pre-built ML model for loading.', required=False, default=None)
#parser.add_argument('-r',metavar='includeRelative', type=str2bool, help='Tell Chrono to mark relative phrases temporal words as temporal.', action="store_true", default=False)
parser.add_argument('--includeRelative', action="store_true")

args = parser.parse_args()
## Now we can access each argument as args.i, args.o, args.r

#### need to check for input and output of one type here.
global dictpath
thisfilename = inspect.getframeinfo(inspect.currentframe()).filename
thispath = os.path.dirname(os.path.abspath(thisfilename))
dictpath = os.path.join(thispath,"dictionary")
print("The dictionary path: " + str(dictpath))


## Get list of folder names in the input directory
indirs = []
infiles = []
outfiles = []
outdirs = []
for root, dirs, files in os.walk(args.i, topdown = True):
    for name in dirs:

        indirs.append(os.path.join(root, name))
        infiles.append(os.path.join(root,name,name))
        outfiles.append(os.path.join(args.o,name,name))
        outdirs.append(os.path.join(args.o,name))
        if not os.path.exists(os.path.join(args.o,name)):
            os.makedirs(os.path.join(args.o,name))

if args.O is not None:
    for root, dirs, files in os.walk(args.I, topdown = True):

        files.sort()
        print("FILELIST: " + str(files))
        for name in files:
            indirs.append(os.path.join(args.I))
            infiles.append(os.path.join(args.I,name))
            outfiles.append(os.path.join(args.O,name))
            if not os.path.exists(os.path.join(args.O)):
                os.makedirs(os.path.join(args.O))
else:
    for root, dirs, files in os.walk(args.i, topdown = True):
        for name in dirs:
            indirs.append(os.path.join(root, name))
            infiles.append(os.path.join(root,name,name))
            outfiles.append(os.path.join(args.o,name,name))
            if not os.path.exists(os.path.join(args.o,name)):
                os.makedirs(os.path.join(args.o,name))

## Get training data for ML methods by importing pre-made boolean matrix
## Train ML methods on training data
@@ -141,18 +167,22 @@
my_chrono_ID_counter = 1

## parse out the doctime
doctime = utils.getDocTime(infiles[f] + ".dct")
if args.I is not None:
    doctime = utils.getDocTime(infiles[f], i2b2=True)
else:
    doctime = utils.getDocTime(infiles[f] + ".dct", i2b2=False)
if(debug) : print(doctime)

## parse out reference tokens
text, tokens, spans, tags, sents = utils.getWhitespaceTokens(infiles[f]+args.x)
raw_text, text, tokens, spans, tags, sents = utils.getWhitespaceTokens(infiles[f]+args.x)
#my_refToks = referenceToken.convertToRefTokens(tok_list=tokens, span=spans, remove_stopwords="./Chrono/stopwords_short2.txt")
my_refToks = referenceToken.convertToRefTokens(tok_list=tokens, span=spans, pos=tags, sent_boundaries=sents)


if(args.includeRelative):
print("Including Relative Terms")

## mark all ref tokens if they are numeric or temporal
chroList = utils.markTemporal(my_refToks)
chroList = utils.markTemporal(my_refToks, include_relative = args.includeRelative)

if(debug) :
print("REFERENCE TOKENS:\n")
@@ -165,9 +195,14 @@
print(c)


chrono_master_list, my_chrono_ID_counter = BuildEntities.buildChronoList(tempPhrases, my_chrono_ID_counter, chroList, (classifier, args.m), feats, doctime)
chrono_master_list, my_chrono_ID_counter, timex_phrases = BuildEntities.buildChronoList(tempPhrases, my_chrono_ID_counter, chroList, (classifier, args.m), feats, doctime)

print("Number of Chrono Entities: " + str(len(chrono_master_list)))
utils.write_xml(chrono_list=chrono_master_list, outfile=outfiles[f])

if args.O is not None:
    utils.write_i2b2(raw_text, timex_phrases, outfile=outfiles[f])
else:
    utils.write_xml(chrono_list=chrono_master_list, outfile=outfiles[f])



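Taken together, the argument changes above suggest two ways of invoking the driver: the original per-document directory mode (-i/-o) and the new i2b2 mode (-I/-O). The commands below are hypothetical examples with placeholder paths, not commands taken from this commit:

python Chrono.py -i ./data/input -o ./results -m NB -w 3
python Chrono.py -I ./data/i2b2_input -O ./results_i2b2 -m NB

In the -i mode each document is expected in its own subdirectory, as implied by the os.walk logic above; in the -I mode each file in the input directory is processed directly.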
124 changes: 36 additions & 88 deletions Chrono/BuildEntities.py
@@ -62,8 +62,11 @@ def buildChronoList(TimePhraseList, chrono_id, ref_list, PIclassifier, PIfeature
## Convert to lowercase
ref_list = referenceToken.lowercase(ref_list)

## this list will contain only the phrases that have a temporal component with a scate entity.
timex_list = []

for s in TimePhraseList:
#print(s)
print("\nNOW PARSING PHRASE: " + s.getText() + "\n")
chrono_tmp_list = []

# this is the new chrono time flags so we don't duplicate effort. Will need to eventually re-write this flow.
Expand All @@ -78,6 +81,8 @@ def buildChronoList(TimePhraseList, chrono_id, ref_list, PIclassifier, PIfeature
chrono_tmp_list, chrono_id, chrono_time_flags = MonthYear.buildMonthOfYear(s, chrono_id, chrono_tmp_list, chrono_time_flags)
#Parse out Day-of-Month
chrono_tmp_list, chrono_id, chrono_time_flags = DayOfMonth.buildDayOfMonth(s, chrono_id, chrono_tmp_list, chrono_time_flags)
#Parse AMPM before Hour of Day
chrono_tmp_list, chrono_id = AMPM.buildAMPM(s, chrono_id, chrono_tmp_list, chrono_time_flags)
#Parse out HourOfDay
chrono_tmp_list, chrono_id, chrono_time_flags = HourOfDay.buildHourOfDay(s, chrono_id, chrono_tmp_list, chrono_time_flags)
#Parse out MinuteOfHour
Expand All @@ -95,7 +100,7 @@ def buildChronoList(TimePhraseList, chrono_id, ref_list, PIclassifier, PIfeature

chrono_tmp_list, chrono_id = DayOfWeek.buildDayOfWeek(s, chrono_id, chrono_tmp_list)
chrono_tmp_list, chrono_id, chrono_time_flags = TextMonthAndDay.buildTextMonthAndDay(s, chrono_id, chrono_tmp_list, chrono_time_flags, dct, ref_list)
chrono_tmp_list, chrono_id = AMPM.buildAMPM(s, chrono_id, chrono_tmp_list, chrono_time_flags)
#chrono_tmp_list, chrono_id = AMPM.buildAMPM(s, chrono_id, chrono_tmp_list, chrono_time_flags)
chrono_tmp_list, chrono_id = PartOfDay.buildPartOfDay(s, chrono_id, chrono_tmp_list)
chrono_tmp_list, chrono_id = PartOfWeek.buildPartOfWeek(s, chrono_id, chrono_tmp_list)
chrono_tmp_list, chrono_id = Season.buildSeasonOfYear(s, chrono_id, chrono_tmp_list, ref_list)
@@ -106,20 +111,40 @@
chrono_tmp_list, chrono_id = NthFromStart.buildNthFromStart(s, chrono_id, chrono_tmp_list, ref_list)
chrono_tmp_list, chrono_id = TimeZone.buildTimeZone(s, chrono_id, chrono_tmp_list)
chrono_tmp_list, chrono_id = Last.buildLast(s, chrono_id, chrono_tmp_list)
chrono_tmp_list, chrono_id = Frequency.buildFrequency(s, chrono_id, chrono_tmp_list)


# print("XXXXXXXXX")
# print(s)
# for e in chrono_tmp_list:
# print(e)
print("XXXXXXXXX")

# if len(chrono_tmp_list) > 0:
# print(s)
# timex_list.append(s)
# for e in chrono_tmp_list:
# print(e)

tmplist, chrono_id = buildSubIntervals(chrono_tmp_list, chrono_id, dct, ref_list)
chrono_list = chrono_list+tmplist
## tmplist is a list of ChronoEntities for a single phrase, but can be returned empty
## Need to add ISO conversion here!

if len(tmplist) > 0:
    print("Converting phrase to ISO: " + str(s))
    s.getISO(tmplist)
    print("ISO Value: " + str(s))
    print("TIMEX3 String: " + s.i2b2format())
    timex_list.append(s)



chrono_list = chrono_list+tmplist ##chrono_list is a list of ChronoEntities, and phrase information is lost
#print(chrono_list)

#Going to incorporate in future builds
#chrono_list, chrono_id = buildDuration(s, chrono_id, chrono_list)
#chrono_list, chrono_id = buildSet(s, chrono_id, chrono_list)

#print("TIMEX LIST: " + str(timex_list))

return chrono_list, chrono_id
return chrono_list, chrono_id, timex_list

####
#END_MODULE
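Because buildChronoList now returns a third value, existing callers have to unpack three items instead of two. A minimal sketch of the updated call site, mirroring the driver change in Chrono.py above:

# Sketch of the updated call pattern; variable names mirror the driver code above.
chrono_master_list, my_chrono_ID_counter, timex_phrases = BuildEntities.buildChronoList(
    tempPhrases, my_chrono_ID_counter, chroList, (classifier, args.m), feats, doctime)
# timex_phrases carries the phrase-level TIMEX3 records for i2b2 output, while
# chrono_master_list still drives the original entity-level XML output.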
@@ -130,88 +155,11 @@ def buildChronoList(TimePhraseList, chrono_id, ref_list, PIclassifier, PIfeature
# @param list of ChronoEntities
# @return List of ChronoEntities with sub-intervals assigned
def buildSubIntervals(chrono_list, chrono_id, dct, ref_list):
    year = None
    month = None
    day = None
    hour = None
    minute = None
    second = None
    daypart = None
    dayweek = None
    interval = None
    period = None
    nth = None
    nxt = None
    this = None
    tz = None
    ampm = None
    modifier = None
    last = None


    entity_count = 0
    year,month,day,hour,minute,second,daypart,dayweek,interval,period,nth,nxt,this,tz,ampm,modifier,last,entity_count = utils.getEntityTypes(chrono_list)

    #print("in Build Subintervals")
    ## loop through all entities and pull out the appropriate IDs
    for e in range(0,len(chrono_list)):
        #print(chrono_list[e].get_id())
        e_type = chrono_list[e].get_type()
        #print("E-type: " + e_type)

        if e_type == "Two-Digit-Year" or e_type == "Year":
            year = e
            entity_count = entity_count + 1
            # print("YEAR VALUE: " + str(chrono_list[e].get_value()))
        elif e_type == "Month-Of-Year":
            # print("FOUND Month")
            month = e
            entity_count = entity_count + 1
        elif e_type == "Day-Of-Month":
            day = e
            entity_count = entity_count + 1
        elif e_type == "Hour-Of-Day":
            hour = e
            entity_count = entity_count + 1
        elif e_type == "Minute-Of-Hour":
            minute = e
            entity_count = entity_count + 1
        elif e_type == "Second-Of-Minute":
            second = e
            entity_count = entity_count + 1
        elif e_type == "Part-Of-Day":
            daypart = e
            entity_count = entity_count + 1
        elif e_type == "Day-Of-Week":
            dayweek = e
            entity_count = entity_count + 1
        elif e_type == "Calendar-Interval":
            interval = e
            entity_count = entity_count + 1
        elif e_type == "Period":
            period = e
            entity_count = entity_count + 1
        elif e_type == "NthFromStart":
            nth = e
            entity_count = entity_count + 1
        elif e_type == "Next":
            nxt = e
            entity_count = entity_count + 1
        elif e_type == "This":
            this = e
            entity_count = entity_count + 1

        elif e_type == "Time-Zone":
            tz = e
            entity_count = entity_count + 1
        elif e_type == "AMPM-Of-Day":
            ampm = e
            entity_count = entity_count + 1
        elif e_type == "Modifier":
            modifier = e
            entity_count = entity_count + 1
        elif e_type == "Last":
            last = e
            entity_count = entity_count + 1




## Now add additional NEXT and LAST entities where needed
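The long if/elif chain above is consolidated into the single utils.getEntityTypes call near the top of buildSubIntervals. That helper itself is not shown in this diff; the sketch below is only an assumption of what such a consolidation could look like (the names get_entity_indices and TYPE_TO_SLOT are hypothetical, and the real helper returns the individual slot variables rather than a dictionary):

# Illustrative sketch only; the real utils.getEntityTypes is not shown in this diff.
TYPE_TO_SLOT = {
    "Year": "year", "Two-Digit-Year": "year", "Month-Of-Year": "month",
    "Day-Of-Month": "day", "Hour-Of-Day": "hour", "Minute-Of-Hour": "minute",
    "Second-Of-Minute": "second", "Part-Of-Day": "daypart", "Day-Of-Week": "dayweek",
    "Calendar-Interval": "interval", "Period": "period", "NthFromStart": "nth",
    "Next": "nxt", "This": "this", "Time-Zone": "tz", "AMPM-Of-Day": "ampm",
    "Modifier": "modifier", "Last": "last",
}

def get_entity_indices(chrono_list):
    """Map each temporal slot name to the index of the matching entity, if any."""
    slots = {slot: None for slot in TYPE_TO_SLOT.values()}
    entity_count = 0
    for i, entity in enumerate(chrono_list):
        slot = TYPE_TO_SLOT.get(entity.get_type())
        if slot is not None:
            slots[slot] = i          # remember where this entity sits in chrono_list
            entity_count += 1
    return slots, entity_count

A table-driven mapping like this keeps the type-to-slot bookkeeping in one place instead of spreading it across a long conditional.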
