
Commit

Merge pull request #106 from AmyOlex/timex
Major Chrono Update to output TimeML annotations.
AmyOlex authored Feb 21, 2021
2 parents c099f2e + 582279f commit 2aa5615
Showing 28 changed files with 1,383 additions and 284 deletions.
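For orientation, TimeML marks each normalized temporal expression with a TIMEX3 element. The snippet below is a minimal, hypothetical illustration of that target format (the tid, type, and value are invented for the example, not taken from this commit), written as a Python string in keeping with the rest of the code:

# Hypothetical TIMEX3 annotation in the TimeML style this update targets.
example_timex3 = '<TIMEX3 tid="t1" type="DATE" value="2021-02-21">February 21, 2021</TIMEX3>'
print(example_timex3)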
71 changes: 53 additions & 18 deletions Chrono.py
@@ -35,6 +35,7 @@
import argparse
import os
import pickle
import inspect

from chronoML import DecisionTree as DTree
from chronoML import RF_classifier as RandomForest
@@ -46,6 +47,9 @@
from keras.models import load_model

debug=False



## This is the driver method to run all of Chrono.
# @param INDIR The location of the directory with all the files in it.
# @param OUTDIR The location of the directory where you want all the output written.
@@ -56,32 +60,54 @@

## Parse input arguments
parser = argparse.ArgumentParser(description='Parse a directory of files to identify and normalize temporal information.')
parser.add_argument('-i', metavar='inputdir', type=str, help='path to the input directory.', required=True)
parser.add_argument('-i', metavar='inputdir', type=str, help='path to the input directory.', required=False, default=None)
parser.add_argument('-I', metavar='i2b2inputdir', type=str, help='path to the i2b2 input directory.', required=False, default=None)
parser.add_argument('-x', metavar='fileExt', type=str, help='input file extension if it exists. Default is an empty string', required=False, default="")
parser.add_argument('-o', metavar='outputdir', type=str, help='path to the output directory.', required=True)
parser.add_argument('-o', metavar='outputdir', type=str, help='path to the output directory.', required=False, default=None)
parser.add_argument('-O', metavar='i2b2outdir', type=str, help='The path to the i2b2 XML output directory.', required=False, default=None)
parser.add_argument('-m', metavar='MLmethod', type=str, help='The machine learning method to use. Must be one of NN (neural network), DT (decision tree), SVM (support vector machine), NB (naive bayes, default).', required=False, default='NB')
parser.add_argument('-w', metavar='windowSize', type=str, help='An integer representing the window size for context feature extraction. Default is 3.', required=False, default=3)
parser.add_argument('-d', metavar='MLTrainData', type=str, help='A string representing the file name that contains the CSV file with the training data matrix.', required=False, default=False)
parser.add_argument('-c', metavar='MLTrainClass', type=str, help='A string representing the file name that contains the known classes for the training data matrix.', required=False, default=False)
parser.add_argument('-M', metavar='MLmodel', type=str, help='The path and file name of a pre-built ML model for loading.', required=False, default=None)
#parser.add_argument('-r',metavar='includeRelative', type=str2bool, help='Tell Chrono to mark relative phrases temporal words as temporal.', action="store_true", default=False)
parser.add_argument('--includeRelative', action="store_true")

args = parser.parse_args()
## Now we can access each argument as args.i, args.o, args.r

#### need to check for input and output of one type here.
global dictpath
thisfilename = inspect.getframeinfo(inspect.currentframe()).filename
thispath = os.path.dirname(os.path.abspath(thisfilename))
dictpath = os.path.join(thispath,"dictionary")
print("The dictionary path: " + str(dictpath))


## Get list of folder names in the input directory
indirs = []
infiles = []
outfiles = []
outdirs = []
for root, dirs, files in os.walk(args.i, topdown = True):
    for name in dirs:

        indirs.append(os.path.join(root, name))
        infiles.append(os.path.join(root,name,name))
        outfiles.append(os.path.join(args.o,name,name))
        outdirs.append(os.path.join(args.o,name))
        if not os.path.exists(os.path.join(args.o,name)):
            os.makedirs(os.path.join(args.o,name))

if args.O is not None:
    for root, dirs, files in os.walk(args.I, topdown = True):

        files.sort()
        print("FILELIST: " + str(files))
        for name in files:
            indirs.append(os.path.join(args.I))
            infiles.append(os.path.join(args.I,name))
            outfiles.append(os.path.join(args.O,name))
            if not os.path.exists(os.path.join(args.O)):
                os.makedirs(os.path.join(args.O))
else:
    for root, dirs, files in os.walk(args.i, topdown = True):
        for name in dirs:
            indirs.append(os.path.join(root, name))
            infiles.append(os.path.join(root,name,name))
            outfiles.append(os.path.join(args.o,name,name))
            if not os.path.exists(os.path.join(args.o,name)):
                os.makedirs(os.path.join(args.o,name))

## Get training data for ML methods by importing pre-made boolean matrix
## Train ML methods on training data
@@ -141,18 +167,22 @@
my_chrono_ID_counter = 1

## parse out the doctime
doctime = utils.getDocTime(infiles[f] + ".dct")
if args.I is not None:
    doctime = utils.getDocTime(infiles[f], i2b2=True)
else:
    doctime = utils.getDocTime(infiles[f] + ".dct", i2b2=False)
if(debug) : print(doctime)

## parse out reference tokens
text, tokens, spans, tags, sents = utils.getWhitespaceTokens(infiles[f]+args.x)
raw_text, text, tokens, spans, tags, sents = utils.getWhitespaceTokens(infiles[f]+args.x)
#my_refToks = referenceToken.convertToRefTokens(tok_list=tokens, span=spans, remove_stopwords="./Chrono/stopwords_short2.txt")
my_refToks = referenceToken.convertToRefTokens(tok_list=tokens, span=spans, pos=tags, sent_boundaries=sents)


if(args.includeRelative):
print("Including Relative Terms")

## mark all ref tokens if they are numeric or temporal
chroList = utils.markTemporal(my_refToks)
chroList = utils.markTemporal(my_refToks, include_relative = args.includeRelative)

if(debug) :
print("REFERENCE TOKENS:\n")
@@ -165,9 +195,14 @@
print(c)


chrono_master_list, my_chrono_ID_counter = BuildEntities.buildChronoList(tempPhrases, my_chrono_ID_counter, chroList, (classifier, args.m), feats, doctime)
chrono_master_list, my_chrono_ID_counter, timex_phrases = BuildEntities.buildChronoList(tempPhrases, my_chrono_ID_counter, chroList, (classifier, args.m), feats, doctime)

print("Number of Chrono Entities: " + str(len(chrono_master_list)))
utils.write_xml(chrono_list=chrono_master_list, outfile=outfiles[f])

if args.O is not None:
    utils.write_i2b2(raw_text, timex_phrases, outfile=outfiles[f])
else:
    utils.write_xml(chrono_list=chrono_master_list, outfile=outfiles[f])



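Taken together, the argument changes above suggest two ways of invoking the driver: the original per-document directory mode (-i/-o) and the new i2b2 mode (-I/-O). The commands below are hypothetical examples with placeholder paths, not commands taken from this commit:

python Chrono.py -i ./data/input -o ./results -m NB -w 3
python Chrono.py -I ./data/i2b2_input -O ./results_i2b2 -m NB

In the -i mode each document is expected in its own subdirectory, as implied by the os.walk logic above; in the -I mode each file in the input directory is processed directly.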
124 changes: 36 additions & 88 deletions Chrono/BuildEntities.py
@@ -62,8 +62,11 @@ def buildChronoList(TimePhraseList, chrono_id, ref_list, PIclassifier, PIfeature
## Convert to lowercase
ref_list = referenceToken.lowercase(ref_list)

## this list will contain only the phrases that have a temporal component with a scate entity.
timex_list = []

for s in TimePhraseList:
#print(s)
print("\nNOW PARSING PHRASE: " + s.getText() + "\n")
chrono_tmp_list = []

# this is the new chrono time flags so we don't duplicate effort. Will need to eventually re-write this flow.
Expand All @@ -78,6 +81,8 @@ def buildChronoList(TimePhraseList, chrono_id, ref_list, PIclassifier, PIfeature
chrono_tmp_list, chrono_id, chrono_time_flags = MonthYear.buildMonthOfYear(s, chrono_id, chrono_tmp_list, chrono_time_flags)
#Parse out Day-of-Month
chrono_tmp_list, chrono_id, chrono_time_flags = DayOfMonth.buildDayOfMonth(s, chrono_id, chrono_tmp_list, chrono_time_flags)
#Parse AMPM before Hour of Day
chrono_tmp_list, chrono_id = AMPM.buildAMPM(s, chrono_id, chrono_tmp_list, chrono_time_flags)
#Parse out HourOfDay
chrono_tmp_list, chrono_id, chrono_time_flags = HourOfDay.buildHourOfDay(s, chrono_id, chrono_tmp_list, chrono_time_flags)
#Parse out MinuteOfHour
Expand All @@ -95,7 +100,7 @@ def buildChronoList(TimePhraseList, chrono_id, ref_list, PIclassifier, PIfeature

chrono_tmp_list, chrono_id = DayOfWeek.buildDayOfWeek(s, chrono_id, chrono_tmp_list)
chrono_tmp_list, chrono_id, chrono_time_flags = TextMonthAndDay.buildTextMonthAndDay(s, chrono_id, chrono_tmp_list, chrono_time_flags, dct, ref_list)
chrono_tmp_list, chrono_id = AMPM.buildAMPM(s, chrono_id, chrono_tmp_list, chrono_time_flags)
#chrono_tmp_list, chrono_id = AMPM.buildAMPM(s, chrono_id, chrono_tmp_list, chrono_time_flags)
chrono_tmp_list, chrono_id = PartOfDay.buildPartOfDay(s, chrono_id, chrono_tmp_list)
chrono_tmp_list, chrono_id = PartOfWeek.buildPartOfWeek(s, chrono_id, chrono_tmp_list)
chrono_tmp_list, chrono_id = Season.buildSeasonOfYear(s, chrono_id, chrono_tmp_list, ref_list)
@@ -106,20 +111,40 @@
chrono_tmp_list, chrono_id = NthFromStart.buildNthFromStart(s, chrono_id, chrono_tmp_list, ref_list)
chrono_tmp_list, chrono_id = TimeZone.buildTimeZone(s, chrono_id, chrono_tmp_list)
chrono_tmp_list, chrono_id = Last.buildLast(s, chrono_id, chrono_tmp_list)
chrono_tmp_list, chrono_id = Frequency.buildFrequency(s, chrono_id, chrono_tmp_list)


# print("XXXXXXXXX")
# print(s)
# for e in chrono_tmp_list:
# print(e)
print("XXXXXXXXX")

# if len(chrono_tmp_list) > 0:
# print(s)
# timex_list.append(s)
# for e in chrono_tmp_list:
# print(e)

tmplist, chrono_id = buildSubIntervals(chrono_tmp_list, chrono_id, dct, ref_list)
chrono_list = chrono_list+tmplist
## tmplist is a list of ChronoEntities for a single phrase, but can be returned empty
## Need to add ISO conversion here!

if len(tmplist) > 0:
    print("Converting phrase to ISO: " + str(s))
    s.getISO(tmplist)
    print("ISO Value: " + str(s))
    print("TIMEX3 String: " + s.i2b2format())
    timex_list.append(s)



chrono_list = chrono_list+tmplist ##chrono_list is a list of ChronoEntities, and phrase information is lost
#print(chrono_list)

#Going to incorporate in future builds
#chrono_list, chrono_id = buildDuration(s, chrono_id, chrono_list)
#chrono_list, chrono_id = buildSet(s, chrono_id, chrono_list)

#print("TIMEX LIST: " + str(timex_list))

return chrono_list, chrono_id
return chrono_list, chrono_id, timex_list

####
#END_MODULE
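Because buildChronoList now returns a third value, existing callers have to unpack three items instead of two. A minimal sketch of the updated call site, mirroring the driver change in Chrono.py above:

# Sketch of the updated call pattern; variable names mirror the driver code above.
chrono_master_list, my_chrono_ID_counter, timex_phrases = BuildEntities.buildChronoList(
    tempPhrases, my_chrono_ID_counter, chroList, (classifier, args.m), feats, doctime)
# timex_phrases carries the phrase-level TIMEX3 records for i2b2 output, while
# chrono_master_list still drives the original entity-level XML output.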
@@ -130,88 +155,11 @@ def buildChronoList(TimePhraseList, chrono_id, ref_list, PIclassifier, PIfeature
# @param list of ChronoEntities
# @return List of ChronoEntities with sub-intervals assigned
def buildSubIntervals(chrono_list, chrono_id, dct, ref_list):
    year = None
    month = None
    day = None
    hour = None
    minute = None
    second = None
    daypart = None
    dayweek = None
    interval = None
    period = None
    nth = None
    nxt = None
    this = None
    tz = None
    ampm = None
    modifier = None
    last = None


    entity_count = 0
    year,month,day,hour,minute,second,daypart,dayweek,interval,period,nth,nxt,this,tz,ampm,modifier,last,entity_count = utils.getEntityTypes(chrono_list)

    #print("in Build Subintervals")
    ## loop through all entities and pull out the appropriate IDs
    for e in range(0,len(chrono_list)):
        #print(chrono_list[e].get_id())
        e_type = chrono_list[e].get_type()
        #print("E-type: " + e_type)

        if e_type == "Two-Digit-Year" or e_type == "Year":
            year = e
            entity_count = entity_count + 1
            # print("YEAR VALUE: " + str(chrono_list[e].get_value()))
        elif e_type == "Month-Of-Year":
            # print("FOUND Month")
            month = e
            entity_count = entity_count + 1
        elif e_type == "Day-Of-Month":
            day = e
            entity_count = entity_count + 1
        elif e_type == "Hour-Of-Day":
            hour = e
            entity_count = entity_count + 1
        elif e_type == "Minute-Of-Hour":
            minute = e
            entity_count = entity_count + 1
        elif e_type == "Second-Of-Minute":
            second = e
            entity_count = entity_count + 1
        elif e_type == "Part-Of-Day":
            daypart = e
            entity_count = entity_count + 1
        elif e_type == "Day-Of-Week":
            dayweek = e
            entity_count = entity_count + 1
        elif e_type == "Calendar-Interval":
            interval = e
            entity_count = entity_count + 1
        elif e_type == "Period":
            period = e
            entity_count = entity_count + 1
        elif e_type == "NthFromStart":
            nth = e
            entity_count = entity_count + 1
        elif e_type == "Next":
            nxt = e
            entity_count = entity_count + 1
        elif e_type == "This":
            this = e
            entity_count = entity_count + 1

        elif e_type == "Time-Zone":
            tz = e
            entity_count = entity_count + 1
        elif e_type == "AMPM-Of-Day":
            ampm = e
            entity_count = entity_count + 1
        elif e_type == "Modifier":
            modifier = e
            entity_count = entity_count + 1
        elif e_type == "Last":
            last = e
            entity_count = entity_count + 1




## Now add additional NEXT and LAST entities where needed
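The long if/elif chain above is consolidated into the single utils.getEntityTypes call near the top of buildSubIntervals. That helper itself is not shown in this diff; the sketch below is only an assumption of what such a consolidation could look like (the names get_entity_indices and TYPE_TO_SLOT are hypothetical, and the real helper returns the individual slot variables rather than a dictionary):

# Illustrative sketch only; the real utils.getEntityTypes is not shown in this diff.
TYPE_TO_SLOT = {
    "Year": "year", "Two-Digit-Year": "year", "Month-Of-Year": "month",
    "Day-Of-Month": "day", "Hour-Of-Day": "hour", "Minute-Of-Hour": "minute",
    "Second-Of-Minute": "second", "Part-Of-Day": "daypart", "Day-Of-Week": "dayweek",
    "Calendar-Interval": "interval", "Period": "period", "NthFromStart": "nth",
    "Next": "nxt", "This": "this", "Time-Zone": "tz", "AMPM-Of-Day": "ampm",
    "Modifier": "modifier", "Last": "last",
}

def get_entity_indices(chrono_list):
    """Map each temporal slot name to the index of the matching entity, if any."""
    slots = {slot: None for slot in TYPE_TO_SLOT.values()}
    entity_count = 0
    for i, entity in enumerate(chrono_list):
        slot = TYPE_TO_SLOT.get(entity.get_type())
        if slot is not None:
            slots[slot] = i          # remember where this entity sits in chrono_list
            entity_count += 1
    return slots, entity_count

A table-driven mapping like this keeps the type-to-slot bookkeeping in one place instead of spreading it across a long conditional.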
