
Commit

Merge pull request #194 from amir-zeldes/dev
V10.2.0
amir-zeldes authored Nov 27, 2024
2 parents 9df08e9 + cdb7099 commit aaa74a3
Showing 2,864 changed files with 937,464 additions and 107,599 deletions.
4 changes: 2 additions & 2 deletions LICENSE.txt
@@ -1,4 +1,4 @@
This corpus was built on data obtained from three different sources. The underlying texts
This corpus was built on data obtained from different sources. The underlying texts
are licensed under the following licenses:


@@ -39,7 +39,7 @@ The annotations were produced by the following people:
* Arianna Janoff
* Aryaman Arora
* Ayan Mandal
* Aysenur Sagdic
* Ayşenur Sağdıç
* Bertille Baron
* Bradford Salen
* Brandon Tullock
9 changes: 5 additions & 4 deletions README.md
@@ -126,10 +126,11 @@ The corpus is downloadable in multiple formats. Not all formats contain all anno
* ontogum/ - alternative version of coreference annotation in CoNLL, tsv and CoNLL-U formats following OntoNotes guidelines (see Zhu et al. 2021)
* dep/ - Dependency trees using Universal Dependencies, enriched with metadata, summaries, sentence types, speaker information, enhanced dependencies, entities, information status, salience, centering, coreference, bridging, Wikification, XML markup, morphological tags/segmentation, CxG constructions, discourse relations/connectives/signals, and Universal POS tags according to the UD standard
* paula/ - The entire merged corpus (excl. Reddit) in standoff [PAULA XML](https://github.com/korpling/paula-xml), with all annotations
* rst/ - Enhanced Rhetorical Structure Theory (eRST) analyses
* rst/ - Enhanced Rhetorical Structure Theory (eRST) analyses and other discourse relation annotations
* rstweb/ - full .rs4 format data as used by RSTTool and rstWeb, with secondary edges + relation signals (recommended)
* lisp_nary/ - n-ary lisp trees (.dis format)
* lisp_binary/ - binarized lisp trees (.dis format)
* dependencies/ - a converted RST dependency representation (.rsd format)
* lisp_nary/ - n-ary basic RST lisp trees (.dis format)
* lisp_binary/ - binarized basic RST lisp trees (.dis format)
* dependencies/ - a converted eRST dependency representation with secondary edges in a separate column (.rsd format)
* disrpt/ - plain segmentation, connective detection and relation-per-line data formats following the DISRPT shared task specification
* gdtb/ - shallow discourse relations following PDTB v3 guidelines in two formats: PDTB pipes and DISRPT .rels
* xml/ - vertical XML representations with 1 token or tag per line, metadata, summaries and tab delimited lemmas, morphological segmentation and POS tags (extended VVZ style, vanilla, UPOS and CLAWS5, as well as dependency functions), compatible with the IMS Corpus Workbench (a.k.a. TreeTagger format).
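
The dep/ files listed above use standard UD CoNLL-U: one token per line with ten tab-separated columns, and comment lines starting with `#`. A minimal sketch of counting Universal POS tags in such a file (the sample sentence is invented for illustration, not taken from the corpus):

```python
from collections import Counter

# Invented sample sentence in CoNLL-U layout (10 tab-separated columns).
rows = [
    ["1", "Linguists", "linguist", "NOUN", "NNS", "Number=Plur", "2", "nsubj", "_", "_"],
    ["2", "annotate", "annotate", "VERB", "VBP", "_", "0", "root", "_", "_"],
    ["3", "corpora", "corpus", "NOUN", "NNS", "Number=Plur", "2", "obj", "_", "_"],
    ["4", ".", ".", "PUNCT", ".", "_", "2", "punct", "_", "_"],
]
sample = "# text = Linguists annotate corpora.\n" + "\n".join("\t".join(r) for r in rows)

def upos_counts(conllu_text):
    """Count UPOS tags (column 4), skipping comments, blank lines,
    multiword-token ranges (e.g. 1-2) and empty nodes (e.g. 1.1)."""
    counts = Counter()
    for line in conllu_text.splitlines():
        if not line.strip() or line.startswith("#"):
            continue
        cols = line.split("\t")
        if "-" in cols[0] or "." in cols[0]:
            continue
        counts[cols[3]] += 1
    return counts

print(upos_counts(sample))  # NOUN: 2, VERB: 1, PUNCT: 1
```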
82 changes: 49 additions & 33 deletions _build/build_gum.py
@@ -31,18 +31,21 @@ def setup_directories(gum_source, gum_target):


parser = ArgumentParser()
parser.add_argument("-t",dest="target",action="store",help="GUM build target directory", default=None)
parser.add_argument("-s",dest="source",action="store",help="GUM build source directory", default=None)
parser.add_argument("-t",dest="target",action="store",help="Build target directory", default=None)
parser.add_argument("-s",dest="source",action="store",help="Build source directory", default=None)
parser.add_argument("-p",dest="parse",action="store_true",help="Whether to reparse constituents")
parser.add_argument("-c",dest="claws",action="store_true",help="Whether to reassign claws5 tags")
parser.add_argument("-v",dest="verbose_pepper",action="store_true",help="Whether to print verbose pepper output")
parser.add_argument("-n",dest="no_pepper",action="store_true",help="No pepper conversion, just validation and file fixing")
parser.add_argument("-i",dest="increment_version",action="store",help="A new version number to assign",default="DEVELOP")
parser.add_argument("--rsd_algorithm",choices=["li","hirao","chain"],action="store",help="Discourse dependency conversion algorithm",default="li")
parser.add_argument("--disrpt_outmode",choices=["standoff","standoff_reltype","standoff_key","compact"],help="DISRPT rels format output style",default="standoff_reltype")
parser.add_argument("--pepper_only",action="store_true", help="Just rerun pepper on generated targets")
parser.add_argument("--discourse_only",action="store_true", help="Just rerun discourse relation output formats generation")
parser.add_argument("--skip_ptb_labels",action="store_true", help="Skip projecting function labels to PTB trees")
parser.add_argument("--skip_ontogum",action="store_true", help="Skip building OntoGUM version of coref data")
parser.add_argument("--no_secedges",action="store_true", help="No RST++ secedges in conllu")
parser.add_argument("--no_signals",action="store_true", help="No RST++ signals in conllu")
parser.add_argument("--no_secedges",action="store_true", help="No eRST secedges in conllu")
parser.add_argument("--no_signals",action="store_true", help="No eRST signals in conllu")
parser.add_argument("--corpus_name",action="store", default="GUM", help="Corpus name / document prefix")

options = parser.parse_args()
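
The two new options can be exercised on their own; this fragment mirrors the parser lines added above (just these two arguments, not the full build_gum.py parser):

```python
from argparse import ArgumentParser

# Fragment mirroring the two options added in this commit.
parser = ArgumentParser()
parser.add_argument("--rsd_algorithm", choices=["li", "hirao", "chain"], action="store",
                    help="Discourse dependency conversion algorithm", default="li")
parser.add_argument("--disrpt_outmode",
                    choices=["standoff", "standoff_reltype", "standoff_key", "compact"],
                    help="DISRPT rels format output style", default="standoff_reltype")

opts = parser.parse_args(["--rsd_algorithm", "chain"])
print(opts.rsd_algorithm, opts.disrpt_outmode)  # chain standoff_reltype
```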
@@ -78,7 +81,7 @@ def setup_directories(gum_source, gum_target):
print("="*20 + "\n")

reddit = check_reddit(gum_source)
if not reddit:
if not reddit and options.corpus_name != "GENTLE":
print("Could not find restored tokens in reddit documents.")
print("Abort conversion or continue without reddit? (You can restore reddit tokens using process_reddit.py)")
try:
@@ -204,7 +207,7 @@ def check_diff(xml, ptb, docname):
#proof(gum_source)

conn_data = {}
if not options.pepper_only:
if not options.pepper_only and not options.discourse_only:
# Token and sentence border adjustments
print("\nAdjusting token and sentence borders:\n" + "="*37)
# Adjust tsv/ files:
@@ -215,11 +218,13 @@
# * return conllu-a style bracket informatio to add entity data to conllu files later
conllua_data, centering_data, salience_data = fix_tsv(gum_source, gum_target, reddit=reddit)

if not options.pepper_only:
# Adjust rst/ files:
# * refresh token strings in case of inconsistency
# * note that segment borders are not automatically adjusted around xml/ <s> elements
conn_data = fix_rst(gum_source, gum_target, reddit=reddit)
conn_data = fix_rst(gum_source, gum_target, reddit=reddit, rsd_algorithm=options.rsd_algorithm)

if not options.pepper_only and not options.discourse_only:
# Add annotations to xml/:
# * add CLAWS tags in fourth column
# * add fifth column after lemma containing tok_func from dep/
@@ -292,7 +297,7 @@ def check_diff(xml, ptb, docname):
sys.stderr.write("i Pepper only conversion, entities in conllu-a data will be generated from Pepper output (no infsat or min IDs)\n")

## Step 3: merge and convert source formats to target formats
if options.no_pepper:
if options.no_pepper or options.discourse_only:
sys.__stdout__.write("\ni Skipping Pepper conversion\n")
else:
sys.__stdout__.write("\nStarting pepper conversion:\n" + "="*30 + "\n")
@@ -309,10 +314,10 @@ def check_diff(xml, ptb, docname):
if not reddit and "reddit_" in file_:
continue
files.append(file_)
if not os.path.exists(pepper_tmp + out_dir_name + os.sep + "GUM" + os.sep):
os.makedirs(pepper_tmp + out_dir_name + os.sep + "GUM" + os.sep)
if not os.path.exists(pepper_tmp + out_dir_name + os.sep + corpus_name + os.sep):
os.makedirs(pepper_tmp + out_dir_name + os.sep + corpus_name + os.sep)
for file_ in files:
shutil.copy(file_, pepper_tmp + out_dir_name + os.sep + "GUM" + os.sep)
shutil.copy(file_, pepper_tmp + out_dir_name + os.sep + corpus_name + os.sep)
if not os.path.exists(gum_target + "coref" + os.sep + "conll" + os.sep):
os.makedirs(gum_target + "coref" + os.sep + "conll" + os.sep)

@@ -335,7 +340,7 @@ def check_diff(xml, ptb, docname):
meta = io.open(pepper_home + "meta_template.meta", encoding="utf8").read().replace("\r","")
meta = meta.replace("**gum_version**",options.increment_version)
meta = meta.replace("**build_date**",build_date)
meta_out = io.open(pepper_tmp + "xml" + os.sep + "GUM" + os.sep + "GUM.meta",'w')
meta_out = io.open(pepper_tmp + "xml" + os.sep + corpus_name + os.sep + corpus_name + ".meta",'w')
meta_out.write(meta)
meta_out.close()

@@ -353,34 +358,45 @@ def check_diff(xml, ptb, docname):
quit()

## Step 4: propagate entity types, coref, discourse relations and XML annotations into conllu dep files
from utils.propagate import add_entities_to_conllu, add_rsd_to_conllu, add_bridging_to_conllu, add_xml_to_conllu
from utils.propagate import add_entities_to_conllu, add_rsd_and_pdtb_to_conllu, add_bridging_to_conllu, add_xml_to_conllu

add_entities_to_conllu(gum_target, reddit=reddit, ontogum=False, conllua_data=conllua_data, salience_data=salience_data)
if not options.skip_ontogum:
if options.no_pepper:
sys.__stdout__.write("\ni Not adding entity information to UD parses in OntoGUM version since Pepper conversion was skipped\n")
add_entities_to_conllu(gum_target,reddit=reddit,ontogum=True)
else:
add_entities_to_conllu(gum_target,reddit=reddit,ontogum=True)
add_bridging_to_conllu(gum_target,reddit=reddit,corpus=corpus_name)
if not options.discourse_only:
add_entities_to_conllu(gum_target, reddit=reddit, ontogum=False, conllua_data=conllua_data, salience_data=salience_data)
if not options.skip_ontogum:
if options.no_pepper:
sys.__stdout__.write("\ni Not adding entity information to UD parses in OntoGUM version since Pepper conversion was skipped\n")
add_entities_to_conllu(gum_target,reddit=reddit,ontogum=True)
else:
add_entities_to_conllu(gum_target,reddit=reddit,ontogum=True)
add_bridging_to_conllu(gum_target,reddit=reddit,corpus=corpus_name)

sys.__stdout__.write("\no Added entities, coreference and bridging to UD parses\n")
sys.__stdout__.write("\no Added entities, coreference and bridging to UD parses\n")

add_rsd_to_conllu(gum_target,reddit=reddit,output_signals=not options.no_signals,output_secedges=not options.no_secedges)
if not options.skip_ontogum:
add_rsd_to_conllu(gum_target,reddit=reddit,ontogum=True,output_signals=not options.no_signals,output_secedges=not options.no_secedges)
add_xml_to_conllu(gum_target,reddit=reddit,corpus=corpus_name)
add_rsd_and_pdtb_to_conllu(gum_target,reddit=reddit,output_signals=not options.no_signals,output_secedges=not options.no_secedges)
if not options.skip_ontogum:
add_xml_to_conllu(gum_target,reddit=reddit,ontogum=True,corpus=corpus_name)

sys.__stdout__.write("\no Added discourse relations and XML tags to UD parses\n")

make_disrpt(conn_data,reddit=reddit,corpus="gum")
add_rsd_and_pdtb_to_conllu(gum_target,reddit=reddit,ontogum=True,output_signals=not options.no_signals,output_secedges=not options.no_secedges)

sys.__stdout__.write("\no Created DISRPT shared task discourse relation formats in target rst/disrpt/\n")
if not options.discourse_only:
add_xml_to_conllu(gum_target,reddit=reddit,corpus=corpus_name)
if not options.skip_ontogum:
add_xml_to_conllu(gum_target,reddit=reddit,ontogum=True,corpus=corpus_name)
sys.__stdout__.write("\no Added eRST + PDTB discourse relations and XML tags to UD parses\n")
else:
sys.__stdout__.write("\no Added eRST + PDTB discourse relations to UD parses\n")

## Step 5: Refresh automatic portion of non-DM signals in RST files

sys.__stdout__.write("\no Adding fresh non-DM signals to RST files:\n" + "=" * 37 + "\n")
update_non_dm_signals(gum_source, gum_target, reddit=reddit)

# TODO:
# remove duplicate call to add_rsd_and_pdtb_to_conllu, currently needed because it is both an input of and possibly
# modified by output of update_non_dm_signals
add_rsd_and_pdtb_to_conllu(gum_target,reddit=reddit,output_signals=not options.no_signals,output_secedges=not options.no_secedges)
if not options.skip_ontogum:
add_rsd_and_pdtb_to_conllu(gum_target,reddit=reddit,ontogum=True,output_signals=not options.no_signals,output_secedges=not options.no_secedges)

# TODO: also add PDTB framework rels output to DISRPT outs
make_disrpt(conn_data,reddit=reddit,corpus=corpus_name.lower(),outmode=options.disrpt_outmode)

sys.__stdout__.write("\no Created DISRPT shared task discourse relation formats in target rst/disrpt/\n")
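
Several hunks above replace a hardcoded "GUM" path segment with `corpus_name` so the same build can emit other corpora (e.g. GENTLE). A sketch of that parameterized output-directory pattern, using os.path.join instead of manual os.sep concatenation (a hypothetical helper for illustration, not a function in build_gum.py):

```python
import os
import tempfile

def pepper_out_dir(pepper_tmp, out_dir_name, corpus_name):
    """Per-corpus Pepper output directory, parameterized on the corpus name
    rather than hardcoding "GUM" (hypothetical helper)."""
    path = os.path.join(pepper_tmp, out_dir_name, corpus_name)
    os.makedirs(path, exist_ok=True)  # idempotent, unlike bare os.makedirs
    return path

base = tempfile.mkdtemp()
print(pepper_out_dir(base, "xml", "GENTLE"))  # ...xml/GENTLE
```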

2 changes: 1 addition & 1 deletion _build/src/const/GUM_academic_census.ptb
@@ -7,7 +7,7 @@
(NN university)
(NN faculty))
(VP
(VB play)
(VBP play)
(NP (DT a) (JJ special) (NN role))
(PP
(IN in)
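
Several of the constituent fixes in this commit, like the one above, change VB to VBP where a present-tense verb agrees with a plural subject. A stdlib sketch for pulling (tag, token) leaves out of such bracketed PTB trees (a regex over flat leaves only, not a full tree parser):

```python
import re

def ptb_leaves(tree_str):
    """Return (tag, token) pairs for PTB leaves, which have the shape
    "(TAG token)" with no nested parentheses inside."""
    return re.findall(r"\(([^()\s]+)\s+([^()\s]+)\)", tree_str)

tree = "(VP (VBP play) (NP (DT a) (JJ special) (NN role)))"
print(ptb_leaves(tree))  # [('VBP', 'play'), ('DT', 'a'), ('JJ', 'special'), ('NN', 'role')]
```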
112 changes: 57 additions & 55 deletions _build/src/const/GUM_academic_epistemic.ptb
@@ -638,67 +638,69 @@
(PP (IN of) (NP (NN reliability))))))))))))
(. .)))

(ROOT
(S
(SBAR
(WHADVP (WRB When))
(S
(NP (DT the) (NN deceit))
(VP (VBD was) (VP (VBN uncovered)))))
(, ,)
(ADVP (RB however))
(, ,)
(NP (DT the) (NN effect))
(VP (VBD was) (NP (RB just) (DT the) (NN opposite)))
(: :)))

(ROOT
(S
(S
(NP
(NP (DT the) (NN legitimacy))
(PP (IN of) (NP (DT the) (VBN published) (NNS findings))))
(VP
(VBD was)
(RB not)
(VP
(VBN enhanced)
(PP
(IN through)
(NP
(NP (PRP$ their) (NN publication))
(PP (IN by) (NP (NNP Elsevier))))))))
(, ,)
(CC but)
(ADVP (RB rather))
(SBAR
(WHADVP (WRB When))
(S
(NP (DT the) (NN deceit))
(VP (VBD was) (VP (VBN uncovered)))))
(, ,)
(ADVP (RB however))
(, ,)
(NP (DT the) (NN effect))
(VP (VBD was) (NP (RB just) (DT the) (NN opposite))))
(: :)
(S
(NP
(NP (DT the) (NN legitimacy))
(PP
(IN of)
(NP
(NP (NP (NNP Elsevier) (POS ’s)) (NNS publications))
(: —)
(CC and)
(PRN (, ,) (PP (IN by) (NP (NN extension))) (, ,))
(NP (DT all) (JJ academic) (NNS journals))
(: —))))
(VP
(VBD was)
(S
(NP
(NP (DT the) (NN legitimacy))
(PP (IN of) (NP (DT the) (VBN published) (NNS findings))))
(VP
(VBN diminished)
(VBD was)
(RB not)
(VP
(VBN enhanced)
(PP
(IN through)
(NP
(NP (PRP$ their) (NN publication))
(PP (IN by) (NP (NNP Elsevier))))))))
(, ,)
(CC but)
(ADVP (RB rather))
(S
(NP
(NP (DT the) (NN legitimacy))
(PP
(IN through)
(IN of)
(NP
(NP (PRP$ their) (NN dissemination))
(PP
(IN of)
(NP
(ADJP
(ADJP (JJ deceptive))
(CC and)
(ADJP (RB commercially) (JJ interested)))
(NN research))))))))
(NP (NP (NNP Elsevier) (POS ’s)) (NNS publications))
(PRN
(: —)
(CC and)
(, ,)
(PP (IN by) (NP (NN extension)))
(, ,)
(NP (DT all) (JJ academic) (NNS journals))
(: —)))))
(VP
(VBD was)
(VP
(VBN diminished)
(PP
(IN through)
(NP
(NP (PRP$ their) (NN dissemination))
(PP
(IN of)
(NP
(ADJP
(ADJP (JJ deceptive))
(CC and)
(ADJP (RB commercially) (JJ interested)))
(NN research)))))))))
(. .)))

(ROOT
@@ -825,7 +827,7 @@
(VB pay)
(NP (DT a) (NN publication) (NN fee))))))))))))
(. .)
(LS 8)))
(CD 8)))

(ROOT
(S
2 changes: 1 addition & 1 deletion _build/src/const/GUM_academic_games.ptb
@@ -1020,7 +1020,7 @@
(, ,)
(CC and)
(VP
(VB realize)
(VBP realize)
(NP
(NP (NNS improvements))
(PP
2 changes: 1 addition & 1 deletion _build/src/const/GUM_academic_theropod.ptb
@@ -1325,7 +1325,7 @@
(NP (DT the) (NNS results))
(VP (VBD reported) (ADVP (RB here))))
(VP
(VB show)
(VBP show)
(SBAR
(IN that)
(S
4 changes: 2 additions & 2 deletions _build/src/const/GUM_bio_enfant.ptb
@@ -573,7 +573,7 @@
(VP
(VBG redesigning)
(NP
(NP (DT the) (NN City) (NN Hall))
(NP (DT the) (NNP City) (NNP Hall))
(PP (IN in) (NP (NNP New) (NNP York))))
(PP
(IN for)
@@ -632,7 +632,7 @@
(PP (IN to) (NP (CD 1801)))))
(ADVP (RB now))
(VP
(VB reside)
(VBP reside)
(PP (IN in) (NP (DT the) (NNP National) (NNPS Archives)))))
(. .)))

