
Commit

Merge pull request #31 from amir-zeldes/dev
V3.2.0
amir-zeldes authored Feb 2, 2018
2 parents e1aaede + b8da1c1 commit e39c2d0
Showing 2,710 changed files with 1,780,800 additions and 1,765,225 deletions.
2 changes: 1 addition & 1 deletion LICENSE.txt
Original file line number Diff line number Diff line change
@@ -56,7 +56,7 @@ https://corpling.uis.georgetown.edu/gum/

For scholarly work referencing the corpus, please cite this paper:

Zeldes, Amir (2016) "The GUM Corpus: Creating Multilayer Resources in the Classroom". Language Resources and Evaluation.
Zeldes, Amir (2017) "The GUM Corpus: Creating Multilayer Resources in the Classroom". Language Resources and Evaluation 51(3), 581–612.

The rest of this document lists the full text of the respective licenses.

19 changes: 14 additions & 5 deletions README.md
@@ -3,14 +3,23 @@ Repository for the Georgetown University Multilayer Corpus (GUM)

This repository contains release versions of the Georgetown University Multilayer Corpus (GUM), a corpus of English texts from four text types (interviews, news, travel guides, instructional texts). The corpus is created as part of the course LING-367 (Computational Corpus Linguistics) at Georgetown University. For more details see: http://corpling.uis.georgetown.edu/gum.

## Citing
To cite this corpus, please refer to the following article:

Zeldes, Amir (2017) "The GUM Corpus: Creating Multilayer Resources in the Classroom". Language Resources and Evaluation 51(3), 581–612.

## Directories
The corpus is downloadable in multiple formats. Not all formats contain all annotations. The most complete XML representation is in PAULA XML, and the easiest way to search the corpus is using ANNIS. Other formats may be useful for other purposes. See the website for more details.

* _build/ - The GUM build bot and utilities for data merging and validation (see https://corpling.uis.georgetown.edu/gum/build.html)
* annis/ - The entire merged corpus, with all annotations, as a relANNIS 3.3 corpus dump, importable into ANNIS (see http://corpus-tools.org/annis)
* _build/ - The [GUM build bot](https://corpling.uis.georgetown.edu/gum/build.html) and utilities for data merging and validation
* annis/ - The entire merged corpus, with all annotations, as a relANNIS 3.3 corpus dump, importable into [ANNIS](http://corpus-tools.org/annis)
* const/ - Constituent trees and PTB POS tags in the PTB bracketing format (automatic parser output)
* coref/ - Entity and coreference annotation in two formats: conll shared task tabular format (with no bridging annotations) and WebAnno .tsv format, including entity and information status annotations, bridging and singleton entities
* dep/ - Dependency trees in the conll 10 column format using Stanford Typed Dependencies (manually corrected) and extended PTB POS tags (following TreeTagger/Amalgam, e.g. tags like VVZ), as well as speaker and sentence type annotations
* paula/ - The entire merged corpus in PAULA standoff XML, with all annotations (see https://www.sfb632.uni-potsdam.de/en/paula.html for format documentation)
* coref/ - Entity and coreference annotation in two formats:
* conll/ - CoNLL shared task tabular format (with no bridging annotations)
* tsv/ - WebAnno .tsv format, including entity and information status annotations, bridging and singleton entities
* dep/ - Dependency trees of two kinds:
* stanford/ - Original Stanford Typed Dependencies (manually corrected) in the CoNLLX 10 column format with extended PTB POS tags (following TreeTagger/Amalgam, e.g. tags like VVZ), as well as speaker and sentence type annotations
* ud/ - Universal Dependencies data, automatically converted from the Stanford Typed Dependency data, enriched with automatic morphological tags and Universal POS tags according to the UD standard
* paula/ - The entire merged corpus in standoff [PAULA XML](https://www.sfb632.uni-potsdam.de/en/paula.html), with all annotations
* rst/ - Rhetorical Structure Theory analyses in .rs3 format as used by RSTTool and rstWeb (spaces between words correspond to tokenization in rest of corpus)
* xml/ - Vertical XML representations with 1 token or tag per line and tab-delimited lemmas and POS tags (extended, VVZ style, vanilla and CLAWS5, as well as dependency functions), compatible with the IMS Corpus Workbench (a.k.a. TreeTagger format).
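The dep/ files described above use the 10-column CoNLL-X layout (ID, FORM, LEMMA, CPOS, POS, FEATS, HEAD, DEPREL, PHEAD, PDEPREL). As a minimal sketch of reading that layout — `read_conllx` and the sample string are invented here for illustration and assume tab-separated columns, not part of the repository:

```python
def read_conllx(text):
    """Split a CoNLL-X string into sentences of token dicts."""
    sentences = []
    for block in text.strip().split("\n\n"):
        tokens = []
        for line in block.splitlines():
            if not line.strip() or line.startswith("#"):
                continue  # skip blanks and comment lines (e.g. speaker/s_type)
            cols = line.split("\t")
            tokens.append({
                "id": int(cols[0]),      # 1-based token index
                "form": cols[1],         # surface form
                "pos": cols[4],          # PTB tag column
                "head": int(cols[6]),    # 0 = artificial root
                "deprel": cols[7],       # dependency function
            })
        sentences.append(tokens)
    return sentences

# A made-up three-token sentence in the same column layout
sample = ("1\tNick\t_\tNP\tNNP\t_\t2\tnn\t_\t_\n"
          "2\tBos\t_\tNP\tNNP\t_\t3\tnsubj\t_\t_\n"
          "3\ttells\t_\tVVZ\tVBZ\t_\t0\troot\t_\t_")
sents = read_conllx(sample)
```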
File renamed without changes.
55 changes: 37 additions & 18 deletions _build/build_gum.py
@@ -1,12 +1,18 @@
#!/usr/bin/python
# -*- coding: utf-8 -*-

import os, shutil, sys
import os, shutil, sys, io
from glob import glob
from argparse import ArgumentParser
from utils.pepper_runner import run_pepper
import datetime

if sys.platform == "win32": # Print \n new lines in Windows
    import os, msvcrt
    msvcrt.setmode(sys.stdout.fileno(), os.O_BINARY)

PY2 = sys.version_info[0] < 3

def setup_directories(gum_source, gum_target):
    if not os.path.exists(gum_source):
        raise IOError("Source file directory " + gum_source + " not found.")
@@ -49,16 +55,17 @@ def setup_directories(gum_source, gum_target):
# validate input for further steps
from utils.validate import validate_src

print "="*20
print "Validating files..."
print "="*20 + "\n"
print("="*20)
print("Validating files...")
print("="*20 + "\n")

validate_src(gum_source)

######################################
## Step 2: propagate annotations
######################################
from utils.propagate import enrich_dep, enrich_xml, const_parse
from utils.stanford2ud import create_ud
from utils.repair_tsv import fix_tsv
from utils.repair_rst import fix_rst

@@ -70,17 +77,17 @@ def setup_directories(gum_source, gum_target):
# * fresh token strings, POS tags and lemmas from xml/
# * generates vanilla tags in CPOS column from POS
# * creates speaker and s_type comments from xml/
print "\nEnriching Dependencies:\n" + "="*23
print("\nEnriching Dependencies:\n" + "="*23)
enrich_dep(gum_source, gum_target)

# Add annotations to xml/:
# * add CLAWS tags in fourth column
# * add fifth column after lemma containing tok_func from dep/
print "\nEnriching XML files:\n" + "="*23
print("\nEnriching XML files:\n" + "="*23)
enrich_xml(gum_source, gum_target, options.claws)

# Token and sentence border adjustments
print "\nAdjusting token and sentence borders:\n" + "="*40
print("\nAdjusting token and sentence borders:\n" + "="*37)
# Adjust tsv/ files:
# * refresh and re-merge token strings in case they were mangled by WebAnno
# * adjust sentence borders to match xml/ <s>-tags
@@ -94,22 +101,34 @@ def setup_directories(gum_source, gum_target):
# Create fresh constituent parses in const/ if desired
# (either reparse or use dep2const conversion, e.g. https://github.com/ikekonglp/PAD)
if options.parse:
    print "\nRegenerating constituent trees:\n" + "="*30
    print("\nRegenerating constituent trees:\n" + "="*30)
    const_parse(gum_source,gum_target)
else:
    print "\ni Skipping fresh parse for const/"
    sys.stdout.write("\ni Skipping fresh parse for const/\n")
    if not os.path.exists(gum_target + "const"):
        print "x const/ directory missing in target but parsing was set to false! Aborting..."
        sys.stdout.write("x const/ directory missing in target but parsing was set to false! Aborting...\n")
        sys.exit()
    elif len(glob(gum_target + "const" + os.sep + "*.ptb")) != len(glob(gum_target + "xml" + os.sep + "*.xml")):
        print "x parsing was set to false but xml/ and const/ contain different amounts of files! Aborting..."
        sys.stdout.write("x parsing was set to false but xml/ and const/ contain different amounts of files! Aborting...\n")
        sys.exit()

# Create Universal Dependencies version
# * UD files will be created in <target>/dep/ud/
# * UD punctuation guidelines are enforced using udapi, which must be installed to work
# * udapi does not support Python 2, meaning punctuation will be attached to the root if using Python 2
# * UD morphology generation relies on parses already existing in <target>/const/
print("\nCreating Universal Dependencies version:\n" + "=" * 40)
if PY2:
    print("WARN: Running on Python 2 - consider upgrading to Python 3. ")
    print(" Punctuation behavior in the UD conversion relies on udapi ")
    print(" which does not support Python 2. All punctuation will be attached to sentence roots.\n")
create_ud(gum_target)

## Step 3: merge and convert source formats to target formats
if options.no_pepper:
    print "\ni Skipping Pepper conversion"
    sys.__stdout__.write("\ni Skipping Pepper conversion\n")
else:
    print "\nStarting pepper conversion:\n" + "="*30
    sys.__stdout__.write("\nStarting pepper conversion:\n" + "="*30 + "\n")

# Create Pepper staging area in utils/pepper/tmp/
pepper_home = "utils" + os.sep + "pepper" + os.sep
@@ -128,9 +147,9 @@ def setup_directories(gum_source, gum_target):
pepper_tmp = pepper_home + "tmp" + os.sep

try:
    pepper_params = open("utils" + os.sep + "pepper" + os.sep + "merge_gum.pepperparams").read().replace("\r","")
    pepper_params = io.open("utils" + os.sep + "pepper" + os.sep + "merge_gum.pepperparams", encoding="utf8").read().replace("\r","")
except:
    print "x Can't find pepper template at: "+"utils" + os.sep + "pepper" + os.sep + "merge_gum.pepperparams"+"\n Aborting..."
    sys.__stdout__.write("x Can't find pepper template at: "+"utils" + os.sep + "pepper" + os.sep + "merge_gum.pepperparams"+"\n Aborting...")
    sys.exit()

# Inject gum_target in pepper_params and replace os.sep with URI slash
@@ -140,12 +159,12 @@ def setup_directories(gum_source, gum_target):

# Setup metadata file
build_date = datetime.datetime.now().date().isoformat()
meta = open(pepper_home + "meta_template.meta").read().replace("\r","")
meta = io.open(pepper_home + "meta_template.meta", encoding="utf8").read().replace("\r","")
meta = meta.replace("**gum_version**",options.increment_version)
meta = meta.replace("**build_date**",build_date)
meta_out = open(pepper_tmp + "xml" + os.sep + "GUM" + os.sep + "GUM.meta",'w')
meta_out = io.open(pepper_tmp + "xml" + os.sep + "GUM" + os.sep + "GUM.meta",'w')
meta_out.write(meta)
meta_out.close()

out = run_pepper(pepper_params,options.verbose_pepper)
print out
sys.__stdout__.write(out + "\n")
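The script's switch from the bare `open` builtin to `io.open(..., encoding="utf8")` is the standard pattern for getting identical text-mode, explicitly decoded reads on both Python 2 and Python 3. A minimal sketch of that pattern — the `read_text` helper is illustrative, not part of the repository:

```python
import io
import sys

# On Py2 the builtin open returns bytes; io.open returns unicode text with
# an explicit encoding on both interpreter versions.
PY2 = sys.version_info[0] < 3

def read_text(path):
    # Read a UTF-8 file in text mode and drop any stray carriage returns,
    # as the build script does for its .pepperparams and .meta templates.
    with io.open(path, encoding="utf8") as f:
        return f.read().replace("\r", "")
```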
18 changes: 9 additions & 9 deletions _build/src/dep/GUM_interview_ants.conll10
@@ -2,7 +2,7 @@
2 Nick _ NP NNP _ 3 nn _ _
3 Bos _ NP NNP _ 4 nsubj _ _
4 tells _ VVZ VBZ _ 0 root _ _
5 Wikinews _ NP NNP _ 4 dobj _ _
5 Wikinews _ NP NNP _ 4 iobj _ _
6 about _ IN IN _ 4 prep _ _
7 ' _ `` `` _ 0 punct _ _
8 self-medicating _ VVG VBG _ 10 amod _ _
@@ -11,10 +11,10 @@

1 Tuesday _ NP NNP _ 0 root _ _
2 , _ , , _ 0 punct _ _
3 September _ NP NNP _ 4 nn _ _
3 September _ NP NNP _ 4 tmod _ _
4 1 _ CD CD _ 1 appos _ _
5 , _ , , _ 0 punct _ _
6 2015 _ CD CD _ 3 tmod _ _
6 2015 _ CD CD _ 4 tmod _ _

1 Formica _ FW FW _ 2 nn _ _
2 fusca _ FW FW _ 0 root _ _
@@ -192,13 +192,13 @@
15 their _ PP$ PRP$ _ 16 poss _ _
16 intake _ NN NN _ 14 dobj _ _
17 depending _ VVG VBG _ 14 prep _ _
18 upon _ IN IN _ 17 pcomp _ _
18 upon _ IN IN _ 17 mwe _ _
19 how _ WRB WRB _ 20 advmod _ _
20 high _ JJ JJ _ 24 advmod _ _
20 high _ JJ JJ _ 17 pcomp _ _
21 the _ DT DT _ 23 det _ _
22 peroxide _ NN NN _ 23 nn _ _
23 concentration _ NN NN _ 24 nsubj _ _
24 was _ VBD VBD _ 18 pcomp _ _
23 concentration _ NN NN _ 20 nsubj _ _
24 was _ VBD VBD _ 20 cop _ _
25 . _ SENT . _ 0 punct _ _

1 In _ IN IN _ 8 prep _ _
@@ -431,7 +431,7 @@
6 the _ DT DT _ 7 det _ _
7 Centre _ NP NNP _ 5 pobj _ _
8 for _ IN IN _ 7 prep _ _
9 Social _ NP NNP _ 10 nn _ _
9 Social _ NP NNP _ 10 amod _ _
10 Evolution _ NP NNP _ 8 pobj _ _
11 at _ IN IN _ 7 prep _ _
12 the _ DT DT _ 13 det _ _
@@ -835,7 +835,7 @@
51 when _ WRB WRB _ 52 advmod _ _
52 given _ VVN VBN _ 56 advcl _ _
53 the _ DT DT _ 54 det _ _
54 choice _ NN NN _ 52 pobj _ _
54 choice _ NN NN _ 52 dobj _ _
55 , _ , , _ 0 punct _ _
56 ignore _ VVP VBP _ 38 advcl _ _
57 that _ DT DT _ 58 det _ _
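Changes like those above reattach heads in the manually corrected Stanford dependencies. One quick sanity check for such edits is a cycle test over the head column; the `has_cycle` helper below is invented here for illustration, not part of the repository. Note that in these files punctuation attaches to the artificial node 0 alongside the root, so several 0-headed tokens per sentence are normal — what must never occur is a loop among head links.

```python
def has_cycle(heads):
    """heads maps token id -> head id for one sentence; id 0 is the
    artificial root. Returns True if following head links ever loops."""
    for start in heads:
        seen = set()
        node = start
        while node != 0:
            if node in seen:
                return True  # revisited a token before reaching the root
            seen.add(node)
            node = heads[node]
    return False
```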
22 changes: 11 additions & 11 deletions _build/src/dep/GUM_interview_brotherhood.conll10
@@ -10,10 +10,10 @@

1 Wednesday _ NP NNP _ 0 root _ _
2 , _ , , _ 0 punct _ _
3 October _ NP NNP _ 4 nn _ _
3 October _ NP NNP _ 4 tmod _ _
4 9 _ CD CD _ 1 appos _ _
5 , _ , , _ 0 punct _ _
6 2013 _ CD CD _ 3 tmod _ _
6 2013 _ CD CD _ 4 tmod _ _

1 October _ NP NNP _ 5 nsubj _ _
2 is _ VBZ VBZ _ 5 cop _ _
@@ -79,7 +79,7 @@
4 . _ SENT . _ 0 punct _ _

1 I _ PP PRP _ 3 nsubj _ _
2 am _ VBP VBP _ 3 root _ _
2 am _ VBP VBP _ 3 cop _ _
3 61 _ CD CD _ 0 root _ _
4 now _ RB RB _ 3 advmod _ _
5 so _ IN IN _ 8 mark _ _
@@ -162,7 +162,7 @@
22 which _ WDT WDT _ 25 dobj _ _
23 I _ PP PRP _ 25 nsubj _ _
24 still _ RB RB _ 25 advmod _ _
25 own _ JJ JJ _ 21 amod _ _
25 own _ VVP VBP _ 21 rcmod _ _
26 today _ NN NN _ 25 advmod _ _
27 - _ : : _ 0 punct _ _
28 " _ `` `` _ 0 punct _ _
@@ -301,7 +301,7 @@
24 young _ JJ JJ _ 25 amod _ _
25 magicians _ NNS NNS _ 23 pobj _ _
26 to _ TO TO _ 27 aux _ _
27 think _ VV VB _ 20 vmod _ _
27 think _ VV VB _ 22 vmod _ _
28 because _ IN IN _ 31 mark _ _
29 so _ RB RB _ 30 advmod _ _
30 many _ JJ JJ _ 31 nsubj _ _
@@ -453,7 +453,7 @@
48 Ring _ NP NNP _ 49 nn _ _
49 Convention _ NP NNP _ 20 conj _ _
50 in _ IN IN _ 49 prep _ _
51 Buxton _ NP NNP _ 50 nn _ _
51 Buxton _ NP NNP _ 50 pobj _ _
52 , _ , , _ 0 punct _ _
53 England _ NP NNP _ 51 appos _ _
54 and _ CC CC _ 20 cc _ _
@@ -520,11 +520,11 @@
54 April _ NP NNP _ 53 pobj _ _
55 . _ SENT . _ 0 punct _ _

1 That that DT _ _ 3 nsubj _ _
2 's be VBZ _ _ 3 cop _ _
3 as as IN _ _ 0 root _ _
4 of of IN _ _ 3 pcomp _ _
5 now now RB _ _ 4 pobj _ _
1 That that DT _ _ 2 nsubj _ _
2 's be VBZ _ _ 0 root _ _
3 as as IN _ _ 2 prep _ _
4 of of IN _ _ 3 mwe _ _
5 now now RB _ _ 3 pobj _ _
6 . . SENT _ _ 0 punct _ _

1 File _ NN NN _ 2 nn _ _
