Skip to content

Commit

Permalink
refactor finished, seems to work now
Browse files Browse the repository at this point in the history
  • Loading branch information
brendano committed May 15, 2015
1 parent ae14bf9 commit e1ce442
Show file tree
Hide file tree
Showing 8 changed files with 58 additions and 38 deletions.
7 changes: 4 additions & 3 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,14 @@
set -eux
cd $(dirname $0)/stanford_corenlp_pywrapper

rm -rf _build lib/piperunner.jar
jarfile=lib/corenlpwrapper.jar
rm -rf _build $jarfile
mkdir _build

CORENLP_JAR=/home/sw/corenlp/stanford-corenlp-full-2015-04-20/stanford-corenlp-3.5.2.jar

javac -source 7 -target 7 -d _build -cp "$(print lib/*.jar | tr ' ' ':')":$CORENLP_JAR javasrc/**/*.java
(cd _build && jar cf ../lib/piperunner.jar .)
ls -l lib/piperunner.jar
(cd _build && jar cf ../$jarfile .)
ls -l $jarfile

rm -rf _build
2 changes: 1 addition & 1 deletion proc_doc_lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
mode = sys.argv[1]

import stanford_corenlp_pywrapper.sockwrap as sw
ss = sw.SockWrap(mode)
ss = sw.SockWrap(mode) # need to override corenlp_jars

for line in sys.stdin:
text = line.rstrip("\n").decode('utf8','replace')
Expand Down
2 changes: 1 addition & 1 deletion proc_text_files.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
mode = sys.argv[1]

import stanford_corenlp_pywrapper.sockwrap as sw
ss = sw.SockWrap(mode)
ss = sw.SockWrap(mode) # need to override corenlp_jars

for filename in sys.argv[2:]:
outfile = re.sub(r'\.txt$',"", filename) + ".anno"
Expand Down
4 changes: 2 additions & 2 deletions proc_text_files_to_stdout.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@
python proc_text_files_to_stdout.py pos *.txt > allpos.anno
"""

import sys, re, os, json
import sys, re, os
mode = sys.argv[1]

import stanford_corenlp_pywrapper.sockwrap as sw
ss = sw.SockWrap(mode)
ss = sw.SockWrap(mode) # need to override corenlp_jars

for filename in sys.argv[2:]:
docid = os.path.basename(filename)
Expand Down
43 changes: 30 additions & 13 deletions stanford_corenlp_pywrapper/javasrc/corenlp/JsonPipeline.java
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@
import edu.stanford.nlp.semgraph.SemanticGraphEdge;

/**
* A wrapper around a CoreNLP Pipeline object that knows how to turn output annotations into JSON.
* A wrapper around a CoreNLP Pipeline object that knows how to turn output annotations into JSON,
* with 0-oriented indexing conventions.
*
* TODO: no coref yet, will be an 'entities' key in the document's json object.
*/
Expand All @@ -54,11 +55,6 @@ public class JsonPipeline {
/** No-argument constructor; the body is empty, so no setup is performed here. */
public JsonPipeline() {
}

static enum InputFormat {
DETECT_JSON_VARIANT,
RAW_TEXT
};

static void addTokenBasics(Map<String,Object> sent_info, CoreMap sentence) {
List<List<Integer>> tokenSpans = Lists.newArrayList();
List<String> tokenTexts = Lists.newArrayList();
Expand All @@ -70,6 +66,7 @@ static void addTokenBasics(Map<String,Object> sent_info, CoreMap sentence) {
sent_info.put("tokens", (Object) tokenTexts);
sent_info.put("char_offsets", (Object) tokenSpans);
}

@SuppressWarnings({ "rawtypes", "unchecked" })
static void addTokenAnno(Map<String,Object> sent_info, CoreMap sentence,
String keyname, Class annoClass) {
Expand All @@ -79,21 +76,25 @@ static void addTokenAnno(Map<String,Object> sent_info, CoreMap sentence,
}
sent_info.put(keyname, (Object) tokenAnnos);
}

/**
 * Adds the sentence's parse tree to the sentence's JSON map.
 *
 * The value of the sentence's TreeAnnotation is rendered with toString() and
 * stored under the "parse" key.
 *
 * @param sent_info map of output fields for this sentence; modified in place
 * @param sentence  the annotated sentence to read the tree from
 */
static void addParseTree(Map<String,Object> sent_info, CoreMap sentence) {
    String treeText = sentence.get(TreeCoreAnnotations.TreeAnnotation.class).toString();
    sent_info.put("parse", treeText);
}

/**
 * Adds the sentence's CollapsedCCProcessedDependenciesAnnotation graph to the
 * sentence's JSON map under the "deps_cc" key, after converting it with
 * jsonFriendlyDeps().
 *
 * @param sent_info map of output fields for this sentence; modified in place
 * @param sentence  the annotated sentence to read the dependency graph from
 */
@SuppressWarnings("rawtypes")
static void addDepsCC(Map<String,Object> sent_info, CoreMap sentence) {
    SemanticGraph ccGraph = sentence.get(CollapsedCCProcessedDependenciesAnnotation.class);
    sent_info.put("deps_cc", jsonFriendlyDeps(ccGraph));
}

/**
 * Adds the sentence's BasicDependenciesAnnotation graph to the sentence's JSON
 * map under the "deps_basic" key, after converting it with jsonFriendlyDeps().
 *
 * @param sent_info map of output fields for this sentence; modified in place
 * @param sentence  the annotated sentence to read the dependency graph from
 */
@SuppressWarnings("rawtypes")
static void addDepsBasic(Map<String,Object> sent_info, CoreMap sentence) {
    SemanticGraph basicGraph = sentence.get(BasicDependenciesAnnotation.class);
    sent_info.put("deps_basic", jsonFriendlyDeps(basicGraph));
}

@SuppressWarnings({ "rawtypes", "unchecked" })
static List jsonFriendlyDeps(SemanticGraph dependencies) {
List deps = new ArrayList();
Expand Down Expand Up @@ -135,18 +136,25 @@ void initializeCorenlpPipeline() {
/** annotator is a stanford corenlp notion. */
void addAnnoToSentenceObject(Map<String,Object> sent_info, CoreMap sentence, String annotator) {
switch(annotator) {
case "tokens":
break;
case "tokenize":
case "cleanxml":
break;
case "ssplit":
break;
case "pos":
addTokenAnno(sent_info,sentence, "pos", PartOfSpeechAnnotation.class);
break;
case "lemmas":
case "lemma":
addTokenAnno(sent_info,sentence, "lemmas", LemmaAnnotation.class);
break;
case "ner":
addTokenAnno(sent_info, sentence, "ner", NamedEntityTagAnnotation.class);
addTokenAnno(sent_info, sentence, "normner", NormalizedNamedEntityTagAnnotation.class);
break;
case "regexner":
addTokenAnno(sent_info, sentence, "ner", NamedEntityTagAnnotation.class);
break;
case "sentiment": throw new RuntimeException("TODO");
case "truecase": throw new RuntimeException("TODO");
case "parse":
addParseTree(sent_info,sentence);
addDepsCC(sent_info,sentence);
Expand All @@ -156,9 +164,14 @@ void addAnnoToSentenceObject(Map<String,Object> sent_info, CoreMap sentence, Str
addDepsCC(sent_info,sentence);
addDepsBasic(sent_info,sentence);
break;
case "ner":
addTokenAnno(sent_info, sentence, "ner", NamedEntityTagAnnotation.class);
addTokenAnno(sent_info, sentence, "normner", NormalizedNamedEntityTagAnnotation.class);
case "dcoref":
// TODO
break;
case "relation": throw new RuntimeException("TODO");
case "natlog": throw new RuntimeException("TODO");
case "quote": throw new RuntimeException("TODO");
case "entitymentions":
// TODO
break;
default:
throw new RuntimeException("don't know how to handle annotator " + annotator);
Expand All @@ -167,8 +180,12 @@ void addAnnoToSentenceObject(Map<String,Object> sent_info, CoreMap sentence, Str

/**
 * Returns the annotator names listed in the 'annotators' property, in order.
 *
 * The property value is trimmed and then split on runs of commas and/or
 * whitespace, so "tokenize,ssplit", "tokenize, ssplit", "tokenize ,ssplit" and
 * "tokenize ssplit" all yield the same two names. Splitting only on a comma
 * followed by optional whitespace would leave space-padded or merged names,
 * which the per-annotator switch then rejects as unknown.
 * NOTE(review): CoreNLP itself is understood to accept comma- or
 * whitespace-separated annotator lists -- confirm against its documentation.
 *
 * @return the individual annotator names
 * @throws RuntimeException if the 'annotators' property is missing or blank
 */
String[] annotators() {
    String annotatorsAllstr = (String) props.get("annotators");
    String trimmed = (annotatorsAllstr == null) ? "" : annotatorsAllstr.trim();
    if (trimmed.isEmpty()) {
        throw new RuntimeException("'annotators' property seems to not be set");
    }
    return trimmed.split("[,\\s]+");
}

/** runs the corenlp pipeline with all options, and returns all results as a JSON object. */
@SuppressWarnings({ "rawtypes", "unchecked" })
JsonNode processTextDocument(String doctext) {
Expand Down
14 changes: 10 additions & 4 deletions stanford_corenlp_pywrapper/javasrc/corenlp/PipeRunner.java
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import util.BasicFileIO;
import util.JsonUtil;
import util.U;
import corenlp.JsonPipeline.InputFormat;

/**
* stdin/stdout commandline pipe mode that lightly wraps JsonPipeline.
*
Expand All @@ -21,6 +21,11 @@
public class PipeRunner {
ProcessingMode mode;
JsonPipeline parse;

/**
 * How input to the runner should be interpreted.
 * NOTE(review): presumably this is the type of the inputFormat value passed to
 * runStdinStdout() -- confirm against that method's signature.
 */
static enum InputFormat {
// presumably: input is JSON, with the specific variant auto-detected -- TODO confirm
DETECT_JSON_VARIANT,
// presumably: input is plain, unstructured text -- TODO confirm
RAW_TEXT
};

/** the pre-baked processing modes, that define annotators and outputs. */
static enum ProcessingMode {
Expand Down Expand Up @@ -114,16 +119,17 @@ public static void main(String[] args) {
else { throw new RuntimeException("bad flag: " + flag); }
}

PipeRunner runner = new PipeRunner();
// Parse runner = new Parse();

throw new RuntimeException("TODO need to handle mode parsing; in the meantime this is broken");

// PipeRunner runner = new PipeRunner();
// String _mode = args[0];
// ProcessingMode mode = modeFromString(_mode);
// if (runner.mode==null) {
// U.pf("Bad mode '%s' ... to disable a mode, use 'nomode'\n", _mode);
// usage();
// }
runner.runStdinStdout(inputFormat);
// runner.runStdinStdout(inputFormat);
}


Expand Down
Binary file not shown.
24 changes: 10 additions & 14 deletions stanford_corenlp_pywrapper/sockwrap.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@

COMMAND = """
exec {JAVA} -Xmx{XMX_AMOUNT} -cp '{classpath}'
corenlp.PipeCommandRunner --server {server_port} {more_config}"""
corenlp.SocketServer --server {server_port} {more_config}"""

JAVA = "java"
XMX_AMOUNT = "4g"
Expand All @@ -43,24 +43,27 @@
TEMP_DIR = None


def command(mode=None, configfile=None, configdict=None, output_types=None, **kwargs):
def command(mode=None, configfile=None, configdict=None, **kwargs):
d = {}
d.update(globals())
d.update(**kwargs)

more_config = ""
if mode is None and configfile is None and configdict is None:
assert False, "Need to set mode, or the annotators directly, for this wrapper to work."
if mode:
if configdict is None:
if configdict is not None:
assert 'annotators' not in configdict, "mode was given but annotators are set in the configdict. use only one please."
elif configdict is None:
configdict = {}
LOG.info("mode given as '%s' so setting annotators: %s" % (mode, MODES[mode]['annotators']))
configdict['annotators'] = MODES[mode]['annotators']
if configfile:
more_config += " --configfile {}".format(configfile)
if configdict:
j = json.dumps(configdict)
assert "'" not in j, "can't handle single quote in config values"
more_config += " --configdict '{}'".format(j)
if output_types:
more_config += " --output-types '{}'".format(' '.join(output_types))
d['more_config'] = more_config

return COMMAND.format(**d).replace("\n", " ")
Expand All @@ -73,7 +76,7 @@ class SubprocessCrashed(Exception):
class SockWrap:

def __init__(self, mode=None, server_port=12340,
configfile=None, configdict=None, output_types=None,
configfile=None, configdict=None,
corenlp_jars=(
"/home/sw/corenlp/stanford-corenlp-full-2015-04-20/*",
"/home/sw/stanford-srparser-2014-10-23-models.jar",
Expand Down Expand Up @@ -102,7 +105,6 @@ def __init__(self, mode=None, server_port=12340,
self.server_port = server_port
self.configfile = configfile
self.configdict = configdict
self.output_types = output_types

assert isinstance(corenlp_jars, (list,tuple))

Expand All @@ -112,13 +114,7 @@ def __init__(self, mode=None, server_port=12340,
local_libdir = os.path.join(os.path.abspath(os.path.dirname(__file__)),
'lib')

jars = [os.path.join(local_libdir, "piperunner.jar"),
# for eclipse development only
# "/Users/brendano/myutil/bin",
os.path.join(local_libdir, "guava-13.0.1.jar"),
os.path.join(local_libdir, "jackson-all-1.9.11.jar"),
]

jars = [os.path.join(local_libdir, "*")]
jars += corenlp_jars
self.classpath = ':'.join(jars)

Expand Down

0 comments on commit e1ce442

Please sign in to comment.