From 04f2805ffef3c2d629aeee74a1a9023c87819984 Mon Sep 17 00:00:00 2001 From: az79nefy Date: Sat, 29 Jun 2019 17:21:26 +0200 Subject: [PATCH] #1386 - Add further JCas annotations - use of an "extractor" to split the file into individual documents - further JCas annotations for every document besides documentText --- dkpro-core-io-gigaword-asl/pom.xml | 4 + .../io/gigaword/AnnotatedGigawordReader.java | 36 ++++--- .../internal/AnnotatedGigawordDocuments.java | 85 ---------------- .../internal/AnnotatedGigawordExtractor.java | 96 +++++++++++++++++++ .../internal/AnnotatedGigawordParser.java | 72 ++++++++++---- 5 files changed, 175 insertions(+), 118 deletions(-) delete mode 100644 dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordDocuments.java create mode 100644 dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordExtractor.java diff --git a/dkpro-core-io-gigaword-asl/pom.xml b/dkpro-core-io-gigaword-asl/pom.xml index 680c1abaf0..2aeacd8d66 100644 --- a/dkpro-core-io-gigaword-asl/pom.xml +++ b/dkpro-core-io-gigaword-asl/pom.xml @@ -55,6 +55,10 @@ org.dkpro.core dkpro-core-api-io-asl + + org.dkpro.core + dkpro-core-api-ner-asl + eu.openminted.share.annotations omtd-share-annotations-api diff --git a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/AnnotatedGigawordReader.java b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/AnnotatedGigawordReader.java index b4278a3b38..112661d555 100644 --- a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/AnnotatedGigawordReader.java +++ b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/AnnotatedGigawordReader.java @@ -17,13 +17,16 @@ */ package org.dkpro.core.io.gigaword; -import java.io.BufferedInputStream; +import java.io.ByteArrayInputStream; import java.io.IOException; -import java.io.InputStream; import java.util.Iterator; +import javax.xml.parsers.ParserConfigurationException; +import javax.xml.parsers.SAXParser; +import javax.xml.parsers.SAXParserFactory; import org.apache.uima.UimaContext; import org.apache.uima.cas.CAS; +import org.apache.uima.cas.CASException; import org.apache.uima.collection.CollectionException; import org.apache.uima.fit.descriptor.MimeTypeCapability; import org.apache.uima.fit.descriptor.ResourceMetaData; @@ -31,9 +34,10 @@ import org.apache.uima.resource.ResourceInitializationException; import org.dkpro.core.api.io.ResourceCollectionReaderBase; import org.dkpro.core.api.parameter.MimeTypes; -import org.dkpro.core.api.resources.CompressionUtils; import org.dkpro.core.io.gigaword.internal.AnnotatedGigawordArticle; -import org.dkpro.core.io.gigaword.internal.AnnotatedGigawordDocuments; +import org.dkpro.core.io.gigaword.internal.AnnotatedGigawordExtractor; +import org.dkpro.core.io.gigaword.internal.AnnotatedGigawordParser; +import org.xml.sax.SAXException; import com.google.common.collect.AbstractIterator; @@ -72,8 +76,20 @@ public void getNext(CAS aJCas) DocumentMetaData dmd = DocumentMetaData.get(aJCas); dmd.setDocumentId(article.getId()); - - aJCas.setDocumentText(article.getText()); + try + { + AnnotatedGigawordParser parser = new AnnotatedGigawordParser(); + parser.setJCas(aJCas.getJCas()); + SAXParserFactory factory = SAXParserFactory.newInstance(); + SAXParser saxParser = factory.newSAXParser(); + saxParser.parse(new ByteArrayInputStream(article.getText().getBytes()), parser); + } + catch (CASException e) { + throw new CollectionException(e); + } + catch (SAXException | ParserConfigurationException e) { + throw new IOException(e); + } } @Override @@ -95,12 +111,8 @@ protected AnnotatedGigawordArticle computeNext() && AnnotatedGigawordReader.super.hasNext() ) { Resource res = nextFile(); - try (InputStream is = new BufferedInputStream(CompressionUtils - .getInputStream(res.getLocation(), res.getInputStream()))) { - currentFileIterator = AnnotatedGigawordDocuments - .fromAnnotatedGigawordFile(res).iterator(); - - } + currentFileIterator = new AnnotatedGigawordExtractor(res).getArticleList() + .iterator(); } } catch (Exception e) { diff --git a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordDocuments.java b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordDocuments.java deleted file mode 100644 index 2498c15cd0..0000000000 --- a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordDocuments.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Licensed to the Technische Universität Darmstadt under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The Technische Universität Darmstadt - * licenses this file to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.dkpro.core.io.gigaword.internal; - -import java.io.BufferedInputStream; -import java.io.InputStream; -import java.util.Iterator; -import java.util.List; - -import javax.xml.parsers.SAXParser; -import javax.xml.parsers.SAXParserFactory; - -import org.dkpro.core.api.io.ResourceCollectionReaderBase.Resource; -import org.dkpro.core.api.resources.CompressionUtils; - -import com.google.common.collect.AbstractIterator; - -/** - * The LDC distributes annotated Gigaword as a moderate number of gzipped files, each of which has - * many documents concatenated together. This class lets you iterate over the documents stored in - * such a file. - */ -public class AnnotatedGigawordDocuments - implements Iterable -{ - private List articleList; - - private AnnotatedGigawordDocuments(List aArticleList) - { - articleList = aArticleList; - } - - public static AnnotatedGigawordDocuments fromAnnotatedGigawordFile(Resource aResource) - throws Exception - { - try (InputStream is = new BufferedInputStream(CompressionUtils - .getInputStream(aResource.getLocation(), aResource.getInputStream()))) { - SAXParserFactory factory = SAXParserFactory.newInstance(); - SAXParser saxParser = factory.newSAXParser(); - AnnotatedGigawordParser parser = new AnnotatedGigawordParser(aResource); - saxParser.parse(is, parser); - return new AnnotatedGigawordDocuments(parser.getArticleList()); - } - } - - @Override - public Iterator iterator() - { - return new AnnotatedArticlesIterator(); - } - - private class AnnotatedArticlesIterator - extends AbstractIterator - { - private int startNextIndex = 0; - - @Override - protected AnnotatedGigawordArticle computeNext() - { - if (startNextIndex >= articleList.size()) { - return endOfData(); - } - else { - AnnotatedGigawordArticle nextArticle = articleList.get(startNextIndex); - startNextIndex++; - return nextArticle; - } - } - } -} diff --git a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordExtractor.java b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordExtractor.java new file mode 100644 index 0000000000..b378d91c0b --- /dev/null +++ b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordExtractor.java @@ -0,0 +1,96 @@ +/* + * Licensed to the Technische Universität Darmstadt under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The Technische Universität Darmstadt + * licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.dkpro.core.io.gigaword.internal; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.dkpro.core.api.io.ResourceCollectionReaderBase.Resource; +/** + * Read text from the Annotated Gigaword Corpus. This reader does not read any of the + * annotations yet. + */ +public class AnnotatedGigawordExtractor +{ + private List articleList = new ArrayList<>(); + + public AnnotatedGigawordExtractor(Resource aResource) throws IOException + { + try (InputStream fileInputStream = aResource.getInputStream(); + InputStreamReader inputStreamReader = new InputStreamReader(fileInputStream); + BufferedReader br = new BufferedReader(inputStreamReader)) { + + String sCurrentLine; + + Pattern GIGAWORD_DOC_ELEMENT_PATTERN = Pattern.compile(""); + String currentDocId = ""; + StringBuilder currentDocText = new StringBuilder(); + + boolean inSentences = false; + + // read file + while ((sCurrentLine = br.readLine()) != null) { + + if (sCurrentLine.contains("")) { + currentDocText.append(sCurrentLine + "\n"); + // save previous document + if (!currentDocText.toString().equals("")) { + articleList.add(new AnnotatedGigawordArticle(aResource, currentDocId, + currentDocText.toString())); + currentDocText = new StringBuilder(); + } + } + + if (sCurrentLine.contains("")) + { + inSentences = true; + } + + // only save information to reduce memory usage + if (inSentences) { + currentDocText.append(sCurrentLine + "\n"); + } + + if (sCurrentLine.contains("")) + { + inSentences = false; + } + } + } + } + + public List getArticleList() { + return articleList; + } +} diff --git a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordParser.java b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordParser.java index e8c68b7953..ef9a48a1e8 100644 --- a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordParser.java +++ b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordParser.java @@ -17,41 +17,47 @@ */ package org.dkpro.core.io.gigaword.internal; -import java.util.ArrayList; -import java.util.List; - -import org.dkpro.core.api.io.ResourceCollectionReaderBase.Resource; +import org.apache.uima.jcas.JCas; import org.xml.sax.Attributes; import org.xml.sax.SAXException; import org.xml.sax.helpers.DefaultHandler; +import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma; +import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token; + /** * Read text from the Annotated Gigaword Corpus. This reader does not read any of the * annotations yet. */ public class AnnotatedGigawordParser extends DefaultHandler { - private final Resource resource; - - private List articleList = new ArrayList<>(); + private JCas jcas; // flags for parsing articles private boolean inDocument = false; private boolean inSentences = false; private boolean inToken = false; private boolean inWord = false; + private boolean inLemma = false; private boolean inOffsetBegin = false; + private boolean inNER = false; // variables for reconstructing articles - private StringBuilder docText = new StringBuilder(); + private StringBuilder currentDocText = new StringBuilder(); private String currentDocId = ""; + private Token currentToken; private String currentWord = ""; private int currentOffsetBegin = 0; - public AnnotatedGigawordParser(Resource aResource) + public void setJCas(final JCas aJCas) { - super(); - resource = aResource; + jcas = aJCas; + } + + protected JCas getJCas() + { + return jcas; } @Override @@ -70,9 +76,15 @@ else if (inSentences && qName.equals("token")) { else if (inToken && qName.equals("word")) { inWord = true; } + else if (inToken && qName.equals("lemma")) { + inLemma = true; + } else if (inToken && qName.equals("CharacterOffsetBegin")) { inOffsetBegin = true; } + else if (inToken && qName.equals("NER")) { + inNER = true; + } } @Override @@ -83,37 +95,55 @@ public void endElement(String uri, String localName, String qName) } else if (inDocument && qName.equals("sentences")) { inSentences = false; - articleList - .add(new AnnotatedGigawordArticle(resource, currentDocId, docText.toString())); - docText = new StringBuilder(); + jcas.setDocumentText(currentDocText.toString()); + currentDocText = new StringBuilder(); } else if (inSentences && qName.equals("token")) { inToken = false; - while (docText.length() < currentOffsetBegin) { - docText.append(" "); + while (currentDocText.length() < currentOffsetBegin) { + currentDocText.append(" "); } - docText.append(currentWord); + currentDocText.append(currentWord); + currentToken.addToIndexes(); } else if (inToken && qName.equals("word")) { inWord = false; } + else if (inToken && qName.equals("lemma")) { + inLemma = false; + } else if (inToken && qName.equals("CharacterOffsetBegin")) { inOffsetBegin = false; } + else if (inToken && qName.equals("NER")) { + inNER = false; + } } @Override public void characters(char[] ch, int start, int length) throws SAXException { if (inWord) { + currentToken = new Token(getJCas(), start, length); currentWord = new String(ch, start, length); } + if (inLemma) { + String lemma = new String(ch, start, length); + Lemma l = new Lemma(getJCas(), currentToken.getBegin(), currentToken.getEnd()); + l.setValue(lemma); + l.addToIndexes(); + currentToken.setLemma(l); + } if (inOffsetBegin) { currentOffsetBegin = Integer.parseInt(new String(ch, start, length).trim()); } - + if (inNER) { + String namedEntity = new String(ch, start, length); + NamedEntity ne = new NamedEntity(jcas); + ne.setBegin(start); + ne.setEnd(start + length); + ne.setValue(namedEntity); + ne.addToIndexes(); + } } - public List getArticleList() { - return articleList; - } }