diff --git a/dkpro-core-io-gigaword-asl/pom.xml b/dkpro-core-io-gigaword-asl/pom.xml
index 680c1abaf0..2aeacd8d66 100644
--- a/dkpro-core-io-gigaword-asl/pom.xml
+++ b/dkpro-core-io-gigaword-asl/pom.xml
@@ -55,6 +55,10 @@
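       <groupId>org.dkpro.core</groupId>
       <artifactId>dkpro-core-api-io-asl</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.dkpro.core</groupId>
+      <artifactId>dkpro-core-api-ner-asl</artifactId>
+    </dependency>
     <dependency>
       <groupId>eu.openminted.share.annotations</groupId>
       <artifactId>omtd-share-annotations-api</artifactId>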
diff --git a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/AnnotatedGigawordReader.java b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/AnnotatedGigawordReader.java
index b4278a3b38..112661d555 100644
--- a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/AnnotatedGigawordReader.java
+++ b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/AnnotatedGigawordReader.java
@@ -17,13 +17,16 @@
*/
package org.dkpro.core.io.gigaword;
-import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
import java.io.IOException;
-import java.io.InputStream;
import java.util.Iterator;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
import org.apache.uima.UimaContext;
import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.MimeTypeCapability;
import org.apache.uima.fit.descriptor.ResourceMetaData;
@@ -31,9 +34,10 @@
import org.apache.uima.resource.ResourceInitializationException;
import org.dkpro.core.api.io.ResourceCollectionReaderBase;
import org.dkpro.core.api.parameter.MimeTypes;
-import org.dkpro.core.api.resources.CompressionUtils;
import org.dkpro.core.io.gigaword.internal.AnnotatedGigawordArticle;
-import org.dkpro.core.io.gigaword.internal.AnnotatedGigawordDocuments;
+import org.dkpro.core.io.gigaword.internal.AnnotatedGigawordExtractor;
+import org.dkpro.core.io.gigaword.internal.AnnotatedGigawordParser;
+import org.xml.sax.SAXException;
import com.google.common.collect.AbstractIterator;
@@ -72,8 +76,20 @@ public void getNext(CAS aJCas)
DocumentMetaData dmd = DocumentMetaData.get(aJCas);
dmd.setDocumentId(article.getId());
-
- aJCas.setDocumentText(article.getText());
+ try
+ {
+ AnnotatedGigawordParser parser = new AnnotatedGigawordParser();
+ parser.setJCas(aJCas.getJCas());
+ SAXParserFactory factory = SAXParserFactory.newInstance();
+ SAXParser saxParser = factory.newSAXParser();
+ saxParser.parse(new ByteArrayInputStream(article.getText().getBytes()), parser);
+ }
+ catch (CASException e) {
+ throw new CollectionException(e);
+ }
+ catch (SAXException | ParserConfigurationException e) {
+ throw new IOException(e);
+ }
}
@Override
@@ -95,12 +111,8 @@ protected AnnotatedGigawordArticle computeNext()
&& AnnotatedGigawordReader.super.hasNext()
) {
Resource res = nextFile();
- try (InputStream is = new BufferedInputStream(CompressionUtils
- .getInputStream(res.getLocation(), res.getInputStream()))) {
- currentFileIterator = AnnotatedGigawordDocuments
- .fromAnnotatedGigawordFile(res).iterator();
-
- }
+ currentFileIterator = new AnnotatedGigawordExtractor(res).getArticleList()
+ .iterator();
}
}
catch (Exception e) {
diff --git a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordDocuments.java b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordDocuments.java
deleted file mode 100644
index 2498c15cd0..0000000000
--- a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordDocuments.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Licensed to the Technische Universität Darmstadt under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The Technische Universität Darmstadt
- * licenses this file to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.dkpro.core.io.gigaword.internal;
-
-import java.io.BufferedInputStream;
-import java.io.InputStream;
-import java.util.Iterator;
-import java.util.List;
-
-import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
-
-import org.dkpro.core.api.io.ResourceCollectionReaderBase.Resource;
-import org.dkpro.core.api.resources.CompressionUtils;
-
-import com.google.common.collect.AbstractIterator;
-
-/**
- * The LDC distributes annotated Gigaword as a moderate number of gzipped files, each of which has
- * many documents concatenated together. This class lets you iterate over the documents stored in
- * such a file.
- */
-public class AnnotatedGigawordDocuments
- implements Iterable
-{
- private List articleList;
-
- private AnnotatedGigawordDocuments(List aArticleList)
- {
- articleList = aArticleList;
- }
-
- public static AnnotatedGigawordDocuments fromAnnotatedGigawordFile(Resource aResource)
- throws Exception
- {
- try (InputStream is = new BufferedInputStream(CompressionUtils
- .getInputStream(aResource.getLocation(), aResource.getInputStream()))) {
- SAXParserFactory factory = SAXParserFactory.newInstance();
- SAXParser saxParser = factory.newSAXParser();
- AnnotatedGigawordParser parser = new AnnotatedGigawordParser(aResource);
- saxParser.parse(is, parser);
- return new AnnotatedGigawordDocuments(parser.getArticleList());
- }
- }
-
- @Override
- public Iterator iterator()
- {
- return new AnnotatedArticlesIterator();
- }
-
- private class AnnotatedArticlesIterator
- extends AbstractIterator
- {
- private int startNextIndex = 0;
-
- @Override
- protected AnnotatedGigawordArticle computeNext()
- {
- if (startNextIndex >= articleList.size()) {
- return endOfData();
- }
- else {
- AnnotatedGigawordArticle nextArticle = articleList.get(startNextIndex);
- startNextIndex++;
- return nextArticle;
- }
- }
- }
-}
diff --git a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordExtractor.java b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordExtractor.java
new file mode 100644
index 0000000000..b378d91c0b
--- /dev/null
+++ b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordExtractor.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Technische Universität Darmstadt under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The Technische Universität Darmstadt
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.dkpro.core.io.gigaword.internal;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.dkpro.core.api.io.ResourceCollectionReaderBase.Resource;
+/**
+ * Splits one Annotated Gigaword resource into {@link AnnotatedGigawordArticle}s. The resource is
+ * read line by line; lines seen while {@code inSentences} is set are buffered as article text.
+ * <p>
+ * NOTE(review): the stream from {@code Resource.getInputStream()} is read as-is and decoded with
+ * the JVM default charset; no decompression step or explicit charset is visible here - confirm.
+ * NOTE(review): the string literals passed to {@code Pattern.compile} and {@code contains} are
+ * empty in this view, so the actual document/section markers cannot be told from here.
+ */
+public class AnnotatedGigawordExtractor
+{
+ private List articleList = new ArrayList<>(); // filled by the constructor, in reading order
+
+ public AnnotatedGigawordExtractor(Resource aResource) throws IOException
+ {
+ try (InputStream fileInputStream = aResource.getInputStream();
+ InputStreamReader inputStreamReader = new InputStreamReader(fileInputStream);
+ BufferedReader br = new BufferedReader(inputStreamReader)) {
+
+ String sCurrentLine;
+
+ Pattern GIGAWORD_DOC_ELEMENT_PATTERN = Pattern.compile(""); // NOTE(review): not used below
+ String currentDocId = ""; // NOTE(review): never reassigned below, so every article gets ""
+ StringBuilder currentDocText = new StringBuilder();
+
+ boolean inSentences = false;
+
+ // read the resource line by line until the end of the stream
+ while ((sCurrentLine = br.readLine()) != null) {
+
+ if (sCurrentLine.contains("")) {
+ currentDocText.append(sCurrentLine + "\n");
+ // flush the buffered text as one article and start a fresh buffer
+ if (!currentDocText.toString().equals("")) { // always true: a line was just appended
+ articleList.add(new AnnotatedGigawordArticle(aResource, currentDocId,
+ currentDocText.toString()));
+ currentDocText = new StringBuilder();
+ }
+ }
+
+ if (sCurrentLine.contains(""))
+ {
+ inSentences = true; // set before the append below, so this line is buffered too
+ }
+
+ // only lines seen while inSentences is set are buffered, to reduce memory usage
+ if (inSentences) {
+ currentDocText.append(sCurrentLine + "\n");
+ }
+
+ if (sCurrentLine.contains(""))
+ {
+ inSentences = false; // cleared after the append above, so this line was buffered
+ }
+ }
+ }
+ }
+
+ public List getArticleList() {
+ return articleList; // the internal list itself, not a copy
+ }
+}
diff --git a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordParser.java b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordParser.java
index e8c68b7953..ef9a48a1e8 100644
--- a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordParser.java
+++ b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordParser.java
@@ -17,41 +17,47 @@
*/
package org.dkpro.core.io.gigaword.internal;
-import java.util.ArrayList;
-import java.util.List;
-
-import org.dkpro.core.api.io.ResourceCollectionReaderBase.Resource;
+import org.apache.uima.jcas.JCas;
import org.xml.sax.Attributes;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
+import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
+import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
+import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
+
/**
* Read text from the Annotated Gigaword Corpus. This reader does not read any of the
* annotations yet.
*/
public class AnnotatedGigawordParser extends DefaultHandler
{
- private final Resource resource;
-
- private List articleList = new ArrayList<>();
+ private JCas jcas;
// flags for parsing articles
private boolean inDocument = false;
private boolean inSentences = false;
private boolean inToken = false;
private boolean inWord = false;
+ private boolean inLemma = false;
private boolean inOffsetBegin = false;
+ private boolean inNER = false;
// variables for reconstructing articles
- private StringBuilder docText = new StringBuilder();
+ private StringBuilder currentDocText = new StringBuilder();
private String currentDocId = "";
+ private Token currentToken;
private String currentWord = "";
private int currentOffsetBegin = 0;
- public AnnotatedGigawordParser(Resource aResource)
+ public void setJCas(final JCas aJCas)
{
- super();
- resource = aResource;
+ jcas = aJCas;
+ }
+
+ protected JCas getJCas()
+ {
+ return jcas;
}
@Override
@@ -70,9 +76,15 @@ else if (inSentences && qName.equals("token")) {
else if (inToken && qName.equals("word")) {
inWord = true;
}
+ else if (inToken && qName.equals("lemma")) {
+ inLemma = true;
+ }
else if (inToken && qName.equals("CharacterOffsetBegin")) {
inOffsetBegin = true;
}
+ else if (inToken && qName.equals("NER")) {
+ inNER = true;
+ }
}
@Override
@@ -83,37 +95,55 @@ public void endElement(String uri, String localName, String qName)
}
else if (inDocument && qName.equals("sentences")) {
inSentences = false;
- articleList
- .add(new AnnotatedGigawordArticle(resource, currentDocId, docText.toString()));
- docText = new StringBuilder();
+ jcas.setDocumentText(currentDocText.toString());
+ currentDocText = new StringBuilder();
}
else if (inSentences && qName.equals("token")) {
inToken = false;
- while (docText.length() < currentOffsetBegin) {
- docText.append(" ");
+ while (currentDocText.length() < currentOffsetBegin) {
+ currentDocText.append(" ");
}
- docText.append(currentWord);
+ currentDocText.append(currentWord);
+ currentToken.addToIndexes();
}
else if (inToken && qName.equals("word")) {
inWord = false;
}
+ else if (inToken && qName.equals("lemma")) {
+ inLemma = false;
+ }
else if (inToken && qName.equals("CharacterOffsetBegin")) {
inOffsetBegin = false;
}
+ else if (inToken && qName.equals("NER")) {
+ inNER = false;
+ }
}
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
if (inWord) {
+ currentToken = new Token(getJCas(), start, length); // NOTE(review): start/length index the SAX buffer ch, and the 3rd argument is a length - confirm these are the intended begin/end
currentWord = new String(ch, start, length);
}
+ if (inLemma) {
+ String lemma = new String(ch, start, length);
+ Lemma l = new Lemma(getJCas(), currentToken.getBegin(), currentToken.getEnd()); // same span as the token created in the inWord branch
+ l.setValue(lemma);
+ l.addToIndexes();
+ currentToken.setLemma(l); // link the lemma to its token
+ }
if (inOffsetBegin) {
currentOffsetBegin = Integer.parseInt(new String(ch, start, length).trim());
}
-
+ if (inNER) {
+ String namedEntity = new String(ch, start, length);
+ NamedEntity ne = new NamedEntity(jcas); // uses the field directly; other branches call getJCas()
+ ne.setBegin(start); // NOTE(review): position inside ch, not taken from currentToken - confirm
+ ne.setEnd(start + length); // NOTE(review): same buffer-relative position as the begin above
+ ne.setValue(namedEntity);
+ ne.addToIndexes(); // NOTE(review): indexed for every NER text chunk, whatever its value
+ }
}
- public List getArticleList() {
- return articleList;
- }
}