dkpro#1386 - Add further JCas annotations

- Use an "extractor" to split the file into individual documents
- Add further JCas annotations for every document besides documentText
az79nefy · Jun 29, 2019 · 04f2805 · 04f2805
1 parent 55d3c64
commit 04f2805
Show file tree

Hide file tree

Showing 5 changed files with 175 additions and 118 deletions.
diff --git a/dkpro-core-io-gigaword-asl/pom.xml b/dkpro-core-io-gigaword-asl/pom.xml
@@ -55,6 +55,10 @@
       <groupId>org.dkpro.core</groupId>
       <artifactId>dkpro-core-api-io-asl</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.dkpro.core</groupId>
+      <artifactId>dkpro-core-api-ner-asl</artifactId>
+    </dependency>
     <dependency>
       <groupId>eu.openminted.share.annotations</groupId>
       <artifactId>omtd-share-annotations-api</artifactId>

diff --git a/...ore-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/AnnotatedGigawordReader.java b/...ore-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/AnnotatedGigawordReader.java
@@ -17,23 +17,27 @@
  */
 package org.dkpro.core.io.gigaword;
 
-import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
-import java.io.InputStream;
 import java.util.Iterator;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
 
 import org.apache.uima.UimaContext;
 import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
 import org.apache.uima.collection.CollectionException;
 import org.apache.uima.fit.descriptor.MimeTypeCapability;
 import org.apache.uima.fit.descriptor.ResourceMetaData;
 import org.apache.uima.fit.descriptor.TypeCapability;
 import org.apache.uima.resource.ResourceInitializationException;
 import org.dkpro.core.api.io.ResourceCollectionReaderBase;
 import org.dkpro.core.api.parameter.MimeTypes;
-import org.dkpro.core.api.resources.CompressionUtils;
 import org.dkpro.core.io.gigaword.internal.AnnotatedGigawordArticle;
-import org.dkpro.core.io.gigaword.internal.AnnotatedGigawordDocuments;
+import org.dkpro.core.io.gigaword.internal.AnnotatedGigawordExtractor;
+import org.dkpro.core.io.gigaword.internal.AnnotatedGigawordParser;
+import org.xml.sax.SAXException;
 
 import com.google.common.collect.AbstractIterator;
 
@@ -72,8 +76,20 @@ public void getNext(CAS aJCas)
 
         DocumentMetaData dmd = DocumentMetaData.get(aJCas);
         dmd.setDocumentId(article.getId());
-
-        aJCas.setDocumentText(article.getText());
+        try
+        {
+            AnnotatedGigawordParser parser = new AnnotatedGigawordParser();
+            parser.setJCas(aJCas.getJCas());
+            SAXParserFactory factory = SAXParserFactory.newInstance();
+            SAXParser saxParser = factory.newSAXParser();
+            saxParser.parse(new ByteArrayInputStream(article.getText().getBytes()), parser);
+        }
+        catch (CASException e) {
+            throw new CollectionException(e);
+        }
+        catch (SAXException | ParserConfigurationException e) {
+            throw new IOException(e);
+        }
     }
 
     @Override
@@ -95,12 +111,8 @@ protected AnnotatedGigawordArticle computeNext()
                         && AnnotatedGigawordReader.super.hasNext()
                 ) {
                     Resource res = nextFile();
-                    try (InputStream is = new BufferedInputStream(CompressionUtils
-                            .getInputStream(res.getLocation(), res.getInputStream()))) {
-                        currentFileIterator = AnnotatedGigawordDocuments
-                                .fromAnnotatedGigawordFile(res).iterator();
-
-                    }
+                    currentFileIterator = new AnnotatedGigawordExtractor(res).getArticleList()
+                            .iterator();
                 }
             }
             catch (Exception e) {

diff --git a/...ord-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordDocuments.java b/...ord-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordDocuments.java
diff --git a/...ord-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordExtractor.java b/...ord-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordExtractor.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Technische Universität Darmstadt under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The Technische Universität Darmstadt
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.dkpro.core.io.gigaword.internal;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.dkpro.core.api.io.ResourceCollectionReaderBase.Resource;
+/**
+ * Read text from the Annotated Gigaword Corpus. This reader does <b>not</b> read any of the
+ * annotations yet.
+ */
+public class AnnotatedGigawordExtractor
+{
+    private List<AnnotatedGigawordArticle> articleList = new ArrayList<>();
+
+    public AnnotatedGigawordExtractor(Resource aResource) throws IOException
+    {
+        try (InputStream fileInputStream = aResource.getInputStream();
+             InputStreamReader inputStreamReader = new InputStreamReader(fileInputStream);
+             BufferedReader br = new BufferedReader(inputStreamReader)) {
+
+            String sCurrentLine;
+
+            Pattern GIGAWORD_DOC_ELEMENT_PATTERN = Pattern.compile("<DOC id=\"(.*?)\".*>");
+            String currentDocId = "";
+            StringBuilder currentDocText = new StringBuilder();
+
+            boolean inSentences = false;
+
+            // read file
+            while ((sCurrentLine = br.readLine()) != null) {
+
+                if (sCurrentLine.contains("<DOC id=")) {
+                    currentDocText.append(sCurrentLine + "\n");
+                    // extract new document ID
+                    Matcher m = GIGAWORD_DOC_ELEMENT_PATTERN.matcher(sCurrentLine);
+                    if (m.find()) {
+                        currentDocId = m.group(1);
+                    } else {
+                        throw new RuntimeException("Missing document ID on article");
+                    }
+                }
+                else if (sCurrentLine.contains("</DOC>")) {
+                    currentDocText.append(sCurrentLine + "\n");
+                    // save previous document
+                    if (!currentDocText.toString().equals("")) {
+                        articleList.add(new AnnotatedGigawordArticle(aResource, currentDocId,
+                                currentDocText.toString()));
+                        currentDocText = new StringBuilder();
+                    }
+                }
+
+                if (sCurrentLine.contains("<sentences>"))
+                {
+                    inSentences = true;
+                }
+
+                // only save <sentences> information to reduce memory usage
+                if (inSentences) {
+                    currentDocText.append(sCurrentLine + "\n");
+                }
+
+                if (sCurrentLine.contains("</sentences>"))
+                {
+                    inSentences = false;
+                }
+            }
+        }
+    }
+
+    public List<AnnotatedGigawordArticle> getArticleList() {
+        return articleList;
+    }
+}