From 04f2805ffef3c2d629aeee74a1a9023c87819984 Mon Sep 17 00:00:00 2001
From: az79nefy <alexander.zap@stud.tu-darmstadt.de>
Date: Sat, 29 Jun 2019 17:21:26 +0200
Subject: [PATCH] #1386 - Add further JCas annotations

- use of an "extractor" to split the file into individual documents
- further JCas annotations for every document besides documentText
---
 dkpro-core-io-gigaword-asl/pom.xml            |  4 +
 .../io/gigaword/AnnotatedGigawordReader.java  | 36 ++++---
 .../internal/AnnotatedGigawordDocuments.java  | 85 ----------------
 .../internal/AnnotatedGigawordExtractor.java  | 96 +++++++++++++++++++
 .../internal/AnnotatedGigawordParser.java     | 72 ++++++++++----
 5 files changed, 175 insertions(+), 118 deletions(-)
 delete mode 100644 dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordDocuments.java
 create mode 100644 dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordExtractor.java
diff --git a/dkpro-core-io-gigaword-asl/pom.xml b/dkpro-core-io-gigaword-asl/pom.xml
index 680c1abaf0..2aeacd8d66 100644
--- a/dkpro-core-io-gigaword-asl/pom.xml
+++ b/dkpro-core-io-gigaword-asl/pom.xml
@@ -55,6 +55,10 @@
       <groupId>org.dkpro.core</groupId>
       <artifactId>dkpro-core-api-io-asl</artifactId>
     </dependency>
+    <dependency>
+      <groupId>org.dkpro.core</groupId>
+      <artifactId>dkpro-core-api-ner-asl</artifactId>
+    </dependency>
     <dependency>
       <groupId>eu.openminted.share.annotations</groupId>
       <artifactId>omtd-share-annotations-api</artifactId>
diff --git a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/AnnotatedGigawordReader.java b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/AnnotatedGigawordReader.java
index b4278a3b38..112661d555 100644
--- a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/AnnotatedGigawordReader.java
+++ b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/AnnotatedGigawordReader.java
@@ -17,13 +17,16 @@
  */
 package org.dkpro.core.io.gigaword;
 
-import java.io.BufferedInputStream;
+import java.io.ByteArrayInputStream;
 import java.io.IOException;
-import java.io.InputStream;
 import java.util.Iterator;
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
 
 import org.apache.uima.UimaContext;
 import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.CASException;
 import org.apache.uima.collection.CollectionException;
 import org.apache.uima.fit.descriptor.MimeTypeCapability;
 import org.apache.uima.fit.descriptor.ResourceMetaData;
@@ -31,9 +34,10 @@
 import org.apache.uima.resource.ResourceInitializationException;
 import org.dkpro.core.api.io.ResourceCollectionReaderBase;
 import org.dkpro.core.api.parameter.MimeTypes;
-import org.dkpro.core.api.resources.CompressionUtils;
 import org.dkpro.core.io.gigaword.internal.AnnotatedGigawordArticle;
-import org.dkpro.core.io.gigaword.internal.AnnotatedGigawordDocuments;
+import org.dkpro.core.io.gigaword.internal.AnnotatedGigawordExtractor;
+import org.dkpro.core.io.gigaword.internal.AnnotatedGigawordParser;
+import org.xml.sax.SAXException;
 
 import com.google.common.collect.AbstractIterator;
 
@@ -72,8 +76,20 @@ public void getNext(CAS aJCas)
 
         DocumentMetaData dmd = DocumentMetaData.get(aJCas);
         dmd.setDocumentId(article.getId());
-        
-        aJCas.setDocumentText(article.getText());
+        try {
+            AnnotatedGigawordParser parser = new AnnotatedGigawordParser();
+            parser.setJCas(aJCas.getJCas());
+            SAXParser saxParser = SAXParserFactory.newInstance().newSAXParser();
+            // Encode as UTF-8 explicitly; getBytes() alone uses the platform default charset.
+            byte[] xml = article.getText().getBytes(java.nio.charset.StandardCharsets.UTF_8);
+            saxParser.parse(new ByteArrayInputStream(xml), parser);
+        }
+        catch (CASException e) {
+            throw new CollectionException(e);
+        }
+        catch (SAXException | ParserConfigurationException e) {
+            throw new IOException(e);
+        }
     }
     
     @Override
@@ -95,12 +111,8 @@ protected AnnotatedGigawordArticle computeNext()
                         && AnnotatedGigawordReader.super.hasNext()
                 ) {
                     Resource res = nextFile();
-                    try (InputStream is = new BufferedInputStream(CompressionUtils
-                            .getInputStream(res.getLocation(), res.getInputStream()))) {
-                        currentFileIterator = AnnotatedGigawordDocuments
-                                .fromAnnotatedGigawordFile(res).iterator();
-
-                    }
+                    currentFileIterator = new AnnotatedGigawordExtractor(res).getArticleList()
+                            .iterator();
                 }
             }
             catch (Exception e) {
diff --git a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordDocuments.java b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordDocuments.java
deleted file mode 100644
index 2498c15cd0..0000000000
--- a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordDocuments.java
+++ /dev/null
@@ -1,85 +0,0 @@
-/*
- * Licensed to the Technische Universität Darmstadt under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The Technische Universität Darmstadt 
- * licenses this file to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.
- *  
- * http://www.apache.org/licenses/LICENSE-2.0
- * 
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.dkpro.core.io.gigaword.internal;
-
-import java.io.BufferedInputStream;
-import java.io.InputStream;
-import java.util.Iterator;
-import java.util.List;
-
-import javax.xml.parsers.SAXParser;
-import javax.xml.parsers.SAXParserFactory;
-
-import org.dkpro.core.api.io.ResourceCollectionReaderBase.Resource;
-import org.dkpro.core.api.resources.CompressionUtils;
-
-import com.google.common.collect.AbstractIterator;
-
-/**
- * The LDC distributes annotated Gigaword as a moderate number of gzipped files, each of which has
- * many documents concatenated together. This class lets you iterate over the documents stored in
- * such a file.
- */
-public class AnnotatedGigawordDocuments
-    implements Iterable<AnnotatedGigawordArticle>
-{
-    private List<AnnotatedGigawordArticle> articleList;
-
-    private AnnotatedGigawordDocuments(List<AnnotatedGigawordArticle> aArticleList)
-    {
-        articleList = aArticleList;
-    }
-
-    public static AnnotatedGigawordDocuments fromAnnotatedGigawordFile(Resource aResource)
-        throws Exception
-    {
-        try (InputStream is = new BufferedInputStream(CompressionUtils
-                .getInputStream(aResource.getLocation(), aResource.getInputStream()))) {
-            SAXParserFactory factory = SAXParserFactory.newInstance();
-            SAXParser saxParser = factory.newSAXParser();
-            AnnotatedGigawordParser parser = new AnnotatedGigawordParser(aResource);
-            saxParser.parse(is, parser);
-            return new AnnotatedGigawordDocuments(parser.getArticleList());
-        }
-    }
-
-    @Override
-    public Iterator<AnnotatedGigawordArticle> iterator()
-    {
-        return new AnnotatedArticlesIterator();
-    }
-
-    private class AnnotatedArticlesIterator
-        extends AbstractIterator<AnnotatedGigawordArticle>
-    {
-        private int startNextIndex = 0;
-
-        @Override
-        protected AnnotatedGigawordArticle computeNext()
-        {
-            if (startNextIndex >= articleList.size()) {
-                return endOfData();
-            }
-            else {
-                AnnotatedGigawordArticle nextArticle = articleList.get(startNextIndex);
-                startNextIndex++;
-                return nextArticle;
-            }
-        }
-    }
-}
diff --git a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordExtractor.java b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordExtractor.java
new file mode 100644
index 0000000000..b378d91c0b
--- /dev/null
+++ b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordExtractor.java
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Technische Universität Darmstadt under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The Technische Universität Darmstadt
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.dkpro.core.io.gigaword.internal;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.List;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.dkpro.core.api.io.ResourceCollectionReaderBase.Resource;
+import org.dkpro.core.api.resources.CompressionUtils;
+
+/**
+ * Splits one file of the Annotated Gigaword Corpus into its individual articles. Of every
+ * article only the {@code <DOC>} tags and the {@code <sentences>} section in between are kept.
+ */
+public class AnnotatedGigawordExtractor
+{
+    private static final Pattern DOC_ELEMENT = Pattern.compile("<DOC id=\"(.*?)\".*>");
+
+    private final List<AnnotatedGigawordArticle> articleList = new ArrayList<>();
+
+    /**
+     * Reads the given corpus file and collects all articles found in it.
+     *
+     * @param aResource
+     *            the corpus file; it is decompressed based on its location and read as UTF-8.
+     * @throws IOException
+     *             if the file cannot be read.
+     */
+    public AnnotatedGigawordExtractor(Resource aResource) throws IOException
+    {
+        // The corpus is distributed as gzipped files, so unwrap the raw stream first.
+        try (InputStream is = CompressionUtils.getInputStream(aResource.getLocation(),
+                aResource.getInputStream());
+                BufferedReader br = new BufferedReader(
+                        new InputStreamReader(is, StandardCharsets.UTF_8))) {
+            String currentDocId = "";
+            StringBuilder currentDocText = new StringBuilder();
+            boolean inSentences = false;
+            String line;
+            while ((line = br.readLine()) != null) {
+                if (line.contains("<DOC id=")) {
+                    Matcher m = DOC_ELEMENT.matcher(line);
+                    if (!m.find()) {
+                        throw new IllegalStateException("Missing document ID on article in ["
+                                + aResource.getLocation() + "]");
+                    }
+                    currentDocId = m.group(1);
+                    currentDocText.append(line).append('\n');
+                }
+                else if (line.contains("</DOC>")) {
+                    currentDocText.append(line).append('\n');
+                    articleList.add(new AnnotatedGigawordArticle(aResource, currentDocId,
+                            currentDocText.toString()));
+                    currentDocText = new StringBuilder();
+                }
+                if (line.contains("<sentences>")) {
+                    inSentences = true;
+                }
+                // only keep the <sentences> section to reduce memory usage
+                if (inSentences) {
+                    currentDocText.append(line).append('\n');
+                }
+                if (line.contains("</sentences>")) {
+                    inSentences = false;
+                }
+            }
+        }
+    }
+
+    /** @return the articles extracted from the file, in file order. */
+    public List<AnnotatedGigawordArticle> getArticleList() {
+        return articleList;
+    }
+}
diff --git a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordParser.java b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordParser.java
index e8c68b7953..ef9a48a1e8 100644
--- a/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordParser.java
+++ b/dkpro-core-io-gigaword-asl/src/main/java/org/dkpro/core/io/gigaword/internal/AnnotatedGigawordParser.java
@@ -17,41 +17,60 @@
  */
 package org.dkpro.core.io.gigaword.internal;
 
-import java.util.ArrayList;
-import java.util.List;
-
-import org.dkpro.core.api.io.ResourceCollectionReaderBase.Resource;
+import org.apache.uima.jcas.JCas;
 import org.xml.sax.Attributes;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
+import de.tudarmstadt.ukp.dkpro.core.api.ner.type.NamedEntity;
+import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma;
+import de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Token;
+
 /**
- * Read text from the Annotated Gigaword Corpus. This reader does <b>not</b> read any of the
- * annotations yet.
+ * SAX handler for a single article of the Annotated Gigaword Corpus. It rebuilds the document
+ * text from the token stream and adds {@link Token}, {@link Lemma} and {@link NamedEntity}
+ * annotations to the JCas set via {@link #setJCas(JCas)}. All annotation offsets refer to the
+ * rebuilt document text.
  */
 public class AnnotatedGigawordParser extends DefaultHandler
 {
-    private final Resource resource;
-    
-    private List<AnnotatedGigawordArticle> articleList = new ArrayList<>();
+    private JCas jcas;
     
     // flags for parsing articles
     private boolean inDocument = false;
     private boolean inSentences = false;
     private boolean inToken = false;
     private boolean inWord = false;
+    private boolean inLemma = false;
     private boolean inOffsetBegin = false;
+    private boolean inNER = false;
     
     // variables for reconstructing articles
-    private StringBuilder docText = new StringBuilder();
+    private StringBuilder currentDocText = new StringBuilder();
     private String currentDocId = "";
     private String currentWord = "";
+    // lemma and NER value of the token being read; null while the token has none
+    private String currentLemma = null;
+    private String currentNamedEntity = null;
     private int currentOffsetBegin = 0;
     
-    public AnnotatedGigawordParser(Resource aResource)
+    /**
+     * Sets the JCas that receives the document text and the annotations.
+     *
+     * @param aJCas
+     *            the target JCas; it has to be set before parsing starts.
+     */
+    public void setJCas(final JCas aJCas)
     {
-        super();
-        resource = aResource;
+        jcas = aJCas;
+    }
+
+    /**
+     * @return the JCas that receives the document text and the annotations.
+     */
+    protected JCas getJCas()
+    {
+        return jcas;
     }
 
     @Override
@@ -70,9 +89,15 @@ else if (inSentences && qName.equals("token")) {
         else if (inToken && qName.equals("word")) {
             inWord = true;
         }
+        else if (inToken && qName.equals("lemma")) {
+            inLemma = true;
+        }
         else if (inToken && qName.equals("CharacterOffsetBegin")) {
             inOffsetBegin = true;
         }
+        else if (inToken && qName.equals("NER")) {
+            inNER = true;
+        }
     }
     
     @Override
@@ -83,37 +108,83 @@ public void endElement(String uri, String localName, String qName)
         }
         else if (inDocument && qName.equals("sentences")) {
             inSentences = false;
-            articleList
-                    .add(new AnnotatedGigawordArticle(resource, currentDocId, docText.toString()));
-            docText = new StringBuilder();
+            jcas.setDocumentText(currentDocText.toString());
+            currentDocText = new StringBuilder();
         }
         else if (inSentences && qName.equals("token")) {
             inToken = false;
-            while (docText.length() < currentOffsetBegin) {
-                docText.append(" ");
+            while (currentDocText.length() < currentOffsetBegin) {
+                currentDocText.append(" ");
             }
-            docText.append(currentWord);
+            // Offsets must refer to the rebuilt text, so they are read from the buffer itself.
+            int begin = currentDocText.length();
+            currentDocText.append(currentWord);
+            addTokenAnnotations(begin, currentDocText.length());
         }
         else if (inToken && qName.equals("word")) {
             inWord = false;
         }
+        else if (inToken && qName.equals("lemma")) {
+            inLemma = false;
+        }
         else if (inToken && qName.equals("CharacterOffsetBegin")) {
             inOffsetBegin = false;
         }
+        else if (inToken && qName.equals("NER")) {
+            inNER = false;
+        }
     }
     
+    /**
+     * Captures the text of the token sub-element that is currently open. Annotations are not
+     * created here because {@code start} and {@code length} only describe {@code ch}.
+     */
     @Override
     public void characters(char[] ch, int start, int length) throws SAXException {
         if (inWord) {
             currentWord = new String(ch, start, length);
         }
+        if (inLemma) {
+            currentLemma = new String(ch, start, length);
+        }
         if (inOffsetBegin) {
             currentOffsetBegin = Integer.parseInt(new String(ch, start, length).trim());
         }
-        
+        if (inNER) {
+            currentNamedEntity = new String(ch, start, length);
+        }
     }
     
-    public List<AnnotatedGigawordArticle> getArticleList() {
-        return articleList;
-    }
+    /**
+     * Creates the annotations of the token that has just been completed and clears the
+     * per-token state so that values cannot leak into the next token.
+     *
+     * @param aBegin
+     *            start offset of the token in the rebuilt document text.
+     * @param aEnd
+     *            end offset of the token in the rebuilt document text.
+     */
+    private void addTokenAnnotations(int aBegin, int aEnd)
+    {
+        Token token = new Token(jcas, aBegin, aEnd);
+        if (currentLemma != null) {
+            Lemma lemma = new Lemma(jcas, aBegin, aEnd);
+            lemma.setValue(currentLemma);
+            lemma.addToIndexes();
+            token.setLemma(lemma);
+        }
+        token.addToIndexes();
+
+        // NOTE(review): as before, a NamedEntity is created for every NER value, presumably
+        // including the label used for untagged tokens - confirm whether to filter it.
+        if (currentNamedEntity != null) {
+            NamedEntity namedEntity = new NamedEntity(jcas, aBegin, aEnd);
+            namedEntity.setValue(currentNamedEntity);
+            namedEntity.addToIndexes();
+        }
+
+        currentWord = "";
+        currentLemma = null;
+        currentNamedEntity = null;
+    }
 }