Skip to content

Commit

Permalink
dkpro#1386 - Add further JCas annotations
Browse files Browse the repository at this point in the history
- use of an "extractor" to split the file into individual documents
- further JCas annotations for every document besides documentText
  • Loading branch information
az79nefy committed Jun 29, 2019
1 parent 55d3c64 commit 04f2805
Show file tree
Hide file tree
Showing 5 changed files with 175 additions and 118 deletions.
4 changes: 4 additions & 0 deletions dkpro-core-io-gigaword-asl/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-api-io-asl</artifactId>
</dependency>
<dependency>
<groupId>org.dkpro.core</groupId>
<artifactId>dkpro-core-api-ner-asl</artifactId>
</dependency>
<dependency>
<groupId>eu.openminted.share.annotations</groupId>
<artifactId>omtd-share-annotations-api</artifactId>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,23 +17,27 @@
*/
package org.dkpro.core.io.gigaword;

import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Iterator;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;

import org.apache.uima.UimaContext;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.fit.descriptor.MimeTypeCapability;
import org.apache.uima.fit.descriptor.ResourceMetaData;
import org.apache.uima.fit.descriptor.TypeCapability;
import org.apache.uima.resource.ResourceInitializationException;
import org.dkpro.core.api.io.ResourceCollectionReaderBase;
import org.dkpro.core.api.parameter.MimeTypes;
import org.dkpro.core.api.resources.CompressionUtils;
import org.dkpro.core.io.gigaword.internal.AnnotatedGigawordArticle;
import org.dkpro.core.io.gigaword.internal.AnnotatedGigawordDocuments;
import org.dkpro.core.io.gigaword.internal.AnnotatedGigawordExtractor;
import org.dkpro.core.io.gigaword.internal.AnnotatedGigawordParser;
import org.xml.sax.SAXException;

import com.google.common.collect.AbstractIterator;

Expand Down Expand Up @@ -72,8 +76,20 @@ public void getNext(CAS aJCas)

DocumentMetaData dmd = DocumentMetaData.get(aJCas);
dmd.setDocumentId(article.getId());

aJCas.setDocumentText(article.getText());
try
{
AnnotatedGigawordParser parser = new AnnotatedGigawordParser();
parser.setJCas(aJCas.getJCas());
SAXParserFactory factory = SAXParserFactory.newInstance();
SAXParser saxParser = factory.newSAXParser();
saxParser.parse(new ByteArrayInputStream(article.getText().getBytes()), parser);
}
catch (CASException e) {
throw new CollectionException(e);
}
catch (SAXException | ParserConfigurationException e) {
throw new IOException(e);
}
}

@Override
Expand All @@ -95,12 +111,8 @@ protected AnnotatedGigawordArticle computeNext()
&& AnnotatedGigawordReader.super.hasNext()
) {
Resource res = nextFile();
try (InputStream is = new BufferedInputStream(CompressionUtils
.getInputStream(res.getLocation(), res.getInputStream()))) {
currentFileIterator = AnnotatedGigawordDocuments
.fromAnnotatedGigawordFile(res).iterator();

}
currentFileIterator = new AnnotatedGigawordExtractor(res).getArticleList()
.iterator();
}
}
catch (Exception e) {
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
/*
* Licensed to the Technische Universität Darmstadt under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The Technische Universität Darmstadt
* licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License.
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.dkpro.core.io.gigaword.internal;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.dkpro.core.api.io.ResourceCollectionReaderBase.Resource;
/**
 * Splits a file from the Annotated Gigaword Corpus into its individual articles.
 * <p>
 * The file is read line by line. For every article, the line carrying the opening
 * {@code <DOC id="...">} tag, the lines of the {@code <sentences>} section and the line carrying
 * the closing {@code </DOC>} tag are retained; all other lines are dropped to reduce memory
 * usage. The retained text of each article is stored together with its document ID in an
 * {@link AnnotatedGigawordArticle}.
 */
public class AnnotatedGigawordExtractor
{
    /** Matches the opening DOC element; group 1 captures the document ID. */
    private static final Pattern GIGAWORD_DOC_ELEMENT_PATTERN = Pattern
            .compile("<DOC id=\"(.*?)\".*>");

    private final List<AnnotatedGigawordArticle> articleList = new ArrayList<>();

    /**
     * Reads the given resource completely and collects one article per {@code <DOC>} element.
     *
     * @param aResource
     *            the resource to read; its input stream is opened and closed by this
     *            constructor.
     * @throws IOException
     *             if the resource cannot be opened or read.
     * @throws RuntimeException
     *             if a line containing {@code <DOC id=} does not carry a parsable document ID.
     */
    public AnnotatedGigawordExtractor(Resource aResource) throws IOException
    {
        // Decode explicitly instead of relying on the platform default charset.
        // NOTE(review): assumes the corpus files are UTF-8 encoded - confirm.
        try (InputStream fileInputStream = aResource.getInputStream();
                InputStreamReader inputStreamReader = new InputStreamReader(fileInputStream,
                        StandardCharsets.UTF_8);
                BufferedReader br = new BufferedReader(inputStreamReader)) {

            String currentDocId = "";
            StringBuilder currentDocText = new StringBuilder();
            boolean inSentences = false;

            String sCurrentLine;
            while ((sCurrentLine = br.readLine()) != null) {
                boolean isDocStart = sCurrentLine.contains("<DOC id=");
                boolean isDocEnd = !isDocStart && sCurrentLine.contains("</DOC>");

                if (isDocStart) {
                    // extract new document ID
                    Matcher m = GIGAWORD_DOC_ELEMENT_PATTERN.matcher(sCurrentLine);
                    if (!m.find()) {
                        throw new RuntimeException("Missing document ID on article");
                    }
                    currentDocId = m.group(1);
                }

                if (sCurrentLine.contains("<sentences>")) {
                    inSentences = true;
                }

                // Only keep the DOC boundary lines and the <sentences> section to reduce
                // memory usage. Each line is appended at most once, even if it is a boundary
                // line encountered while inside a <sentences> section.
                if (isDocStart || isDocEnd || inSentences) {
                    currentDocText.append(sCurrentLine).append('\n');
                }

                if (sCurrentLine.contains("</sentences>")) {
                    inSentences = false;
                }

                if (isDocEnd) {
                    // save the document that just ended and start with a clean state so that
                    // an unterminated <sentences> section cannot leak into the next article
                    articleList.add(new AnnotatedGigawordArticle(aResource, currentDocId,
                            currentDocText.toString()));
                    currentDocText = new StringBuilder();
                    inSentences = false;
                }
            }
        }
    }

    /**
     * Returns the articles extracted from the resource, in the order in which they were read.
     *
     * @return the list of extracted articles; empty if no closing {@code </DOC>} line was found.
     */
    public List<AnnotatedGigawordArticle> getArticleList()
    {
        return articleList;
    }
}
Loading

0 comments on commit 04f2805

Please sign in to comment.