From c6df11f57f6650e473bc9116ea68d011e7a126f4 Mon Sep 17 00:00:00 2001 From: Richard Eckart de Castilho Date: Mon, 22 Jul 2024 10:50:57 -0300 Subject: [PATCH] #36 - Add support for TextGamma - Port TextGamma over into DKPro Statistics --- dkpro-statistics-agreement/pom.xml | 19 + .../aligning/AlignableAnnotationUnit.java | 328 ++++++++++ .../aligning/AligningAnnotationStudy.java | 160 +++++ .../aligning/IAlignableAnnotationUnit.java | 60 ++ .../aligning/IAligningAgreementMeasure.java | 34 + .../aligning/IAligningAnnotationStudy.java | 27 + .../aligning/TextAligningAnnotationStudy.java | 61 ++ .../aligning/TextGammaAgreement.java | 285 ++++++++ .../aligning/alignment/Alignment.java | 106 +++ .../aligning/alignment/ITextAlignment.java | 31 + .../alignment/PairwiseDPTextAlignment.java | 301 +++++++++ .../aligning/alignment/UnitaryAlignment.java | 243 +++++++ .../data/AlignableAnnotationTextUnit.java | 132 ++++ .../aligning/data/AnnotatedText.java | 63 ++ .../aligning/data/AnnotatedTextMerge.java | 191 ++++++ .../aligning/data/AnnotationSet.java | 155 +++++ .../agreement/aligning/data/Rater.java | 79 +++ .../aligning/disorder/IDisorderSampler.java | 22 + .../disorder/IDisorderSamplerFactory.java | 26 + .../disorder/SimpleDisorderSampler.java | 218 +++++++ .../dissimilarity/IDissimilarity.java | 23 + .../NominalFeatureDissimilarity.java | 93 +++ .../NominalFeatureTextDissimilarity.java | 56 ++ .../shuffling/AnnotationSetShuffle.java | 613 ++++++++++++++++++ .../shuffling/SegmentationChangeType.java | 21 + .../aligning/shuffling/TextChangeType.java | 21 + .../agreement/AnnotationStudyTest.java | 6 +- .../aligning/TextGammaAgreementTest.java | 253 ++++++++ .../aligning/alignment/AlignmentTest.java | 208 ++++++ .../NominalFeatureDissimilarityTest.java | 58 ++ .../NominalFeatureTextDissimilarityTest.java | 66 ++ .../PairwiseDPTextAlignmentTest.java | 241 +++++++ .../alignment/UnitaryAlignmentTest.java | 236 +++++++ .../aligning/data/AnnotatedTextMergeTest.java | 128 ++++ 
.../aligning/data/AnnotationSetTest.java | 150 +++++ .../agreement/aligning/data/TextUnitTest.java | 40 ++ .../agreement/aligning/data/UnitTest.java | 145 +++++ .../test/resources/simplelogger.properties | 34 + pom.xml | 2 +- 39 files changed, 4931 insertions(+), 4 deletions(-) create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/AlignableAnnotationUnit.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/AligningAnnotationStudy.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/IAlignableAnnotationUnit.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/IAligningAgreementMeasure.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/IAligningAnnotationStudy.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/TextAligningAnnotationStudy.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/TextGammaAgreement.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/alignment/Alignment.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/alignment/ITextAlignment.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/alignment/PairwiseDPTextAlignment.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/alignment/UnitaryAlignment.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/data/AlignableAnnotationTextUnit.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/data/AnnotatedText.java create mode 100644 
dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/data/AnnotatedTextMerge.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/data/AnnotationSet.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/data/Rater.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/disorder/IDisorderSampler.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/disorder/IDisorderSamplerFactory.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/disorder/SimpleDisorderSampler.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/dissimilarity/IDissimilarity.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/dissimilarity/NominalFeatureDissimilarity.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/dissimilarity/NominalFeatureTextDissimilarity.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/shuffling/AnnotationSetShuffle.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/shuffling/SegmentationChangeType.java create mode 100644 dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/shuffling/TextChangeType.java create mode 100644 dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/TextGammaAgreementTest.java create mode 100644 dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/alignment/AlignmentTest.java create mode 100644 dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/alignment/NominalFeatureDissimilarityTest.java 
create mode 100644 dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/alignment/NominalFeatureTextDissimilarityTest.java create mode 100644 dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/alignment/PairwiseDPTextAlignmentTest.java create mode 100644 dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/alignment/UnitaryAlignmentTest.java create mode 100644 dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/data/AnnotatedTextMergeTest.java create mode 100644 dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/data/AnnotationSetTest.java create mode 100644 dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/data/TextUnitTest.java create mode 100644 dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/data/UnitTest.java create mode 100644 dkpro-statistics-agreement/src/test/resources/simplelogger.properties diff --git a/dkpro-statistics-agreement/pom.xml b/dkpro-statistics-agreement/pom.xml index 4ae5ccd..5ce7ea3 100644 --- a/dkpro-statistics-agreement/pom.xml +++ b/dkpro-statistics-agreement/pom.xml @@ -43,15 +43,34 @@ org.slf4j slf4j-api + + org.apache.commons + commons-math3 + + + org.apache.commons + commons-lang3 + + org.junit.jupiter junit-jupiter-api test + + org.junit.jupiter + junit-jupiter-params + test + org.assertj assertj-core test + + org.slf4j + slf4j-simple + test + diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/AlignableAnnotationUnit.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/AlignableAnnotationUnit.java new file mode 100644 index 0000000..93cccae --- /dev/null +++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/AlignableAnnotationUnit.java @@ -0,0 +1,328 @@ +/* + * Licensed under the Apache License, Version 2.0 (the 
"License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Original source: https://github.com/fab-bar/TextGammaTool.git + */ +package org.dkpro.statistics.agreement.aligning; + +import static java.util.Collections.sort; +import static java.util.Collections.unmodifiableMap; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; + +import org.dkpro.statistics.agreement.aligning.data.Rater; + +public class AlignableAnnotationUnit + implements IAlignableAnnotationUnit +{ + private static final long serialVersionUID = 8646000607502108382L; + + private final Rater rater; + private final String type; + private final Map features = new HashMap(); + + private final long begin; + private final long end; + + public AlignableAnnotationUnit(Rater creator, int beg, int end) + { + this(creator, null, beg, end, null); + } + + public AlignableAnnotationUnit(Rater creator, int beg, int end, Map featureset) + { + this(creator, null, beg, end, featureset); + } + + public AlignableAnnotationUnit(Rater aCreator, long aBegin, long aEnd, + Map aFeatures) + { + this(aCreator, NO_TYPE, aBegin, aEnd, null); + } + + public AlignableAnnotationUnit(Rater aRater, String aType, long aBegin, long aEnd, + Map aFeatures) + { + if (aRater == null) { + rater = new Rater("", -1); + } + else { + rater = aRater; + } + + if (aType == null) { + type = NO_TYPE; + } + else { + type = aType; + } + + if (aBegin >= aEnd) { + throw new IllegalArgumentException("Begin 
has to be smaller than end."); + } + + begin = aBegin; + end = aEnd; + + if (aFeatures != null) { + features.putAll(aFeatures); + } + } + + public Rater getRater() + { + return rater; + } + + public String getType() + { + return type; + } + + @Override + public Object getCategory() + { + return getFeatures(); + } + + @Override + public long getBegin() + { + return begin; + } + + @Override + public long getEnd() + { + return end; + } + + @Override + public int getRaterIdx() + { + return rater.getIndex(); + } + + public Set getFeatureNames() + { + return features.keySet(); + } + + public String getFeatureValue(String attribute) + { + return features.get(attribute); + } + + public Map getFeatures() + { + return unmodifiableMap(features); + } + + public boolean isCoextensive(AlignableAnnotationUnit aOther) + { + return (getBegin() == aOther.getBegin()) && (getEnd() == aOther.getEnd()); + } + + public boolean overlaps(AlignableAnnotationUnit aOther) + { + return !(aOther.getEnd() <= getBegin() || aOther.getBegin() >= getEnd()); + } + + @Override + public boolean equals(Object o) + { + if (o == null) { + return false; + } + if (getClass() != o.getClass()) { + return false; + } + + AlignableAnnotationUnit seg = (AlignableAnnotationUnit) o; + // same type? + if (!Objects.equals(getType(), seg.getType())) { + return false; + } + + // same creator? + if (!Objects.equals(getRater(), seg.getRater())) { + return false; + } + + // same span? + if (!this.isCoextensive(seg)) { + return false; + } + + // same attributes? + if (!this.getFeatureNames().equals(seg.getFeatureNames())) { + return false; + } + // same attribute values? 
+ for (String attribute : this.getFeatureNames()) { + if (!Objects.equals(this.getFeatureValue(attribute), seg.getFeatureValue(attribute))) { + return false; + } + } + + return true; + } + + @Override + public int compareTo(IAlignableAnnotationUnit aOther) + { + if (aOther == null) { + return -1; + } + + if (this.equals(aOther)) { + return 0; + } + + // first: start offset + if (this.getBegin() < aOther.getBegin()) { + return -1; + } + if (this.getBegin() > aOther.getBegin()) { + return 1; + } + + // second: end offset + if (this.getEnd() < aOther.getEnd()) { + return -1; + } + if (this.getEnd() > aOther.getEnd()) { + return 1; + } + + if (!(aOther instanceof AlignableAnnotationUnit)) { + return -1; + } + + AlignableAnnotationUnit other = (AlignableAnnotationUnit) aOther; + + // sort by Type + if (!Objects.equals(getType(), other.getType())) { + if (getType() != null && other.getType() != null) { + return getType().compareTo(other.getType()); + } + else if (getType() == null) { + return -1; + } + else { + return 1; + } + } + + // sort by Creator + if (!Objects.equals(getRater(), other.getRater())) { + if (this.getRater() != null && other.getRater() != null) { + return this.getRater().getName().compareTo(other.getRater().getName()); + } + else if (this.getRater() == null) { + return -1; + } + else { + return 1; + } + } + + // sort by number of attributes + if (this.getFeatureNames().size() != other.getFeatureNames().size()) { + return Integer.compare(this.getFeatureNames().size(), other.getFeatureNames().size()); + } + + // sort by attributes names + List attributelistX = new ArrayList(this.getFeatureNames()); + sort(attributelistX); + + List attributelistY = new ArrayList(other.getFeatureNames()); + sort(attributelistY); + + if (!attributelistX.equals(attributelistY)) { + return String.join("", attributelistX).compareTo(String.join("", attributelistY)); + } + + // sort by attribute values (in order of names) + for (int i = 0; i < attributelistX.size(); i++) { + 
String attr = attributelistX.get(i); + if (!this.getFeatureValue(attr).equals(other.getFeatureValue(attr))) { + return this.getFeatureValue(attr).compareTo(other.getFeatureValue(attr)); + } + } + + // annotations are equal + // (but one is instantiation of subclass) + return 0; + + } + + public AlignableAnnotationUnit cloneWithDifferentLabel(String aType, String aLabel) + { + var feat = new HashMap<>(getFeatures()); + feat.put(aType, aLabel); + return new AlignableAnnotationUnit(rater, aType, begin, end, feat); + } + + public AlignableAnnotationUnit cloneWithDifferentOffsets(long aBegin, long aEnd) + { + return new AlignableAnnotationUnit(rater, type, aBegin, aEnd, features); + + } + + public AlignableAnnotationUnit cloneWithDifferentRater(Rater aRater) + { + return new AlignableAnnotationUnit(aRater, type, begin, end, features); + } + + @Override + public int hashCode() + { + int hash = 0; + hash += getRater().hashCode(); + hash += getType().hashCode(); + hash += features.hashCode(); + hash += getBegin() + getEnd(); + + return hash; + } + + @Override + public String toString() + { + return this.toString(new ArrayList()); + } + + public String toString(List attributes) + { + var ret = new StringBuilder(); + ret.append(String.valueOf(getBegin())); + ret.append("-"); + ret.append(String.valueOf(getEnd())); + for (String attribute : attributes) { + ret.append("\t"); + if (this.getFeatureValue(attribute) != null) { + ret.append(this.getFeatureValue(attribute)); + } + else { + ret.append("--"); + } + } + return ret.toString(); + } +} diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/AligningAnnotationStudy.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/AligningAnnotationStudy.java new file mode 100644 index 0000000..e1c022c --- /dev/null +++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/AligningAnnotationStudy.java @@ -0,0 +1,160 @@ +/* + * Licensed 
+ * under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Original source: https://github.com/fab-bar/TextGammaTool.git
+ */
+package org.dkpro.statistics.agreement.aligning;
+
+import static java.util.Arrays.asList;
+import static java.util.Collections.unmodifiableCollection;
+import static java.util.Collections.unmodifiableList;
+import static java.util.Collections.unmodifiableSet;
+
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+import java.util.TreeSet;
+
+import org.dkpro.statistics.agreement.aligning.data.Rater;
+
+/**
+ * A set of units created by a given set of annotators; all units are linked to the same (implicit)
+ * continuum.
+ * <p>
+ * Units and raters are kept in sorted sets, so a unit that compares as equal to an already
+ * contained unit is not added a second time.
+ */
+public class AligningAnnotationStudy
+    implements IAligningAnnotationStudy
+{
+    private static final long serialVersionUID = 1892754946013211747L;
+
+    private final Set<Rater> raters = new TreeSet<>();
+    private final Set<AlignableAnnotationUnit> units = new TreeSet<>();
+    private final Set<Object> categories = new HashSet<>();
+
+    // Lazily built list view of the units; reset whenever a unit is added
+    private List<AlignableAnnotationUnit> unitsListCache = null;
+
+    /**
+     * Creates an empty study.
+     */
+    public AligningAnnotationStudy()
+    {
+        // Nothing to do
+    }
+
+    /**
+     * Creates a study containing the given units.
+     *
+     * @param aUnits
+     *            the units to add
+     */
+    public AligningAnnotationStudy(Collection<? extends AlignableAnnotationUnit> aUnits)
+    {
+        for (var unit : aUnits) {
+            addUnit(unit);
+        }
+    }
+
+    /**
+     * Adds all given units to the study.
+     *
+     * @param aUnits
+     *            the units to add
+     */
+    public void addUnits(Collection<? extends AlignableAnnotationUnit> aUnits)
+    {
+        for (var unit : aUnits) {
+            addUnit(unit);
+        }
+    }
+
+    /**
+     * Adds all given units to the study.
+     *
+     * @param aUnits
+     *            the units to add
+     */
+    public void addUnits(AlignableAnnotationUnit... aUnits)
+    {
+        for (var unit : aUnits) {
+            addUnit(unit);
+        }
+    }
+
+    /**
+     * Adds a unit to the study and registers its rater and category.
+     *
+     * @param unit
+     *            the unit to add
+     */
+    public void addUnit(AlignableAnnotationUnit unit)
+    {
+        categories.add(unit.getCategory());
+
+        raters.add(unit.getRater());
+        units.add(unit);
+
+        unitsListCache = null;
+    }
+
+    @Override
+    public int getRaterCount()
+    {
+        return raters.size();
+    }
+
+    @Override
+    public int getUnitCount()
+    {
+        return units.size();
+    }
+
+    /**
+     * @return the number of units divided by the number of raters.
+     */
+    public double getAverageNumberOfAnnotations()
+    {
+        return getUnitCount() / ((double) getRaterCount());
+    }
+
+    /**
+     * @return an unmodifiable view of the raters that contributed at least one unit.
+     */
+    public Set<Rater> getRaters()
+    {
+        return unmodifiableSet(raters);
+    }
+
+    /**
+     * @return an unmodifiable list of all units in their sort order.
+     */
+    public List<AlignableAnnotationUnit> getUnits()
+    {
+        if (unitsListCache == null) {
+            unitsListCache = unmodifiableList(
+                    asList(units.toArray(AlignableAnnotationUnit[]::new)));
+        }
+
+        return unitsListCache;
+    }
+
+    /**
+     * @return the index of the first rater with the given name or {@code -1} if there is none.
+     */
+    @Override
+    public int findRater(String aName)
+    {
+        return raters.stream() //
+                .filter(a -> aName.equals(a.getName())) //
+                .mapToInt(a -> a.getIndex()) //
+                .findFirst().orElse(-1);
+    }
+
+    @Override
+    public Iterable<Object> getCategories()
+    {
+        return unmodifiableCollection(categories);
+    }
+
+    @Override
+    public int getCategoryCount()
+    {
+        return categories.size();
+    }
+
+    /**
+     * Two studies are equal if they are of the same class and contain equal units in the same
+     * order.
+     */
+    @Override
+    public boolean equals(Object o)
+    {
+        if (this == o) {
+            return true;
+        }
+        if (o == null) {
+            return false;
+        }
+        if (getClass() != o.getClass()) {
+            return false;
+        }
+
+        AligningAnnotationStudy comp = (AligningAnnotationStudy) o;
+
+        // List equality also compares the sizes. The previous index loop failed with an
+        // ArrayIndexOutOfBoundsException if the other study had fewer units and reported
+        // equality if it had additional ones.
+        return getUnits().equals(comp.getUnits());
+    }
+
+    @Override
+    public int hashCode()
+    {
+        return units.hashCode() + raters.hashCode();
+    }
+}
diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/IAlignableAnnotationUnit.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/IAlignableAnnotationUnit.java
new file mode 100644
index 0000000..3bb0a50
--- /dev/null
+++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/IAlignableAnnotationUnit.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Technische Universität Darmstadt under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The Technische Universität Darmstadt
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.dkpro.statistics.agreement.aligning;
+
+import org.dkpro.statistics.agreement.IAnnotationUnit;
+
+/**
+ * Extension of the {@link IAnnotationUnit} interface for representing the annotation units of
+ * {@link IAligningAnnotationStudy}s. That is, an annotation unit that models the position of the
+ * unit within the continuum of an annotation study and the type and attributes assigned to this
+ * unit by a certain rater. Implement this interface when measuring inter-rater agreement using a
+ * {@link IAligningAgreementMeasure}.
+ * <p>
+ * Units are comparable with each other so that they can be kept in sorted collections.
+ *
+ * @see IAnnotationUnit
+ * @see IAligningAgreementMeasure
+ * @see IAligningAnnotationStudy
+ * @author Christian M. Meyer
+ * @author Richard Eckart de Castilho
+ */
+public interface IAlignableAnnotationUnit
+    extends IAnnotationUnit, Comparable<IAlignableAnnotationUnit>
+{
+    /** Type used for units that have not been assigned an explicit type. */
+    static final String NO_TYPE = "";
+
+    /**
+     * @return the length of the annotation unit (i.e., the difference between the end and start
+     *         position of the identified segment).
+     */
+    default long getLength()
+    {
+        return getEnd() - getBegin();
+    }
+
+    /**
+     * @return the offset of the annotation unit (i.e., the start position of the identified
+     *         segment).
+     */
+    long getBegin();
+
+    /**
+     * @return the right delimiter of the annotation unit (i.e., the end position of the identified
+     *         segment).
+     */
+    long getEnd();
+}
diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/IAligningAgreementMeasure.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/IAligningAgreementMeasure.java
new file mode 100644
index 0000000..b0f3b45
--- /dev/null
+++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/IAligningAgreementMeasure.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Technische Universität Darmstadt under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The Technische Universität Darmstadt
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.dkpro.statistics.agreement.aligning;
+
+import org.dkpro.statistics.agreement.IAgreementMeasure;
+
+/**
+ * Super interface for all {@link IAgreementMeasure}s for aligning tasks. That is, a measure of
+ * inter-rater agreement for {@link IAligningAnnotationStudy}s.
+ * <p>
+ * This is a marker interface; it declares no members beyond those of {@link IAgreementMeasure}.
+ *
+ * @see IAgreementMeasure
+ * @see IAligningAnnotationStudy
+ * @author Richard Eckart de Castilho
+ */
+public interface IAligningAgreementMeasure
+    extends IAgreementMeasure
+{
+    // No additional members
+}
diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/IAligningAnnotationStudy.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/IAligningAnnotationStudy.java
new file mode 100644
index 0000000..59eb4c5
--- /dev/null
+++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/IAligningAnnotationStudy.java
@@ -0,0 +1,27 @@
+/*
+ * Licensed to the Technische Universität Darmstadt under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The Technische Universität Darmstadt
+ * licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.dkpro.statistics.agreement.aligning;
+
+import org.dkpro.statistics.agreement.IAnnotationStudy;
+
+/**
+ * An {@link IAnnotationStudy} for aligning tasks, i.e. the kind of study that an
+ * {@link IAligningAgreementMeasure} operates on.
+ * <p>
+ * This is a marker interface; it declares no members beyond those of {@link IAnnotationStudy}.
+ *
+ * @see IAnnotationStudy
+ * @see IAligningAgreementMeasure
+ */
+public interface IAligningAnnotationStudy
+    extends IAnnotationStudy
+{
+    // No additional members
+}
diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/TextAligningAnnotationStudy.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/TextAligningAnnotationStudy.java
new file mode 100644
index 0000000..20fc9ce
--- /dev/null
+++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/TextAligningAnnotationStudy.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Original source: https://github.com/fab-bar/TextGammaTool.git
+ */
+package org.dkpro.statistics.agreement.aligning;
+
+import java.util.Collection;
+import java.util.List;
+
+import org.dkpro.statistics.agreement.aligning.data.AlignableAnnotationTextUnit;
+
+/**
+ * An {@link AligningAnnotationStudy} over a text. It keeps the text and only accepts units that are
+ * {@link AlignableAnnotationTextUnit}s.
+ */
+public class TextAligningAnnotationStudy
+    extends AligningAnnotationStudy
+{
+    private static final long serialVersionUID = 1013284003769257354L;
+
+    private final String text;
+
+    /**
+     * Creates a study without units.
+     *
+     * @param aText
+     *            the text the study refers to
+     */
+    public TextAligningAnnotationStudy(String aText)
+    {
+        text = aText;
+    }
+
+    /**
+     * Creates a study containing the given units.
+     *
+     * @param aText
+     *            the text the study refers to
+     * @param aUnits
+     *            the units to add
+     */
+    public TextAligningAnnotationStudy(String aText,
+            Collection<? extends AlignableAnnotationTextUnit> aUnits)
+    {
+        // The super constructor adds the units through addUnit(...), which must therefore not
+        // depend on any field of this class.
+        super(aUnits);
+        text = aText;
+    }
+
+    /**
+     * Adds a unit to the study.
+     *
+     * @param aUnit
+     *            the unit to add
+     * @throws IllegalArgumentException
+     *             if the unit is not an {@link AlignableAnnotationTextUnit}
+     */
+    @Override
+    public void addUnit(AlignableAnnotationUnit aUnit)
+    {
+        if (!(aUnit instanceof AlignableAnnotationTextUnit)) {
+            throw new IllegalArgumentException("AnnotatedText only accepts AlignableAnnotationTextUnit");
+        }
+
+        super.addUnit(aUnit);
+    }
+
+    /**
+     * @return the units of the study typed as text units.
+     */
+    // Safe because addUnit(...) rejects everything that is not an AlignableAnnotationTextUnit
+    @SuppressWarnings({ "unchecked", "rawtypes" })
+    public List<AlignableAnnotationTextUnit> getTextUnits()
+    {
+        return (List<AlignableAnnotationTextUnit>) (List) super.getUnits();
+    }
+
+    /**
+     * @return the text the study refers to.
+     */
+    public String getText()
+    {
+        return text;
+    }
+}
diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/TextGammaAgreement.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/TextGammaAgreement.java
new file mode 100644
index 0000000..ecf9f5d
--- /dev/null
+++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/TextGammaAgreement.java
@@ -0,0 +1,285 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Original source: https://github.com/fab-bar/TextGammaTool.git
+ */
+package org.dkpro.statistics.agreement.aligning;
+
+import static java.lang.Math.pow;
+import static java.lang.Math.round;
+import static java.util.Arrays.asList;
+import static org.dkpro.statistics.agreement.aligning.data.AnnotatedTextMerge.mergeAnnotatedTextsWithSegmentation;
+
+import java.lang.invoke.MethodHandles;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.commons.math3.distribution.NormalDistribution;
+import org.apache.commons.math3.stat.descriptive.moment.Mean;
+import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation;
+import org.dkpro.statistics.agreement.DisagreementMeasure;
+import org.dkpro.statistics.agreement.aligning.data.AnnotatedText;
+import org.dkpro.statistics.agreement.aligning.disorder.IDisorderSampler;
+import org.dkpro.statistics.agreement.aligning.disorder.IDisorderSamplerFactory;
+import org.dkpro.statistics.agreement.aligning.dissimilarity.IDissimilarity;
+import org.dkpro.statistics.agreement.aligning.dissimilarity.NominalFeatureTextDissimilarity;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Implementation of the Text-Gamma-measure for calculating a chance-corrected inter-rater agreement
+ * for aligning studies. This implementation compares exactly two raters: either two annotated
+ * texts with one rater each or a study containing two raters must be provided.
+ *
+ * References:
+ * <ul>
+ * <li>Barteld, F., Schröder, I., Zinsmeister, H.: tγ – Inter-annotator agreement for categorization
+ * with simultaneous segmentation and transcription-error correction. In Proceedings of the 13th
+ * Conference on Natural Language Processing (KONVENS 2016) pp. 27-37, 2016.</li>
+ * </ul>
+ *
+ * @author Fabian Barteld (original)
+ * @author Richard Eckart de Castilho (adaptation into DKPro Statistics)
+ */
+public class TextGammaAgreement
+    extends DisagreementMeasure
+    implements IAligningAgreementMeasure
+{
+    private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+    public static final char OPEN_UNIT = '\uFDD0';
+    public static final char CLOSE_UNIT = '\uFDD1';
+    public static final char GAP = '\uFDD2';
+
+    static {
+        checkConstants();
+    }
+
+    private final AnnotatedText text1;
+    private final AnnotatedText text2;
+    private final IDissimilarity dissimilarity;
+    private final IDisorderSampler sampler;
+    private final double precision;
+    private final double alpha;
+
+    private TextGammaAgreement(Builder builder)
+    {
+        dissimilarity = builder.dissimilarity;
+        precision = builder.precision;
+        alpha = builder.alpha;
+
+        if (builder.texts != null && builder.study != null) {
+            throw new IllegalArgumentException(
+                    "Either texts or a study must be given but not both");
+        }
+
+        if (builder.texts == null && builder.study == null) {
+            throw new IllegalArgumentException("Either texts or a study must be given");
+        }
+
+        if (builder.texts != null) {
+            if (builder.texts.size() != 2) {
+                throw new IllegalArgumentException("Exactly two texts must be compared");
+            }
+
+            if (builder.texts.get(0).getRaterCount() != 1
+                    || builder.texts.get(1).getRaterCount() != 1) {
+                throw new IllegalArgumentException("Each text must have exactly one rater");
+            }
+
+            text1 = builder.texts.get(0);
+            text2 = builder.texts.get(1);
+        }
+        else {
+            if (builder.study.getRaters().size() != 2) {
+                throw new IllegalArgumentException("The study must contain exactly two raters");
+            }
+
+            // Split the study into one annotated text per rater
+            var texts = new ArrayList<AnnotatedText>();
+            for (var rater : builder.study.getRaters()) {
+                var units = builder.study.getTextUnits().stream() //
+                        .filter(u -> rater.equals(u.getRater())) //
+                        .toList();
+                texts.add(new AnnotatedText(builder.study.getText(), units));
+            }
+
+            text1 = texts.get(0);
+            text2 = texts.get(1);
+        }
+
+        // The factory receives this instance, so it has to run after the texts and the
+        // dissimilarity have been set.
+        if (builder.sampler != null) {
+            sampler = builder.sampler;
+        }
+        else if (builder.samplerFactory != null) {
+            sampler = builder.samplerFactory.create(this);
+        }
+        else {
+            throw new IllegalArgumentException(
+                    "Either a disorder sampler or sampler factory must be given");
+        }
+    }
+
+    /**
+     * @return the dissimilarity used to compare units.
+     */
+    public IDissimilarity getDissimilarity()
+    {
+        return dissimilarity;
+    }
+
+    /**
+     * @return the two annotated texts that are compared.
+     */
+    public List<AnnotatedText> getTexts()
+    {
+        return asList(text1, text2);
+    }
+
+    @Override
+    public double calculateObservedDisagreement()
+    {
+        return getObservedDisorder(text1, text2, dissimilarity);
+    }
+
+    @Override
+    public double calculateExpectedDisagreement()
+    {
+        return calculateExpectedDisagreement(sampler, alpha, precision);
+    }
+
+    /**
+     * Estimates the expected disorder as the mean of sampled disorders. Sampling starts with 30
+     * samples. Afterwards, the required number of samples is re-estimated from the coefficient of
+     * variation {@code cv} of the samples drawn so far as
+     * {@code round((cv * z / aPrecision)^2)}, where {@code z} is the {@code 1 - aAlpha / 2}
+     * quantile of the standard normal distribution. Sampling continues until the number of
+     * samples reaches that estimate.
+     *
+     * @param aDisorderSampler
+     *            the source of disorder samples
+     * @param aAlpha
+     *            must be greater than 0 and smaller than 2
+     * @param aPrecision
+     *            must be a non-zero number
+     * @return the mean of the sampled disorders.
+     * @throws IllegalArgumentException
+     *             if {@code aAlpha} or {@code aPrecision} is outside the range given above
+     */
+    static double calculateExpectedDisagreement(IDisorderSampler aDisorderSampler, double aAlpha,
+            double aPrecision)
+    {
+        // Outside of these ranges the estimated sample count becomes practically infinite (or the
+        // quantile is undefined), so fail fast instead of sampling forever.
+        if (aPrecision == 0.0 || Double.isNaN(aPrecision)) {
+            throw new IllegalArgumentException(
+                    "Precision must be a non-zero number but was " + aPrecision);
+        }
+        if (!(aAlpha > 0.0 && aAlpha < 2.0)) {
+            throw new IllegalArgumentException(
+                    "Alpha must be greater than 0 and smaller than 2 but was " + aAlpha);
+        }
+
+        var n = 30L;
+
+        var m = new Mean();
+        var v = new StandardDeviation(true);
+
+        var sn = new NormalDistribution(0, 1);
+
+        // Outer loop: repeat until the re-estimated sample count has been reached
+        while (m.getN() < n) {
+            // Inner loop: draw samples up to the current estimate
+            while (m.getN() < n) {
+                var disorder = aDisorderSampler.sampleDisorder();
+                m.increment(disorder);
+                v.increment(disorder);
+            }
+
+            // re-estimate n
+            double meanDisorder = m.getResult();
+
+            double sdDisorder = v.getResult();
+            double cv = sdDisorder / meanDisorder;
+            n = round(pow(cv * sn.inverseCumulativeProbability(1 - (aAlpha / 2)) / aPrecision, 2));
+        }
+
+        return m.getResult();
+    }
+
+    /**
+     * Calculates the observed disorder as the minimum disorder over all alignments obtained by
+     * merging the two texts.
+     *
+     * @param aText1
+     *            the first annotated text
+     * @param aText2
+     *            the second annotated text
+     * @param aDissimilarity
+     *            the dissimilarity used to compare units
+     * @return the minimal disorder or {@link Double#MAX_VALUE} if the merge yields no alignment.
+     */
+    public static double getObservedDisorder(AnnotatedText aText1, AnnotatedText aText2,
+            IDissimilarity aDissimilarity)
+    {
+        var minDisorder = Double.MAX_VALUE;
+
+        for (var al : mergeAnnotatedTextsWithSegmentation(aText1, aText2)) {
+            var disorder = al.getDisorder(aDissimilarity);
+            if (disorder < minDisorder) {
+                minDisorder = disorder;
+            }
+        }
+
+        return minDisorder;
+    }
+
+    /**
+     * @return a new builder for this measure.
+     */
+    public static Builder builder()
+    {
+        return new Builder();
+    }
+
+    /**
+     * Builder for {@link TextGammaAgreement}. Requires either a study or two texts as well as
+     * either a disorder sampler or a disorder sampler factory. Setting one alternative clears the
+     * other one.
+     */
+    public static final class Builder
+    {
+        private TextAligningAnnotationStudy study;
+        private List<AnnotatedText> texts;
+        private IDissimilarity dissimilarity = new NominalFeatureTextDissimilarity();
+        private IDisorderSampler sampler;
+        private IDisorderSamplerFactory samplerFactory;
+        private double precision = 0.01;
+        private double alpha = 0.05;
+
+        private Builder()
+        {
+        }
+
+        public Builder withStudy(TextAligningAnnotationStudy aStudy)
+        {
+            study = aStudy;
+            texts = null;
+            return this;
+        }
+
+        public Builder withTexts(AnnotatedText... aTexts)
+        {
+            texts = asList(aTexts);
+            study = null;
+            return this;
+        }
+
+        public Builder withDissimilarity(IDissimilarity aDissimilarity)
+        {
+            dissimilarity = aDissimilarity;
+            return this;
+        }
+
+        public Builder withDisorderSampler(IDisorderSampler aSampler)
+        {
+            sampler = aSampler;
+            samplerFactory = null;
+            return this;
+        }
+
+        public Builder withDisorderSampler(IDisorderSamplerFactory aSampler)
+        {
+            samplerFactory = aSampler;
+            sampler = null;
+            return this;
+        }
+
+        public Builder withPrecision(double aPrecision)
+        {
+            precision = aPrecision;
+            return this;
+        }
+
+        public Builder withAlpha(double aAlpha)
+        {
+            alpha = aAlpha;
+            return this;
+        }
+
+        /**
+         * @return the configured measure.
+         * @throws IllegalArgumentException
+         *             if the configuration is incomplete or inconsistent
+         */
+        public TextGammaAgreement build()
+        {
+            return new TextGammaAgreement(this);
+        }
+    }
+
+    @SuppressWarnings("unused")
+    private static void checkConstants()
+    {
+        // assure that the characters denoting beginning and end of units and gaps differ from each
+        // other
+        if (OPEN_UNIT == CLOSE_UNIT) {
+            throw new IllegalArgumentException(
+                    "Characters denoting the start and the end of a unit must differ.");
+        }
+        if (OPEN_UNIT == GAP) {
+            throw new IllegalArgumentException(
+                    "Characters denoting the start of a unit and a gap must differ.");
+        }
+        if (CLOSE_UNIT == GAP) {
+            throw new IllegalArgumentException(
+                    "Characters denoting the end of a unit and a gap must differ.");
+        }
+    }
+}
diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/alignment/Alignment.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/alignment/Alignment.java
new file mode 100644
index 0000000..ee22688 --- /dev/null +++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/alignment/Alignment.java @@ -0,0 +1,106 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Original source: https://github.com/fab-bar/TextGammaTool.git + */ +package org.dkpro.statistics.agreement.aligning.alignment; + +import static java.lang.String.join; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; + +import org.dkpro.statistics.agreement.aligning.AlignableAnnotationUnit; +import org.dkpro.statistics.agreement.aligning.data.AnnotationSet; +import org.dkpro.statistics.agreement.aligning.dissimilarity.IDissimilarity; + +public class Alignment +{ + private final Set alignments; + private final AnnotationSet annotationSet; + + public Alignment(Set aAlignments, AnnotationSet aAnnotationSet) + { + if (aAnnotationSet.getRaterCount() == 1) { + throw new IllegalArgumentException("An Alignment needs to have at least 2 annotators."); + } + + var raters = aAnnotationSet.getRaters(); + + var units = new HashSet(); + + for (var alignment : aAlignments) { + if (!raters.equals(alignment.getRaters())) { + throw new IllegalArgumentException( + "Not all unitary alignments have the same set of annotors."); + } + + for (var creator : raters) { + var unit = alignment.getUnit(creator); + if (unit != null) { + if (units.contains(unit)) { + throw new 
IllegalArgumentException( + "A unit is contained twice in the unitary alignments."); + } + else if (!aAnnotationSet.contains(unit)) { + throw new IllegalArgumentException( + "A unit is contained the unitary alignments but not in the annotation set."); + } + else { + units.add(unit); + } + } + } + } + + if (!units.containsAll(aAnnotationSet.getUnits())) { + throw new IllegalArgumentException( + "Not all units from the set are contained in the unitary alignments."); + } + + alignments = aAlignments; + annotationSet = aAnnotationSet; + } + + public double getDisorder(IDissimilarity d) + { + double disorder = 0; + + for (var alignment : alignments) { + disorder += alignment.getDisorder(d); + } + + return disorder / annotationSet.getAverageNumberOfAnnotations(); + } + + @Override + public String toString() + { + return toString(new ArrayList()); + } + + public String toString(List attributes) + { + var units_this = new ArrayList( + new TreeSet(this.alignments)); + var uas = new ArrayList(units_this.size()); + for (var ua : units_this) { + uas.add(ua.toString(attributes)); + } + + return join("\n", uas); + } +} diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/alignment/ITextAlignment.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/alignment/ITextAlignment.java new file mode 100644 index 0000000..ecfef64 --- /dev/null +++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/alignment/ITextAlignment.java @@ -0,0 +1,31 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Original source: https://github.com/fab-bar/TextGammaTool.git + */ +package org.dkpro.statistics.agreement.aligning.alignment; + +import java.util.List; + +public interface ITextAlignment +{ + List getAlignments(); + + int getInsertions(); + + int getDeletions(); + + int getSubstitutions(); + + int getLength(); +} diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/alignment/PairwiseDPTextAlignment.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/alignment/PairwiseDPTextAlignment.java new file mode 100644 index 0000000..c2d6948 --- /dev/null +++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/alignment/PairwiseDPTextAlignment.java @@ -0,0 +1,301 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * Original source: https://github.com/fab-bar/TextGammaTool.git + */ +package org.dkpro.statistics.agreement.aligning.alignment; + +import static org.dkpro.statistics.agreement.aligning.TextGammaAgreement.CLOSE_UNIT; +import static org.dkpro.statistics.agreement.aligning.TextGammaAgreement.GAP; +import static org.dkpro.statistics.agreement.aligning.TextGammaAgreement.OPEN_UNIT; + +import java.util.LinkedList; +import java.util.List; +import java.util.Set; + +import org.dkpro.statistics.agreement.aligning.data.AnnotatedText; + +/** + * Align the annotations of two texts using dynamic programming. + */ +public class PairwiseDPTextAlignment + implements ITextAlignment +{ + private final char[] textA; + private final char[] textB; + private final int wGap; + + private final char alignChar; + private final char openUnit; + private final Set metaChars; + + private int[][] alignmentMatrix; + + private List alignments = null; + + public PairwiseDPTextAlignment(AnnotatedText aText1, + AnnotatedText aText2) + { + this(insertAnnotationsInText(aText1).toCharArray(), + insertAnnotationsInText(aText2).toCharArray(), GAP, OPEN_UNIT, CLOSE_UNIT); + } + + public PairwiseDPTextAlignment(char[] textA, char[] textB, char alignChar, char openUnit, + char closeUnit) + { + this(textA, textB, alignChar, openUnit, closeUnit, 1); + } + + public PairwiseDPTextAlignment(char[] aTextA, char[] aTextB, char aAlignChar, char aOpenUnit, + char aCloseUnit, int aWGap) + { + textA = aTextA; + textB = aTextB; + + alignChar = aAlignChar; + + openUnit = aOpenUnit; + metaChars = Set.of(aOpenUnit, aCloseUnit); + + wGap = aWGap; + } + + private void fillMatrix() + { + alignmentMatrix = new int[textA.length + 1][textB.length + 1]; + + // initialize first row and column + for (int i = 0; i <= textA.length; i++) { + alignmentMatrix[i][0] = i * wGap; + } + for (int j = 0; j <= textB.length; j++) { + alignmentMatrix[0][j] = j * wGap; + } + + for (int i = 1; i <= textA.length; i++) { + for (int j = 1; j <= 
textB.length; j++) { + int costAlign = alignmentMatrix[i - 1][j - 1] + weight(i, j); + int costGapB = alignmentMatrix[i][j - 1] + wGap; + int costGapA = alignmentMatrix[i - 1][j] + wGap; + alignmentMatrix[i][j] = Math.min(Math.min(costAlign, costGapB), costGapA); + } + } + } + + private void backtrack_step(int i, int j, String aAlignmentSeqA, String aAlignmentSeqB, + List aAlignments) + { + if (i > 0 || j > 0) { + // if one meta-char + if ((i > 0 && metaChars.contains(textA[i - 1])) + || (j > 0 && metaChars.contains(textB[j - 1]))) { + // if both opening: align + if (i > 0 && j > 0 + && alignmentMatrix[i][j] == alignmentMatrix[i - 1][j - 1] + weight(i, j) + && openUnit == textA[i - 1]) { + backtrack_step(i - 1, j - 1, aAlignmentSeqA + textA[i - 1], + aAlignmentSeqB + textB[j - 1], aAlignments); + } + // if only one is opening: align the other + else if (i > 0 && j > 0 && openUnit == textA[i - 1] + && alignmentMatrix[i][j] == alignmentMatrix[i][j - 1] + wGap) { + backtrack_step(i, j - 1, aAlignmentSeqA + alignChar, + aAlignmentSeqB + textB[j - 1], aAlignments); + } + else if (i > 0 && j > 0 && openUnit == textB[j - 1] + && alignmentMatrix[i][j] == alignmentMatrix[i - 1][j] + wGap) { + backtrack_step(i - 1, j, aAlignmentSeqA + textA[i - 1], + aAlignmentSeqB + alignChar, aAlignments); + } + // else - try all possible alignments + else { + if (i > 0 && j > 0 && alignmentMatrix[i][j] == alignmentMatrix[i - 1][j - 1] + + weight(i, j)) { + backtrack_step(i - 1, j - 1, aAlignmentSeqA + textA[i - 1], + aAlignmentSeqB + textB[j - 1], aAlignments); + } + if (j > 0 && alignmentMatrix[i][j] == alignmentMatrix[i][j - 1] + wGap) { + backtrack_step(i, j - 1, aAlignmentSeqA + alignChar, + aAlignmentSeqB + textB[j - 1], aAlignments); + } + if (i > 0 && alignmentMatrix[i][j] == alignmentMatrix[i - 1][j] + wGap) { + backtrack_step(i - 1, j, aAlignmentSeqA + textA[i - 1], + aAlignmentSeqB + alignChar, aAlignments); + } + } + } + // otherwise: align if possible, or add gap in first 
sequence or in second sequence + else if (i > 0 && j > 0 + && alignmentMatrix[i][j] == alignmentMatrix[i - 1][j - 1] + weight(i, j)) { + backtrack_step(i - 1, j - 1, aAlignmentSeqA + textA[i - 1], + aAlignmentSeqB + textB[j - 1], aAlignments); + } + else if (j > 0 && alignmentMatrix[i][j] == alignmentMatrix[i][j - 1] + wGap) { + backtrack_step(i, j - 1, aAlignmentSeqA + alignChar, aAlignmentSeqB + textB[j - 1], + aAlignments); + } + else if (i > 0 && alignmentMatrix[i][j] == alignmentMatrix[i - 1][j] + wGap) { + backtrack_step(i - 1, j, aAlignmentSeqA + textA[i - 1], aAlignmentSeqB + alignChar, + aAlignments); + } + + } + else { + String[] ret = { new StringBuilder(aAlignmentSeqA).reverse().toString(), + new StringBuilder(aAlignmentSeqB).reverse().toString() }; + aAlignments.add(ret); + } + } + + private void backtrack() + { + alignments = new LinkedList(); + backtrack_step(textA.length, textB.length, "", "", alignments); + } + + private int weight(int i, int j) + { + if (textA[i - 1] == textB[j - 1]) { + return 0; + } + else { + // only allow gaps! 
+ return wGap * 2 + 1; + } + } + + private void computeAlignments() + { + fillMatrix(); + backtrack(); + } + + @Override + public List getAlignments() + { + if (alignments == null) { + this.computeAlignments(); + } + return alignments; + } + + @Override + public int getInsertions() + { + if (alignments == null) { + this.computeAlignments(); + } + + int i = textA.length; + int j = textB.length; + + int ins = 0; + + while (i > 0 || j > 0) { + if (i > 0 && j > 0 + && alignmentMatrix[i][j] == alignmentMatrix[i - 1][j - 1] + weight(i, j)) { + i = i - 1; + j = j - 1; + } + else if (j > 0 && alignmentMatrix[i][j] == alignmentMatrix[i][j - 1] + wGap) { + j = j - 1; + ins = ins + 1; + } + else if (i > 0 && alignmentMatrix[i][j] == alignmentMatrix[i - 1][j] + wGap) { + i = i - 1; + } + } + return ins; + + } + + @Override + public int getDeletions() + { + if (alignments == null) { + this.computeAlignments(); + } + int i = textA.length; + int j = textB.length; + + int del = 0; + + while (i > 0 || j > 0) { + if (i > 0 && j > 0 + && alignmentMatrix[i][j] == alignmentMatrix[i - 1][j - 1] + weight(i, j)) { + i = i - 1; + j = j - 1; + } + else if (j > 0 && alignmentMatrix[i][j] == alignmentMatrix[i][j - 1] + wGap) { + j = j - 1; + } + else if (i > 0 && alignmentMatrix[i][j] == alignmentMatrix[i - 1][j] + wGap) { + i = i - 1; + del = del + 1; + } + } + return del; + } + + @Override + public int getSubstitutions() + { + // the cost are set such that subs are not allowed! 
+ return 0; + } + + @Override + public int getLength() + { + if (alignments == null) { + this.computeAlignments(); + } + return alignments.get(0)[0].length(); + } + + private static String insertAnnotationsInText(AnnotatedText aText) + { + // assure that the characters denoting beginning and end of units and gaps do not appear in + // the text + if (aText.getText().indexOf(OPEN_UNIT) != -1) { + throw new IllegalArgumentException( + "The character denoting the start of a unit may not appear in the text."); + } + else if (aText.getText().indexOf(CLOSE_UNIT) != -1) { + throw new IllegalArgumentException( + "The character denoting the end of a unit may not appear in the text."); + } + else if (aText.getText().indexOf(GAP) != -1) { + throw new IllegalArgumentException( + "The character denoting a gap may not appear in the text."); + } + + var text = new StringBuilder(aText.getText()); + long pos = 0; + long offset = 0; + + for (var annot : aText.getUnits()) { + if (annot.getBegin() + offset < pos) { + // units overlap + throw new IllegalArgumentException("Text contains overlapping units."); + } + + text.insert((int) (annot.getBegin() + offset), OPEN_UNIT); + offset = offset + 2; + pos = annot.getEnd() + offset; + text.insert((int) (pos - 1), CLOSE_UNIT); + } + + return text.toString(); + } +} diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/alignment/UnitaryAlignment.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/alignment/UnitaryAlignment.java new file mode 100644 index 0000000..15d7fbf --- /dev/null +++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/alignment/UnitaryAlignment.java @@ -0,0 +1,243 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Original source: https://github.com/fab-bar/TextGammaTool.git + */ +package org.dkpro.statistics.agreement.aligning.alignment; + +import static java.lang.Long.MAX_VALUE; +import static java.lang.Long.MIN_VALUE; + +import java.util.ArrayList; +import java.util.Collection; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.TreeSet; + +import org.apache.commons.math3.util.CombinatoricsUtils; +import org.dkpro.statistics.agreement.aligning.AlignableAnnotationUnit; +import org.dkpro.statistics.agreement.aligning.data.Rater; +import org.dkpro.statistics.agreement.aligning.dissimilarity.IDissimilarity; + +public class UnitaryAlignment + implements Comparable +{ + private final long begin; + private final long end; + + private final Map units; + + public UnitaryAlignment(Collection aUnits, Set aAnnotators) + { + var b = MAX_VALUE; + var e = MIN_VALUE; + + units = new HashMap(); + for (var unit : aUnits) { + if (units.containsKey(unit.getRater())) { + throw new IllegalArgumentException( + "Unitary alignment may not contain two units from the same creator"); + } + if (!aAnnotators.contains(unit.getRater())) { + throw new IllegalArgumentException( + "Unitary alignment may only contain units created by annotators from the given set."); + } + + if (unit.getBegin() < b) { + b = unit.getBegin(); + } + if (unit.getEnd() > e) { + e = unit.getEnd(); + } + + units.put(unit.getRater(), unit); + } + + // add empty elements + for (Rater annot : aAnnotators) { + if 
(!units.containsKey(annot)) { + units.put(annot, null); + } + } + + begin = b; + end = e; + } + + public int arity() + { + return units.size(); + } + + public long getBegin() + { + return this.begin; + } + + public long getEnd() + { + return this.end; + } + + public Set getRaters() + { + return this.units.keySet(); + } + + public AlignableAnnotationUnit getUnit(Rater creator) + { + return units.get(creator); + } + + public double getDisorder(IDissimilarity d) + { + double dissim = 0; + + AlignableAnnotationUnit[] annots = units.values().toArray(new AlignableAnnotationUnit[0]); + + for (int i = 0; i < annots.length; i++) { + for (int j = i + 1; j < annots.length; j++) { + dissim += d.dissimilarity(annots[i], annots[j]); + } + } + + return dissim / (double) CombinatoricsUtils.binomialCoefficient(this.arity(), 2); + + } + + @Override + public int hashCode() + { + return this.units.hashCode(); + } + + @Override + public boolean equals(Object o) + { + if (o == null) { + return false; + } + if (getClass() != o.getClass()) { + return false; + } + + UnitaryAlignment comp = (UnitaryAlignment) o; + + if (this.arity() != comp.arity()) { + return false; + } + + if (!Objects.equals(this.getRaters(), comp.getRaters())) { + return false; + } + + for (Rater creator : this.getRaters()) { + if (!Objects.equals(this.getUnit(creator), comp.getUnit(creator))) { + return false; + } + } + + return true; + } + + @Override + public int compareTo(UnitaryAlignment ua) + { + + if (this.equals(ua)) { + return 0; + } + + if (ua == null) { + return -1; + } + + // order by arity + if (this.arity() != ua.arity()) { + return Integer.compare(this.arity(), ua.arity()); + } + + // order by annotator sets if they are not equal + if (!Objects.equals(this.getRaters(), ua.getRaters())) { + + // size of the sets are equal (due to equal arity) + // order by the first differing Annotator + + List annot_this = new ArrayList(new TreeSet(this.getRaters())); + List annot_comp = new ArrayList(new 
TreeSet(ua.getRaters())); + + for (int i = 0; i < annot_this.size(); i++) { + if (!Objects.equals(annot_this.get(i), annot_comp.get(i))) { + return annot_this.get(i).compareTo(annot_comp.get(i)); + } + } + } + + // order by the span covered by the units in this alignment + if (this.getBegin() != ua.getBegin()) { + return Long.compare(this.getBegin(), ua.getBegin()); + } + if (this.getEnd() != ua.getEnd()) { + return Long.compare(this.getEnd(), ua.getEnd()); + } + + // the numbers of units and the span covered are the same + // annotations must differ + // order by the first differing annotation + for (Rater creator : new ArrayList(new TreeSet(this.getRaters()))) { + if (!Objects.equals(this.getUnit(creator), ua.getUnit(creator))) { + if (this.getUnit(creator) == null) { + return 1; + } + else { + return this.getUnit(creator).compareTo(ua.getUnit(creator)); + } + } + } + + // should never be reached + return 0; + } + + @Override + public String toString() + { + return this.toString(new ArrayList()); + } + + public String toString(List attributes) + { + var us = new ArrayList(this.arity()); + + var empty = new StringBuilder(); + empty.append("--"); + for (int i = 0; i < attributes.size(); i++) { + empty.append("\t--"); + } + String empty_unit = empty.toString(); + + for (var creator : new ArrayList(new TreeSet(getRaters()))) { + if (this.getUnit(creator) != null) { + us.add(this.getUnit(creator).toString(attributes)); + } + else { + us.add(empty_unit); + } + } + + return String.join("\t", us); + } +} diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/data/AlignableAnnotationTextUnit.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/data/AlignableAnnotationTextUnit.java new file mode 100644 index 0000000..e0b8820 --- /dev/null +++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/data/AlignableAnnotationTextUnit.java @@ -0,0 +1,132 @@ +/* + * Licensed 
under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Original source: https://github.com/fab-bar/TextGammaTool.git + */ +package org.dkpro.statistics.agreement.aligning.data; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +import org.dkpro.statistics.agreement.aligning.AlignableAnnotationUnit; + +public class AlignableAnnotationTextUnit + extends AlignableAnnotationUnit +{ + public static final String TYPE = "textunit"; + + private static final long serialVersionUID = 1093784487189619115L; + + protected final String text; + + public static AlignableAnnotationTextUnit textUnit(Rater aRater, long aBegin, long aEnd, + String aText) + { + return new AlignableAnnotationTextUnit(aRater, aBegin, aEnd, aText); + } + + public static AlignableAnnotationTextUnit textUnit(Rater aRater, long aBegin, long aEnd, + String aText, Map aFeatureSet) + { + return new AlignableAnnotationTextUnit(aRater, aBegin, aEnd, aText, aFeatureSet); + } + + public AlignableAnnotationTextUnit(Rater aRater, long aBegin, long aEnd, String aText) + { + this(aRater, aBegin, aEnd, aText, null); + } + + public AlignableAnnotationTextUnit(Rater aRater, long aBegin, long aEnd, String aText, + Map aFeatureSet) + { + super(aRater, TYPE, aBegin, aEnd, aFeatureSet); + + if (aText == null) { + aText = ""; + } + + text = aText; + } + + public String getText() + { + return text; + } + + @Override + public AlignableAnnotationTextUnit cloneWithDifferentLabel(String 
aType, String aLabel) + { + var feat = new HashMap<>(getFeatures()); + feat.put(aType, aLabel); + return new AlignableAnnotationTextUnit(getRater(), getBegin(), getEnd(), text, feat); + } + + @Override + public AlignableAnnotationTextUnit cloneWithDifferentOffsets(long aBegin, long aEnd) + { + return new AlignableAnnotationTextUnit(getRater(), aBegin, aEnd, text, getFeatures()); + } + + @Override + public AlignableAnnotationTextUnit cloneWithDifferentRater(Rater aCreator) + { + return new AlignableAnnotationTextUnit(aCreator, getBegin(), getEnd(), text, getFeatures()); + } + + public AlignableAnnotationTextUnit cloneWithDifferentText(String aText) + { + return new AlignableAnnotationTextUnit(getRater(), getBegin(), getEnd(), aText, + getFeatures()); + } + + @Override + public String toString(List attributes) + { + var ret = new StringBuilder(); + ret.append(this.getText()); + + for (var attribute : attributes) { + ret.append("\t"); + if (getFeatureValue(attribute) != null) { + ret.append(getFeatureValue(attribute)); + } + else { + ret.append("--"); + } + } + + return ret.toString(); + } + + @Override + public boolean equals(final Object other) + { + if (!(other instanceof AlignableAnnotationTextUnit)) { + return false; + } + if (!super.equals(other)) { + return false; + } + + AlignableAnnotationTextUnit castOther = (AlignableAnnotationTextUnit) other; + return Objects.equals(text, castOther.text); + } + + @Override + public int hashCode() + { + return Objects.hash(super.hashCode(), text); + } +} diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/data/AnnotatedText.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/data/AnnotatedText.java new file mode 100644 index 0000000..fc126c6 --- /dev/null +++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/data/AnnotatedText.java @@ -0,0 +1,63 @@ +/* + * Licensed under the Apache License, Version 2.0 (the 
"License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Original source: https://github.com/fab-bar/TextGammaTool.git + */ +package org.dkpro.statistics.agreement.aligning.data; + +import java.util.Collection; +import java.util.List; + +import org.dkpro.statistics.agreement.aligning.AlignableAnnotationUnit; + +public class AnnotatedText + extends AnnotationSet +{ + private final String text; + + public AnnotatedText(String aText, Collection aUnits) + { + super(aUnits); + + if (lowestOffset < 0) { + throw new IndexOutOfBoundsException("Lowest offset < 0"); + } + + if (highestOffset > aText.length()) { + throw new IndexOutOfBoundsException("Highest offset > length of text"); + } + + text = aText; + } + + @Override + public void addUnit(AlignableAnnotationUnit aUnit) + { + if (!(aUnit instanceof AlignableAnnotationTextUnit)) { + throw new IllegalArgumentException("AnnotatedText only accepts AlignableAnnotationTextUnit"); + } + + super.addUnit(aUnit); + } + + @SuppressWarnings({ "unchecked", "rawtypes" }) + public List getTextUnits() + { + return (List) (List) super.getUnits(); + } + + public String getText() + { + return text; + } +} diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/data/AnnotatedTextMerge.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/data/AnnotatedTextMerge.java new file mode 100644 index 0000000..cc451b1 --- /dev/null +++ 
b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/data/AnnotatedTextMerge.java @@ -0,0 +1,191 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Original source: https://github.com/fab-bar/TextGammaTool.git + */ +package org.dkpro.statistics.agreement.aligning.data; + +import static java.util.Arrays.asList; +import static org.dkpro.statistics.agreement.aligning.TextGammaAgreement.CLOSE_UNIT; +import static org.dkpro.statistics.agreement.aligning.TextGammaAgreement.OPEN_UNIT; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.dkpro.statistics.agreement.aligning.AlignableAnnotationUnit; +import org.dkpro.statistics.agreement.aligning.alignment.Alignment; +import org.dkpro.statistics.agreement.aligning.alignment.PairwiseDPTextAlignment; +import org.dkpro.statistics.agreement.aligning.alignment.UnitaryAlignment; + +public class AnnotatedTextMerge +{ + /** + * Merges two AnnotatedText's using Needleman-Wunsch'- the annotations are not allowed to + * overlap + * + * @return a set of all possible segmentations of the resulting "Text" + */ + public static Set mergeAnnotatedTextsWithSegmentation( + AnnotatedText aText1, AnnotatedText aText2) + { + return mergeAnnotatedTextsWithSegmentation(aText1, aText2, -1); + } + + public static Set mergeAnnotatedTextsWithSegmentation( + AnnotatedText aText1, AnnotatedText aText2, + int aLevenshteinThreshold) + { + var 
aligner = new PairwiseDPTextAlignment(aText1, aText2); + + // get the alignment cost and throw an Exception if cost is above a threshold + if (aLevenshteinThreshold > -1) { + int lev = aligner.getInsertions() + aligner.getDeletions() + aligner.getSubstitutions(); + if (lev > aLevenshteinThreshold) { + throw new IllegalArgumentException( + "The texts are more different than given threshold."); + } + } + + // create annotation sets with the new annotations + var bestAlignedUnits = new HashSet>(); + int bestOverlap = 0; + + var allUnits = new ArrayList(2); + allUnits.add(aText1.getUnits().toArray(AlignableAnnotationUnit[]::new)); + allUnits.add(aText2.getUnits().toArray(AlignableAnnotationUnit[]::new)); + + for (var alignment : aligner.getAlignments()) { + var alignedAnnotations = getUnitsFromText(asList(alignment), allUnits); + AlignableAnnotationUnit[] units1 = alignedAnnotations.get(0); + AlignableAnnotationUnit[] units2 = alignedAnnotations.get(1); + + int overlap = countPairwiseColocatedUnits(units1, units2); + + // add only those with the best overlap + if (overlap >= bestOverlap) { + // found an alignment with better overlap than all alignments before + if (overlap > bestOverlap) { + bestAlignedUnits.clear(); + bestOverlap = overlap; + } + + var unitGroup = new ArrayList(); + unitGroup.addAll(asList(units1)); + unitGroup.addAll(asList(units2)); + + bestAlignedUnits.add(unitGroup); + } + } + + // create alignments + var alignmentSet = new HashSet(); + for (var units : bestAlignedUnits) { + var alignedUnits = new HashSet(); + var annoset = new AnnotationSet(units); + var alignments = new ArrayList(annoset.getRaterCount()); + + AlignableAnnotationUnit lastUnit = null; + for (var u : annoset.getUnits()) { + if (lastUnit != null && !u.isCoextensive(lastUnit)) { + alignedUnits.add(new UnitaryAlignment(alignments, annoset.getRaters())); + alignments.clear(); + } + + alignments.add(u); + lastUnit = u; + } + + if (!alignments.isEmpty()) { + alignedUnits.add(new 
UnitaryAlignment(alignments, annoset.getRaters())); + } + + alignmentSet.add(new Alignment(alignedUnits, annoset)); + } + + return alignmentSet; + + } + + private static int countPairwiseColocatedUnits(AlignableAnnotationUnit[] units1, + AlignableAnnotationUnit[] units2) + { + int overlap = 0; + for (var u1 : units1) { + for (var u2 : units2) { + if (u1.getBegin() == u2.getBegin() && u1.getEnd() == u2.getEnd()) { + overlap++; + } + } + } + return overlap; + } + + private static List getUnitsFromText(List texts, + List aAllUnits) + { + if (texts.isEmpty()) { + return null; + } + + if (texts.size() != aAllUnits.size()) { + // number of texts has to be equal to the number of annotation lists + throw new IllegalArgumentException("Numbers of annotation lists and texts differ."); + } + + int textLength = texts.get(0).length(); + for (var text : texts) { + if (text.length() != textLength) { + throw new IllegalArgumentException("Texts have to be of the same length."); + } + } + + var newUnits = new ArrayList(aAllUnits.size()); + var currentBegins = new ArrayList(aAllUnits.size()); + var annotNumbers = new ArrayList(aAllUnits.size()); + + for (int i = 0; i < aAllUnits.size(); i++) { + newUnits.add(new AlignableAnnotationUnit[aAllUnits.get(i).length]); + currentBegins.add(0); + annotNumbers.add(0); + } + + int offset = 0; + for (int i = 0; i < texts.get(0).length(); i++) { + // only change offset once per position + var offsetChange = false; + + for (int j = 0; j < texts.size(); j++) { + var text = texts.get(j); + if (text.charAt(i) == OPEN_UNIT) { + currentBegins.set(j, i - offset); + offsetChange = true; + } + else if (text.charAt(i) == CLOSE_UNIT) { + newUnits.get(j)[annotNumbers.get(j)] = aAllUnits.get(j)[annotNumbers.get(j)] + .cloneWithDifferentOffsets(currentBegins.get(j), i - offset); + + annotNumbers.set(j, annotNumbers.get(j) + 1); + offsetChange = true; + } + } + + if (offsetChange) { + offset += 1; + } + } + + return newUnits; + } +} diff --git 
a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/data/AnnotationSet.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/data/AnnotationSet.java new file mode 100644 index 0000000..ee29455 --- /dev/null +++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/data/AnnotationSet.java @@ -0,0 +1,155 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Original source: https://github.com/fab-bar/TextGammaTool.git + */ +package org.dkpro.statistics.agreement.aligning.data; + +import static java.lang.Integer.MAX_VALUE; +import static java.util.Arrays.asList; +import static java.util.Collections.unmodifiableList; +import static java.util.Collections.unmodifiableSet; + +import java.util.Collection; +import java.util.HashSet; +import java.util.List; +import java.util.Objects; +import java.util.Set; +import java.util.TreeSet; + +import org.dkpro.statistics.agreement.aligning.AlignableAnnotationUnit; + +/** + * a set of units created by a given set of annotators; all units are linked to the same (implicit) + * continuum + */ +public class AnnotationSet +{ + private final Set raters = new TreeSet(); + private final Set units = new TreeSet(); + private final Set featureNames = new HashSet(); + private final Set categories = new HashSet(); + + private List unitsListCache = null; + + protected long lowestOffset = MAX_VALUE; + protected long highestOffset = 0; + + 
public AnnotationSet(Collection aUnits) + { + for (var unit : aUnits) { + addUnit(unit); + } + } + + public void addUnit(AlignableAnnotationUnit unit) + { + if (unit.getBegin() < lowestOffset) { + lowestOffset = unit.getBegin(); + } + + if (unit.getEnd() > highestOffset) { + highestOffset = unit.getEnd(); + } + + categories.add(unit.getCategory()); + featureNames.addAll(unit.getFeatureNames()); + + raters.add(unit.getRater()); + units.add(unit); + + unitsListCache = null; + } + + public int getRaterCount() + { + return raters.size(); + } + + public int getUnitCount() + { + return units.size(); + } + + public double getAverageNumberOfAnnotations() + { + return getUnitCount() / ((double) getRaterCount()); + } + + public Set getRaters() + { + return unmodifiableSet(raters); + } + + public List getUnits() + { + if (unitsListCache == null) { + unitsListCache = unmodifiableList( + asList(units.toArray(AlignableAnnotationUnit[]::new))); + } + + return unitsListCache; + } + + public String[] getFeatureNames() + { + return featureNames.toArray(new String[0]); + } + + public List getUnitsWithType(String aType) + { + return units.stream() // + .filter(annot -> Objects.equals(annot.getType(), aType)) // + .toList(); + } + + public List getUnitsWithRater(Rater aRater) + { + return units.stream() // + .filter(annot -> Objects.equals(annot.getRater(), aRater)) // + .toList(); + } + + public boolean contains(AlignableAnnotationUnit u) + { + return units.contains(u); + } + + @Override + public boolean equals(Object o) + { + if (o == null) { + return false; + } + if (getClass() != o.getClass()) { + return false; + } + + AnnotationSet comp = (AnnotationSet) o; + boolean no_diff = true; + + var unitArray = getUnits().toArray(AlignableAnnotationUnit[]::new); + var otherUnitArray = comp.getUnits().toArray(AlignableAnnotationUnit[]::new); + for (int i = 0; i < unitArray.length; i++) { + no_diff = no_diff && unitArray[i].equals(otherUnitArray[i]); + } + + return no_diff; + } + + @Override 
+ public int hashCode() + { + return units.hashCode() + raters.hashCode(); + } +} diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/data/Rater.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/data/Rater.java new file mode 100644 index 0000000..2a0cad3 --- /dev/null +++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/data/Rater.java @@ -0,0 +1,79 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * Original source: https://github.com/fab-bar/TextGammaTool.git + */ +package org.dkpro.statistics.agreement.aligning.data; + +import org.apache.commons.lang3.builder.ToStringBuilder; +import org.apache.commons.lang3.builder.ToStringStyle; + +public final class Rater + implements Comparable +{ + private final String name; + private final int index; + + public Rater(String aName, int aIndex) + { + name = aName; + index = aIndex; + } + + public String getName() + { + return name; + } + + public int getIndex() + { + return index; + } + + @Override + public boolean equals(Object o) + { + if (o == null) { + return false; + } + if (getClass() != o.getClass()) { + return false; + } + + if (name != null) { + return name.equals(((Rater) o).name); + } + else { + return ((Rater) o).name == null; + } + } + + @Override + public int hashCode() + { + return name.hashCode(); + } + + @Override + public int compareTo(Rater aOther) + { + return name.compareTo(aOther.name); + } + + @Override + public String toString() + { + return new ToStringBuilder(this, ToStringStyle.JSON_STYLE).appendSuper(super.toString()) + .append("name", name).append("index", index).toString(); + } +} diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/disorder/IDisorderSampler.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/disorder/IDisorderSampler.java new file mode 100644 index 0000000..282cc31 --- /dev/null +++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/disorder/IDisorderSampler.java @@ -0,0 +1,22 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
/**
 * Source of sampled disorder values. Being a functional interface, it can be implemented by a
 * lambda or method reference.
 */
@FunctionalInterface
public interface IDisorderSampler
{
    /**
     * Draws one disorder value.
     *
     * @return the sampled disorder.
     */
    Double sampleDisorder();
}
+ */ +package org.dkpro.statistics.agreement.aligning.disorder; + +import org.dkpro.statistics.agreement.aligning.TextGammaAgreement; + +@FunctionalInterface +public interface IDisorderSamplerFactory +{ + IDisorderSampler create(TextGammaAgreement aMeasure); +} diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/disorder/SimpleDisorderSampler.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/disorder/SimpleDisorderSampler.java new file mode 100644 index 0000000..403d241 --- /dev/null +++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/disorder/SimpleDisorderSampler.java @@ -0,0 +1,218 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * Original source: https://github.com/fab-bar/TextGammaTool.git + */ +package org.dkpro.statistics.agreement.aligning.disorder; + +import static java.util.Arrays.asList; +import static org.dkpro.statistics.agreement.aligning.shuffling.AnnotationSetShuffle.changeSegmentation; +import static org.dkpro.statistics.agreement.aligning.shuffling.AnnotationSetShuffle.changeText; +import static org.dkpro.statistics.agreement.aligning.shuffling.AnnotationSetShuffle.randomizeFeatureValues; + +import java.lang.invoke.MethodHandles; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; + +import org.apache.commons.math3.distribution.BinomialDistribution; +import org.apache.commons.math3.distribution.EnumeratedDistribution; +import org.apache.commons.math3.util.Pair; +import org.dkpro.statistics.agreement.aligning.AlignableAnnotationUnit; +import org.dkpro.statistics.agreement.aligning.TextGammaAgreement; +import org.dkpro.statistics.agreement.aligning.data.AlignableAnnotationTextUnit; +import org.dkpro.statistics.agreement.aligning.data.AnnotatedText; +import org.dkpro.statistics.agreement.aligning.data.AnnotationSet; +import org.dkpro.statistics.agreement.aligning.data.Rater; +import org.dkpro.statistics.agreement.aligning.shuffling.SegmentationChangeType; +import org.dkpro.statistics.agreement.aligning.shuffling.TextChangeType; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class SimpleDisorderSampler + implements IDisorderSampler +{ + private static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass()); + + private final TextGammaAgreement measure; + + private final EnumeratedDistribution changeChooserText; + private final EnumeratedDistribution characterGenerator; + + private final EnumeratedDistribution changeChooserSeg; + + private final Map> labelGenerators; + + private final double textChangeRate; + private final double segmentChangeRate; + 
+ // creates a SimpleDisorderSampler with uniform distribution over change types + public SimpleDisorderSampler(TextGammaAgreement aMeasure, double aTextChangeRate, + double aSegmentChangeRate) + { + measure = aMeasure; + textChangeRate = aTextChangeRate; + segmentChangeRate = aSegmentChangeRate; + + var units = new ArrayList(); + for (var text : measure.getTexts()) { + units.addAll(text.getUnits()); + } + + characterGenerator = createCharacterGenerator(units); + labelGenerators = createLabelGenerators(aMeasure.getTexts(), units); + + // set probabilities of change types to equal probabilities + var prop = 1 / (double) TextChangeType.values().length; + var pt = new ArrayList>(); + for (var type : TextChangeType.values()) { + pt.add(new Pair(type, prop)); + } + + changeChooserText = new EnumeratedDistribution(pt); + + prop = 1 / (double) SegmentationChangeType.values().length; + var ps = new ArrayList>(); + for (var type : SegmentationChangeType.values()) { + ps.add(new Pair(type, prop)); + } + + changeChooserSeg = new EnumeratedDistribution(ps); + } + + @Override + public Double sampleDisorder() + { + // REC: The original TextGamma had a "base text" which was used for disorder sampling here. + // However, since we have no "base text" in general, the easiest way is to sample the + // disorder on both texts and then take the mean average. Not sure if this is a viable idea, + // but it seems like a reasonable starting point. + var disorder = 0.0; + for (var text : measure.getTexts()) { + disorder += sampleDisorder(text); + } + + return disorder / (double) measure.getTexts().size(); + } + + private double sampleDisorder(AnnotatedText aText) + { + var textChangeSampler = new BinomialDistribution(aText.getUnitCount(), textChangeRate); + var segChangeSampler = new BinomialDistribution(aText.getUnitCount(), segmentChangeRate); + + // 1. 
apply textual changes + var version1 = changeText(aText, textChangeSampler.sample(), changeChooserText, + characterGenerator); + var version2 = changeText(aText, textChangeSampler.sample(), changeChooserText, + characterGenerator); + + // 2. apply segmentation changes + var set1 = changeSegmentation(new AnnotationSet(version1.getTextUnits()), + segChangeSampler.sample(), changeChooserSeg); + var set2 = changeSegmentation(new AnnotationSet(version2.getTextUnits()), + segChangeSampler.sample(), changeChooserSeg); + + // 3. apply categorization changes + for (var featureName : labelGenerators.keySet()) { + var labelGenerator = labelGenerators.get(featureName); + if (labelGenerator == null) { + throw new IllegalArgumentException( + "No generator for attribute " + featureName + " given."); + } + set1 = randomizeFeatureValues(set1, featureName, labelGenerator); + set2 = randomizeFeatureValues(set2, featureName, labelGenerator); + } + + // change the annotators + var a = new Rater("A", 0); + var b = new Rater("B", 1); + + var set1_arr = set1.getUnits().stream() // + .map(u -> u.cloneWithDifferentRater(a)) // + .map(u -> (AlignableAnnotationTextUnit) u) // + .toList(); + var set2_arr = set2.getUnits().stream() // + .map(u -> u.cloneWithDifferentRater(b)) // + .map(u -> (AlignableAnnotationTextUnit) u) // + .toList(); + + var disorder = TextGammaAgreement.getObservedDisorder( + new AnnotatedText(version1.getText(), set1_arr), + new AnnotatedText(version2.getText(), set2_arr), measure.getDissimilarity()); + + LOG.trace("Sampled disorder: {}", disorder); + + return disorder; + } + + private static EnumeratedDistribution createCharacterGenerator( + List units) + { + int numChars = 0; + var characters = new HashMap(); + for (var u : units) { + if (u instanceof AlignableAnnotationTextUnit textUnit) { + for (char c : textUnit.getText().toCharArray()) { + numChars += 1; + if (characters.containsKey(c)) { + characters.put(c, characters.get(c) + 1); + } + else { + characters.put(c, 
1); + } + } + } + } + + var pc = new ArrayList>(characters.size()); + for (var c : characters.keySet()) { + pc.add(new Pair(c, characters.get(c) / (double) numChars)); + } + + return new EnumeratedDistribution(pc); + } + + private static HashMap> createLabelGenerators( + List aAnnotatedTexts, List units) + { + var featureNames = new HashSet(); + for (var text : aAnnotatedTexts) { + featureNames.addAll(asList(text.getFeatureNames())); + } + + var labelGenerators = new HashMap>(); + var labels = new HashMap(); + for (var featureName : featureNames) { + for (var u : units) { + var label = u.getFeatureValue(featureName); + if (labels.containsKey(u.getFeatureValue(featureName))) { + labels.put(label, labels.get(label) + 1); + } + else { + labels.put(label, 1); + } + } + + var pl = new ArrayList>(); + for (var label : labels.keySet()) { + pl.add(new Pair(label, labels.get(label) / (double) units.size())); + } + + labelGenerators.put(featureName, new EnumeratedDistribution(pl)); + } + + return labelGenerators; + } +} diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/dissimilarity/IDissimilarity.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/dissimilarity/IDissimilarity.java new file mode 100644 index 0000000..672ecd3 --- /dev/null +++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/dissimilarity/IDissimilarity.java @@ -0,0 +1,23 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + * Original source: https://github.com/fab-bar/TextGammaTool.git + */ +package org.dkpro.statistics.agreement.aligning.dissimilarity; + +import org.dkpro.statistics.agreement.aligning.AlignableAnnotationUnit; + +public interface IDissimilarity +{ + double dissimilarity(AlignableAnnotationUnit u, AlignableAnnotationUnit v); +} diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/dissimilarity/NominalFeatureDissimilarity.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/dissimilarity/NominalFeatureDissimilarity.java new file mode 100644 index 0000000..60301f5 --- /dev/null +++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/dissimilarity/NominalFeatureDissimilarity.java @@ -0,0 +1,93 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * Original source: https://github.com/fab-bar/TextGammaTool.git + */ +package org.dkpro.statistics.agreement.aligning.dissimilarity; + +import java.util.HashSet; +import java.util.Objects; +import java.util.Set; + +import org.dkpro.statistics.agreement.aligning.AlignableAnnotationUnit; + +public class NominalFeatureDissimilarity + implements IDissimilarity +{ + @Override + public double dissimilarity(AlignableAnnotationUnit aUnit1, AlignableAnnotationUnit aUnit2) + { + if (aUnit1 == null && aUnit2 == null) { + return 0; + } + + if (aUnit1 == null) { + return 1; + } + + if (aUnit2 == null) { + return 1; + } + + if (!Objects.equals(aUnit1.getType(), aUnit2.getType())) { + return 1; + } + + return positionDissimilarity(aUnit1, aUnit2) + featureDissimilarity(aUnit1, aUnit2); + } + + protected Set getFeatureNames(AlignableAnnotationUnit... aUnits) + { + var featureNames = new HashSet(); + + for (var unit : aUnits) { + featureNames.addAll(unit.getFeatureNames()); + } + + return featureNames; + } + + protected int numberOfDissimilarFeatures(AlignableAnnotationUnit u, AlignableAnnotationUnit v) + { + var featureNames = getFeatureNames(u, v); + + int diffs = 0; + for (var attr : featureNames) { + if (!Objects.equals(u.getFeatureValue(attr), v.getFeatureValue(attr))) { + diffs += 1; + } + } + + return diffs; + } + + private double featureDissimilarity(AlignableAnnotationUnit u, AlignableAnnotationUnit v) + { + var featureNames = getFeatureNames(u, v); + + if (featureNames.isEmpty()) { + return 0; + } + + return this.numberOfDissimilarFeatures(u, v) / ((float) featureNames.size()); + } + + protected double positionDissimilarity(AlignableAnnotationUnit u, AlignableAnnotationUnit v) + { + if (u.getBegin() == v.getBegin() && u.getEnd() == v.getEnd()) { + return 0; + } + + return 1; + } +} diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/dissimilarity/NominalFeatureTextDissimilarity.java 
b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/dissimilarity/NominalFeatureTextDissimilarity.java new file mode 100644 index 0000000..ebadc63 --- /dev/null +++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/dissimilarity/NominalFeatureTextDissimilarity.java @@ -0,0 +1,56 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Original source: https://github.com/fab-bar/TextGammaTool.git + */ +package org.dkpro.statistics.agreement.aligning.dissimilarity; + +import java.util.Objects; + +import org.dkpro.statistics.agreement.aligning.AlignableAnnotationUnit; +import org.dkpro.statistics.agreement.aligning.data.AlignableAnnotationTextUnit; + +/** + * Variation of {@link NominalFeatureDissimilarity} that considers the covered text as an additional + * implicit feature. It requires {@link AlignableAnnotationTextUnit AlignableAnnotationTextUnits}. + */ +public class NominalFeatureTextDissimilarity + extends NominalFeatureDissimilarity +{ + @Override + public double dissimilarity(AlignableAnnotationUnit aUnit1, AlignableAnnotationUnit aUnit2) + { + if (aUnit1 == null && aUnit2 == null) { + return 0; + } + + if (aUnit1 == null) { + return 1; + } + + if (aUnit2 == null) { + return 1; + } + + if (aUnit1 instanceof AlignableAnnotationTextUnit unit1 + && aUnit2 instanceof AlignableAnnotationTextUnit unit2) { + var textDiff = Objects.equals(unit1.getText(), unit2.getText()) ? 
0 : 1; + + return positionDissimilarity(aUnit1, aUnit2) + + (numberOfDissimilarFeatures(aUnit1, aUnit2) + textDiff) + / ((float) getFeatureNames(aUnit1, aUnit2).size() + 1); + } + + throw new IllegalArgumentException("Units have to be TextUnits"); + } +} diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/shuffling/AnnotationSetShuffle.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/shuffling/AnnotationSetShuffle.java new file mode 100644 index 0000000..cb744f5 --- /dev/null +++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/shuffling/AnnotationSetShuffle.java @@ -0,0 +1,613 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * Original source: https://github.com/fab-bar/TextGammaTool.git + */ +package org.dkpro.statistics.agreement.aligning.shuffling; + +import static org.dkpro.statistics.agreement.aligning.shuffling.TextChangeType.DELETION; +import static org.dkpro.statistics.agreement.aligning.shuffling.TextChangeType.INSERTION; +import static org.dkpro.statistics.agreement.aligning.shuffling.TextChangeType.SUBSTITUTION; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.HashSet; +import java.util.LinkedList; +import java.util.List; +import java.util.Objects; +import java.util.Random; +import java.util.Set; + +import org.apache.commons.math3.distribution.EnumeratedDistribution; +import org.apache.commons.math3.util.Pair; +import org.dkpro.statistics.agreement.aligning.AlignableAnnotationUnit; +import org.dkpro.statistics.agreement.aligning.data.AlignableAnnotationTextUnit; +import org.dkpro.statistics.agreement.aligning.data.AnnotatedText; +import org.dkpro.statistics.agreement.aligning.data.AnnotationSet; + +public class AnnotationSetShuffle +{ + + // 1. Change the text for AnnotatedText + // Important: the functions changing the text assume a segmentation, i.e. 
the units may not + // overlap this is not tested; if this assumption is violated, some units may not be adapted + // correctly + + public static AnnotatedText shuffleText(AnnotatedText orig, double m) + { + if (!(0 <= m && m <= 1)) { + throw new IllegalArgumentException("Magnitude has to be between 0 and 1"); + } + + var numChanges = (int) (orig.getUnitCount() * m); + + var prop = 1 / (double) TextChangeType.values().length; + var pt = new ArrayList>(TextChangeType.values().length); + + for (var type : TextChangeType.values()) { + pt.add(new Pair(type, prop)); + } + var changes = new EnumeratedDistribution(pt); + + var numChars = 0; + var characters = new HashMap(); + for (var unit : orig.getUnits()) { + if (unit instanceof AlignableAnnotationTextUnit textUnit) { + for (char c : textUnit.getText().toCharArray()) { + numChars += 1; + if (characters.containsKey(c)) { + characters.put(c, characters.get(c) + 1); + } + else { + characters.put(c, 1); + } + } + } + } + + var pc = new ArrayList>(); + for (var c : characters.keySet()) { + pc.add(new Pair(c, characters.get(c) / (double) numChars)); + } + + var characterGen = new EnumeratedDistribution(pc); + + return changeText(orig, numChanges, changes, characterGen); + } + + public static AnnotatedText changeText(AnnotatedText orig, int changes, + EnumeratedDistribution changeChooser, + EnumeratedDistribution characterGenerator) + { + return changeText(orig, changes, changeChooser, characterGenerator, new Random()); + } + + public static AnnotatedText changeText(AnnotatedText orig, int changes, + EnumeratedDistribution changeChooser, + EnumeratedDistribution characterGenerator, Random positionChooser) + { + + var text = new StringBuilder(orig.getText()); + var annots = new ArrayList<>(orig.getTextUnits()); + + var changedUnits = new HashSet(); + + // avoid infinite loop if changes > number of annotations! 
+ if (changes > annots.size()) { + changes = annots.size(); + } + + for (int i = 0; i < changes; i++) { + + boolean changed = false; + while (!changed) { + TextChangeType type = changeChooser.sample(); + if (type.equals(INSERTION)) { + changeTextInsertion(text, annots, characterGenerator, positionChooser, + changedUnits); + changed = true; + } + else if (type.equals(DELETION)) { + // changeTextDeletion(text, annots, positionChooser, changed_units); + changed = changeTextDeletionWithoutUnit(text, annots, positionChooser, + changedUnits); + } + else if (type.equals(SUBSTITUTION)) { + changeTextSubstitution(text, annots, characterGenerator, positionChooser, + changedUnits); + changed = true; + } + } + + } + + return new AnnotatedText(text.toString(), annots); + } + + private static int pickUnitToChange(Random positionGenerator, int num_annotations, + Set changed_units) + { + + // avoid infinite loops if all units are marked as changed + // should not happen as the number of changes is restricted to the number of units! 
+ if (changed_units.size() == num_annotations) { + new RuntimeException("All units have been changed; but more changes are needed"); + } + + int unit_offset = positionGenerator.nextInt(num_annotations); + while (changed_units.contains(unit_offset)) { + unit_offset = positionGenerator.nextInt(num_annotations); + } + changed_units.add(unit_offset); + + return unit_offset; + } + + private static int pickTextOffset(Random positionGenerator, AlignableAnnotationUnit u) + { + // pick a position for the change + int pos_offset = positionGenerator.nextInt((int) (u.getEnd() - u.getBegin())); + return (int) (u.getBegin() + pos_offset); + + } + + public static AnnotatedText changeTextInsertion(AnnotatedText orig, + EnumeratedDistribution characterGenerator) + { + return AnnotationSetShuffle.changeTextInsertion(orig, characterGenerator, new Random()); + } + + public static AnnotatedText changeTextInsertion(AnnotatedText orig, + EnumeratedDistribution characterGenerator, Random positionGenerator) + { + var text = new StringBuilder(orig.getText()); + var annots = new ArrayList<>(orig.getTextUnits()); + + AnnotationSetShuffle.changeTextInsertion(text, annots, characterGenerator, + positionGenerator, new HashSet()); + + return new AnnotatedText(text.toString(), annots); + } + + @SuppressWarnings("unchecked") + private static void changeTextInsertion(StringBuilder orig, + List annotations, EnumeratedDistribution characterGenerator, + Random positionGenerator, Set aChangedUnits) + { + + // pick a unit and position to change + int unitIndex = pickUnitToChange(positionGenerator, annotations.size(), aChangedUnits); + AlignableAnnotationUnit u = annotations.get(unitIndex); + // pick a position for the change + int pos_offset = positionGenerator.nextInt((int) (u.getEnd() - u.getBegin() + 1)); + int pos = (int) (u.getBegin() + pos_offset); + + // update the text + orig.insert(pos, characterGenerator.sample().charValue()); + + // move the end of the unit to the right, change text if 
textunit + u = u.cloneWithDifferentOffsets(u.getBegin(), u.getEnd() + 1); + if (u instanceof AlignableAnnotationTextUnit textUnit) { + u = textUnit + .cloneWithDifferentText(orig.substring((int) u.getBegin(), (int) u.getEnd())); + } + annotations.set(unitIndex, (T) u); + + // move all following units to the right + for (int i = unitIndex + 1; i < annotations.size(); i++) { + u = annotations.get(i); + annotations.set(i, (T) u.cloneWithDifferentOffsets(u.getBegin() + 1, u.getEnd() + 1)); + } + } + + public static AnnotatedText changeTextDeletion(AnnotatedText orig) + { + return AnnotationSetShuffle.changeTextDeletion(orig, new Random()); + } + + public static AnnotatedText changeTextDeletion(AnnotatedText orig, Random positionGenerator) + { + var text = new StringBuilder(orig.getText()); + var annots = new ArrayList<>(orig.getTextUnits()); + + changeTextDeletion(text, annots, positionGenerator, new HashSet()); + + return new AnnotatedText(text.toString(), annots); + } + + @SuppressWarnings("unchecked") + private static void changeTextDeletion(StringBuilder orig, + List annotations, Random positionGenerator, Set changed_units) + { + + // pick a unit and position to change + int unit_offset = pickUnitToChange(positionGenerator, annotations.size(), changed_units); + AlignableAnnotationUnit u = annotations.get(unit_offset); + int pos = pickTextOffset(positionGenerator, u); + + // update the text + orig.deleteCharAt(pos); + + if (u.getEnd() - u.getBegin() > 1) { + // move the end of the unit to the left, change text if textunit + u = u.cloneWithDifferentOffsets(u.getBegin(), u.getEnd() - 1); + if (u instanceof AlignableAnnotationTextUnit textUnit) { + u = textUnit.cloneWithDifferentText( + orig.substring((int) u.getBegin(), (int) u.getEnd())); + } + annotations.set(unit_offset, (T) u); + } + else { + // remove the unit if it only spans the deleted char + annotations.remove(unit_offset); + unit_offset -= 1; + } + + // move all following units to the left + for (int i = 
unit_offset + 1; i < annotations.size(); i++) { + u = annotations.get(i); + annotations.set(i, (T) u.cloneWithDifferentOffsets(u.getBegin() - 1, u.getEnd() - 1)); + } + } + + @SuppressWarnings("unchecked") + private static boolean changeTextDeletionWithoutUnit( + StringBuilder orig, List annotations, Random positionGenerator, + Set changed_units) + { + + // pick a unit to change + int unit_offset = positionGenerator.nextInt(annotations.size()); + while (changed_units.contains(unit_offset)) { + unit_offset = positionGenerator.nextInt(annotations.size()); + } + + AlignableAnnotationUnit u = annotations.get(unit_offset); + // pick a position for the change + int pos_offset = positionGenerator.nextInt((int) (u.getEnd() - u.getBegin())); + + int pos = (int) annotations.get(unit_offset).getBegin() + pos_offset; + + if (u.getEnd() - u.getBegin() == 1) { + // don't change the text + return false; + } + + changed_units.add(unit_offset); + + // update the text + orig.deleteCharAt(pos); + + // move the end of the unit to the left, change text if textunit + u = u.cloneWithDifferentOffsets(u.getBegin(), u.getEnd() - 1); + if (u instanceof AlignableAnnotationTextUnit textUnit) { + u = textUnit + .cloneWithDifferentText(orig.substring((int) u.getBegin(), (int) u.getEnd())); + } + annotations.set(unit_offset, (T) u); + + // move all following units to the left + for (int i = unit_offset + 1; i < annotations.size(); i++) { + u = annotations.get(i); + annotations.set(i, (T) u.cloneWithDifferentOffsets(u.getBegin() - 1, u.getEnd() - 1)); + } + + return true; + } + + public static AnnotatedText changeTextSubstitution(AnnotatedText orig, + EnumeratedDistribution characterGenerator) + { + return AnnotationSetShuffle.changeTextSubstitution(orig, characterGenerator, new Random()); + } + + public static AnnotatedText changeTextSubstitution(AnnotatedText orig, + EnumeratedDistribution characterGenerator, Random positionGenerator) + { + var text = new StringBuilder(orig.getText()); + var 
annots = new ArrayList<>(orig.getTextUnits()); + + AnnotationSetShuffle.changeTextSubstitution(text, annots, characterGenerator, + positionGenerator, new HashSet()); + + return new AnnotatedText(text.toString(), annots); + } + + @SuppressWarnings("unchecked") + private static void changeTextSubstitution( + StringBuilder orig, List annotations, + EnumeratedDistribution characterGenerator, Random positionGenerator, + Set changed_units) + { + + // pick a unit and position to change + int unit_offset = pickUnitToChange(positionGenerator, annotations.size(), changed_units); + AlignableAnnotationUnit u = annotations.get(unit_offset); + int pos = pickTextOffset(positionGenerator, u); + + // update the text + char orig_char = orig.charAt(pos); + Character new_char = characterGenerator.sample(); + while (new_char.charValue() == orig_char) { + new_char = characterGenerator.sample(); + } + orig.replace(pos, pos + 1, new_char.toString()); + + // change text of the unit if textunit + if (u instanceof AlignableAnnotationTextUnit textUnit) { + annotations.set(unit_offset, (T) textUnit + .cloneWithDifferentText(orig.substring((int) u.getBegin(), (int) u.getEnd()))); + } + } + + // 2. 
Change the segmentation for AnnotationSet + + public static AnnotationSet shuffleSegmentation(AnnotationSet orig, double m) + { + if (!(0 <= m && m <= 1)) { + throw new IllegalArgumentException("Magnitude has to be between 0 and 1"); + } + + int numChanges = (int) (orig.getUnitCount() * m); + + double prop = 1 / (double) SegmentationChangeType.values().length; + var ps = new ArrayList>( + SegmentationChangeType.values().length); + for (SegmentationChangeType type : SegmentationChangeType.values()) { + ps.add(new Pair(type, prop)); + } + var changes = new EnumeratedDistribution(ps); + + return changeSegmentation(orig, numChanges, changes); + } + + public static AnnotationSet changeSegmentation(AnnotationSet orig, int changes, + EnumeratedDistribution changeChooser) + { + return AnnotationSetShuffle.changeSegmentation(orig, changes, 0, changeChooser); + } + + public static AnnotationSet changeSegmentation(AnnotationSet orig, int changes, int merge_gap, + EnumeratedDistribution changeChooser) + { + return AnnotationSetShuffle.changeSegmentation(orig, changes, merge_gap, changeChooser, + new Random()); + } + + public static AnnotationSet changeSegmentation(AnnotationSet orig, int changes, + EnumeratedDistribution changeChooser, Random positionChooser) + { + return changeSegmentation(orig, changes, 0, changeChooser, positionChooser); + } + + public static AnnotationSet changeSegmentation(AnnotationSet orig, int changes, int merge_gap, + EnumeratedDistribution changeChooser, Random positionChooser) + { + var annots = new ArrayList<>(orig.getUnits()); + + for (int i = 0; i < changes; i++) { + SegmentationChangeType type = changeChooser.sample(); + if (type.equals(SegmentationChangeType.MERGE)) { + changeSegmentationMerge(annots, merge_gap, positionChooser); + } + if (type.equals(SegmentationChangeType.SPLIT)) { + changeSegmentationSplit(annots, positionChooser); + } + } + + return new AnnotationSet(annots); + } + + public static AnnotationSet 
changeSegmentationMerge(AnnotationSet orig) + { + return changeSegmentationMerge(orig, new Random()); + } + + public static AnnotationSet changeSegmentationMerge(AnnotationSet orig, + Random positionGenerator) + { + return changeSegmentationMerge(orig, 0, positionGenerator); + } + + public static AnnotationSet changeSegmentationMerge(AnnotationSet orig, int gap) + { + return changeSegmentationMerge(orig, gap, new Random()); + } + + public static AnnotationSet changeSegmentationMerge(AnnotationSet orig, int gap, + Random positionGenerator) + { + var annots = new LinkedList(orig.getUnits()); + AnnotationSetShuffle.changeSegmentationMerge(annots, gap, positionGenerator); + return new AnnotationSet(annots); + } + + private static void changeSegmentationMerge(List annotations, int gap, + Random positionGenerator) + { + // pick a unit to merge + int unit_offset = positionGenerator.nextInt(annotations.size()); + AlignableAnnotationUnit base = annotations.get(unit_offset); + AlignableAnnotationUnit merge = null; + + int searchOffset = 1; + while (unit_offset + searchOffset < annotations.size() + && annotations.get(unit_offset + searchOffset).getBegin() <= base.getEnd() + gap) { + + merge = annotations.get(unit_offset + searchOffset); + // test if merge is a valid merge_candidate + if (merge.getBegin() >= base.getEnd() + && Objects.equals(base.getRater(), merge.getRater()) + && Objects.equals(base.getCategory(), merge.getCategory())) { + break; + } + + merge = null; + searchOffset += 1; + } + + if (merge != null) { + annotations.set(unit_offset, + base.cloneWithDifferentOffsets(base.getBegin(), merge.getEnd())); + if (base instanceof AlignableAnnotationTextUnit) { + annotations.set(unit_offset, + ((AlignableAnnotationTextUnit) annotations.get(unit_offset)) + .cloneWithDifferentText( + ((AlignableAnnotationTextUnit) base).getText() + + (merge.getBegin() - base.getEnd() > 0 ? 
" " : "") + + ((AlignableAnnotationTextUnit) merge).getText())); + } + annotations.remove(unit_offset + searchOffset); + } + + } + + public static AnnotationSet changeSegmentationSplit(AnnotationSet orig) + { + return changeSegmentationSplit(orig, new Random()); + } + + public static AnnotationSet changeSegmentationSplit(AnnotationSet orig, + Random positionGenerator) + { + var annots = new LinkedList(orig.getUnits()); + AnnotationSetShuffle.changeSegmentationSplit(annots, positionGenerator); + return new AnnotationSet(annots); + } + + private static void changeSegmentationSplit(List annotations, + Random positionGenerator) + { + + // pick a unit to change + int unit_offset = positionGenerator.nextInt(annotations.size()); + AlignableAnnotationUnit u = annotations.get(unit_offset); + + if (u.getEnd() - u.getBegin() <= 1) { + // split not possible + return; + } + + // pick a position for the split + int pos = positionGenerator.nextInt((int) u.getEnd() - (int) u.getBegin() - 1) + 1; + + AlignableAnnotationUnit u1 = u.cloneWithDifferentOffsets(u.getBegin(), u.getBegin() + pos); + AlignableAnnotationUnit u2 = u.cloneWithDifferentOffsets(u.getBegin() + pos, u.getEnd()); + + if (u instanceof AlignableAnnotationTextUnit) { + u1 = ((AlignableAnnotationTextUnit) u1).cloneWithDifferentText( + ((AlignableAnnotationTextUnit) u).getText().substring(0, pos)); + u2 = ((AlignableAnnotationTextUnit) u2).cloneWithDifferentText( + ((AlignableAnnotationTextUnit) u).getText().substring(pos)); + } + + annotations.set(unit_offset, u1); + annotations.add(unit_offset + 1, u2); + } + + // 3. 
Create random labels for AnnotationSet + + public static AnnotationSet randomizeFeatureValues(AnnotationSet orig, String aFeatureName, + EnumeratedDistribution labelGenerator) + { + var relabeled = new ArrayList(); + + for (var annot : orig.getUnits()) { + relabeled.add(annot.cloneWithDifferentLabel(aFeatureName, labelGenerator.sample())); + } + + return new AnnotationSet(relabeled); + } + + public static AnnotationSet shuffleAttributeValues(AnnotationSet orig, String attribute, + double m) + { + if (!(0 <= m && m <= 1)) { + throw new IllegalArgumentException("Magnitude has to be between 0 and 1"); + } + + int numChanges = (int) (orig.getUnitCount() * m); + + var labels = new HashMap(); + + for (var u : orig.getUnits()) { + var label = u.getFeatureValue(attribute); + if (labels.containsKey(u.getFeatureValue(attribute))) { + labels.put(label, labels.get(label) + 1); + } + else { + labels.put(label, 1); + } + } + + var ps = new ArrayList>(labels.size()); + for (var label : labels.keySet()) { + ps.add(new Pair(label, + labels.get(label) / (double) orig.getUnitCount())); + } + + var labelGenerator = new EnumeratedDistribution<>(ps); + + return changeAttributeValues(orig, attribute, numChanges, labelGenerator); + } + + public static AnnotationSet changeAttributeValues(AnnotationSet orig, String attribute, + int changes, EnumeratedDistribution labelGenerator) + { + return changeAttributeValues(orig, attribute, changes, labelGenerator, new Random()); + } + + public static AnnotationSet changeAttributeValues(AnnotationSet orig, String attribute, + int changes, EnumeratedDistribution labelGenerator, Random positionChooser) + { + var units = new ArrayList<>(orig.getUnits()); + + var changedUnits = new HashSet(); + + for (var i = 0; i < changes; i++) { + changeAttributeValue(units, attribute, labelGenerator, positionChooser, changedUnits); + } + + return new AnnotationSet(units); + } + + public static AnnotationSet changeAttributeValue(AnnotationSet orig, String attribute, + 
EnumeratedDistribution labelGenerator) + { + return changeAttributeValue(orig, attribute, labelGenerator, new Random()); + } + + public static AnnotationSet changeAttributeValue(AnnotationSet orig, String attribute, + EnumeratedDistribution labelGenerator, Random positionGenerator) + { + var annots = new ArrayList<>(orig.getUnits()); + changeAttributeValue(annots, attribute, labelGenerator, positionGenerator, new HashSet<>()); + return new AnnotationSet(annots); + } + + private static void changeAttributeValue(List aUnits, String attribute, + EnumeratedDistribution labelGenerator, Random positionGenerator, + Set changed_units) + { + // pick a unit to change + int unitIndex = pickUnitToChange(positionGenerator, aUnits.size(), changed_units); + + var origLabel = aUnits.get(unitIndex).getFeatureValue(attribute); + var newLabel = labelGenerator.sample(); + while (newLabel.equals(origLabel)) { + newLabel = labelGenerator.sample(); + } + + aUnits.set(unitIndex, aUnits.get(unitIndex).cloneWithDifferentLabel(attribute, newLabel)); + } +} diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/shuffling/SegmentationChangeType.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/shuffling/SegmentationChangeType.java new file mode 100644 index 0000000..6f52ddd --- /dev/null +++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/shuffling/SegmentationChangeType.java @@ -0,0 +1,21 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + * Original source: https://github.com/fab-bar/TextGammaTool.git + */ +package org.dkpro.statistics.agreement.aligning.shuffling; + +public enum SegmentationChangeType +{ + MERGE, SPLIT +} diff --git a/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/shuffling/TextChangeType.java b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/shuffling/TextChangeType.java new file mode 100644 index 0000000..5361dbc --- /dev/null +++ b/dkpro-statistics-agreement/src/main/java/org/dkpro/statistics/agreement/aligning/shuffling/TextChangeType.java @@ -0,0 +1,21 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * Original source: https://github.com/fab-bar/TextGammaTool.git + */ +package org.dkpro.statistics.agreement.aligning.shuffling; + +public enum TextChangeType +{ + INSERTION, DELETION, SUBSTITUTION +} diff --git a/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/AnnotationStudyTest.java b/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/AnnotationStudyTest.java index 0b403ce..54d8b33 100644 --- a/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/AnnotationStudyTest.java +++ b/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/AnnotationStudyTest.java @@ -28,7 +28,7 @@ public class AnnotationStudyTest @Test void testAddItem() { - CodingAnnotationStudy study = new CodingAnnotationStudy(3); + var study = new CodingAnnotationStudy(3); study.addItem("A", "B", "C"); study.addItem(5, 3, 0); study.addItem(new Object(), "c", 12); @@ -40,8 +40,8 @@ void testAddItem() @Test void testAddItemMissingUnits() { - CodingAnnotationStudy study = new CodingAnnotationStudy(3); - + var study = new CodingAnnotationStudy(3); + assertThatExceptionOfType(IllegalArgumentException.class) .isThrownBy(() -> study.addItem("A")); diff --git a/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/TextGammaAgreementTest.java b/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/TextGammaAgreementTest.java new file mode 100644 index 0000000..91a314f --- /dev/null +++ b/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/TextGammaAgreementTest.java @@ -0,0 +1,253 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Original source: https://github.com/fab-bar/TextGammaTool.git + */ +package org.dkpro.statistics.agreement.aligning; + +import static java.util.Arrays.asList; +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.offset; +import static org.dkpro.statistics.agreement.aligning.TextGammaAgreement.calculateExpectedDisagreement; +import static org.dkpro.statistics.agreement.aligning.data.AlignableAnnotationTextUnit.textUnit; +import static org.junit.jupiter.params.provider.Arguments.arguments; + +import java.util.List; +import java.util.Map; + +import org.apache.commons.math3.distribution.NormalDistribution; +import org.dkpro.statistics.agreement.aligning.data.AlignableAnnotationTextUnit; +import org.dkpro.statistics.agreement.aligning.data.AnnotatedText; +import org.dkpro.statistics.agreement.aligning.data.Rater; +import org.dkpro.statistics.agreement.aligning.disorder.IDisorderSampler; +import org.dkpro.statistics.agreement.aligning.disorder.SimpleDisorderSampler; +import org.dkpro.statistics.agreement.aligning.dissimilarity.NominalFeatureTextDissimilarity; +import org.junit.jupiter.api.Test; +import org.junit.jupiter.params.ParameterizedTest; +import org.junit.jupiter.params.provider.Arguments; +import org.junit.jupiter.params.provider.MethodSource; + +class TextGammaAgreementTest +{ + private static final Rater ANNOTATOR_A = new Rater("A", 0); + private static final Rater ANNOTATOR_B = new Rater("B", 1); + private static final List TOTAL_TEXT_DISAGREEMENT = asList( // + 
textUnit(ANNOTATOR_A, 0, 2, "a"), // + textUnit(ANNOTATOR_A, 3, 5, "b"), // + textUnit(ANNOTATOR_A, 6, 8, "c"), // + textUnit(ANNOTATOR_B, 0, 2, "c"), // + textUnit(ANNOTATOR_B, 3, 5, "a"), // + textUnit(ANNOTATOR_B, 6, 8, "b")); + + private static final List SOME_TEXT_DISAGREEMENT = asList( // + textUnit(ANNOTATOR_A, 0, 2, "a"), // + textUnit(ANNOTATOR_A, 3, 5, "b"), // + textUnit(ANNOTATOR_A, 6, 8, "c"), // + textUnit(ANNOTATOR_B, 0, 2, "a"), // + textUnit(ANNOTATOR_B, 3, 5, "c")); + private static final List MISSING_ANNOTATION = asList( // + textUnit(ANNOTATOR_A, 0, 2, "a"), // + textUnit(ANNOTATOR_A, 3, 5, "b"), // + textUnit(ANNOTATOR_A, 6, 8, "c"), // + textUnit(ANNOTATOR_B, 3, 5, "b"), // + textUnit(ANNOTATOR_B, 6, 8, "c")); + + private static final List FULL_TEXT_AGREEMENT = asList( // + textUnit(ANNOTATOR_A, 0, 2, "a"), // + textUnit(ANNOTATOR_A, 3, 5, "b"), // + textUnit(ANNOTATOR_A, 6, 8, "c"), // + textUnit(ANNOTATOR_B, 0, 2, "a"), // + textUnit(ANNOTATOR_B, 3, 5, "b"), // + textUnit(ANNOTATOR_B, 6, 8, "c")); + + static List data_NormalDistributionDisorder_NominalFeatureTextDissimilarity() + { + return asList( // + arguments("Full text agreement", 1.0, "so so so", FULL_TEXT_AGREEMENT), // + arguments("Missing annotation", 0.6, "so so so", MISSING_ANNOTATION), // + arguments("Some text disagreement", 0.2, "so so so", SOME_TEXT_DISAGREEMENT), // + arguments("Total text disagreement", 0.0, "so so so", TOTAL_TEXT_DISAGREEMENT) // + ); + } + + @ParameterizedTest(name = "[{index}] {0}") + @MethodSource("data_NormalDistributionDisorder_NominalFeatureTextDissimilarity") + void testCalculateTextAgreement_NormalDistributionDisorder_NominalFeatureTextDissimilarity( + String aLabel, double aExpectedAgreement, String aText, + List aData) + { + var study = new TextAligningAnnotationStudy(aText); + study.addUnits(aData); + + var sut = TextGammaAgreement.builder() // + .withDisorderSampler(new NormalDistributionDisorderSampler()) // + .withDissimilarity(new 
NominalFeatureTextDissimilarity()) // + .withStudy(study) // + .build(); + + assertThat(sut.calculateAgreement()).isCloseTo(aExpectedAgreement, offset(0.01)); + } + + private static final List FULL_LABEL_AGREEMENT = asList( // + textUnit(ANNOTATOR_A, 0, 2, "a", Map.of("label", "A")), // + textUnit(ANNOTATOR_A, 3, 5, "b", Map.of("label", "B")), // + textUnit(ANNOTATOR_A, 6, 8, "c", Map.of("label", "C")), // + textUnit(ANNOTATOR_B, 0, 2, "a", Map.of("label", "A")), // + textUnit(ANNOTATOR_B, 3, 5, "b", Map.of("label", "B")), // + textUnit(ANNOTATOR_B, 6, 8, "c", Map.of("label", "C"))); + + private static final List SOME_LABEL_DISAGREEMENT = asList( // + textUnit(ANNOTATOR_A, 0, 2, "a", Map.of("label", "A")), // + textUnit(ANNOTATOR_A, 3, 5, "b", Map.of("label", "B")), // + textUnit(ANNOTATOR_A, 6, 8, "c", Map.of("label", "C")), // + textUnit(ANNOTATOR_B, 0, 2, "a", Map.of("label", "A")), // + textUnit(ANNOTATOR_B, 3, 5, "b", Map.of("label", "A")), // + textUnit(ANNOTATOR_B, 6, 8, "c", Map.of("label", "C"))); + + private static final List MISSING_LABEL = asList( // + textUnit(ANNOTATOR_A, 0, 2, "a", Map.of("label", "A")), // + textUnit(ANNOTATOR_A, 3, 5, "b", Map.of("label", "B")), // + textUnit(ANNOTATOR_A, 6, 8, "c", Map.of("label", "C")), // + textUnit(ANNOTATOR_B, 0, 2, "a"), // + textUnit(ANNOTATOR_B, 3, 5, "b", Map.of("label", "B")), // + textUnit(ANNOTATOR_B, 6, 8, "c", Map.of("label", "C"))); + + private static final List TOTAL_LABEL_DISAGREEMENT = asList( // + textUnit(ANNOTATOR_A, 0, 2, "a", Map.of("label", "A")), // + textUnit(ANNOTATOR_A, 3, 5, "b", Map.of("label", "B")), // + textUnit(ANNOTATOR_A, 6, 8, "c", Map.of("label", "C")), // + textUnit(ANNOTATOR_B, 0, 2, "a", Map.of("label", "B")), // + textUnit(ANNOTATOR_B, 3, 5, "b", Map.of("label", "C")), // + textUnit(ANNOTATOR_B, 6, 8, "c", Map.of("label", "A"))); + + static List data_SimpleDisorder_NominalFeatureTextDissimilarity() + { + return asList( // + arguments("Full label agreement", 1.0, "so so so", 
FULL_LABEL_AGREEMENT), // + arguments("Missing label", 0.53, "so so so", MISSING_LABEL), // + arguments("Some label disagreement", 0.45, "so so so", SOME_LABEL_DISAGREEMENT), // + arguments("Total label disagreement", -0.5, "so so so", TOTAL_LABEL_DISAGREEMENT) // + ); + } + + @ParameterizedTest(name = "[{index}] {0}") + @MethodSource("data_SimpleDisorder_NominalFeatureTextDissimilarity") + void testCalculateTextAgreement_SimpleDisorder_NominalFeatureTextDissimilarity( + String aLabel, double aExpectedAgreement, String aText, + List aData) + { + var study = new TextAligningAnnotationStudy(aText); + study.addUnits(aData); + + var sut = TextGammaAgreement.builder() // + .withDisorderSampler(m -> new SimpleDisorderSampler(m, 0.0, 0.0)) // + .withDissimilarity(new NominalFeatureTextDissimilarity()) // + .withStudy(study) // + .build(); + + assertThat(sut.calculateAgreement()).isCloseTo(aExpectedAgreement, offset(0.02)); + } + + @Test + void testCalculateTextAgreementWithTexts_LabelTextPositionDisagreement() + { + var text1 = new AnnotatedText("so so so", asList( // + textUnit(ANNOTATOR_A, 0, 2, "a"), // + textUnit(ANNOTATOR_A, 3, 5, "b"), // + textUnit(ANNOTATOR_A, 6, 8, "c"))); + + var text2 = new AnnotatedText("so so", asList( // + textUnit(ANNOTATOR_B, 0, 2, "b"), // + textUnit(ANNOTATOR_B, 3, 5, "c"))); + + var sut = TextGammaAgreement.builder() // + .withDisorderSampler(new NormalDistributionDisorderSampler()) // + .withDissimilarity(new NominalFeatureTextDissimilarity()) // + .withTexts(text1, text2) // + .build(); + + assertThat(sut.calculateAgreement()).isCloseTo(0.6, offset(0.005)); + } + + @Test + void testGetObservedDisorder() + { + var annots1 = asList( // + textUnit(ANNOTATOR_A, 0, 2, "a"), // + textUnit(ANNOTATOR_A, 3, 5, "b"), // + textUnit(ANNOTATOR_A, 6, 8, "c")); + var text1 = new AnnotatedText("so so so", annots1); + + var annots2 = asList( // + textUnit(ANNOTATOR_B, 0, 2, "b"), // + textUnit(ANNOTATOR_B, 3, 5, "c")); + var text2 = new 
AnnotatedText("so so", annots2); + + var sut = TextGammaAgreement.builder() // + .withDisorderSampler(() -> 0.0) // + .withTexts(text1, text2) // + .build(); + + assertThat(sut.calculateObservedDisagreement()).isCloseTo(0.4, offset(0.001)); + } + + @Test + void testCalculateExpectedDisagreement_NormalDistribution() + { + var disorderSampler = new NormalDistributionDisorderSampler(); + var outerIterations = 100; + var innerIterations = 100; + var precision = 0.02; + var alpha = 0.05; + + for (int n = 0; n < outerIterations; n++) { + var correct = 0; + var tooSmall = 0; + var tooBig = 0; + + for (var i = 0; i < innerIterations; i++) { + var expectedDisorder = calculateExpectedDisagreement(disorderSampler, alpha, + precision); + + if (1 - precision <= expectedDisorder && expectedDisorder <= 1 + precision) { + correct += 1; + } + else if (expectedDisorder < 1 - precision) { + tooSmall += 1; + } + else { + tooBig += 1; + } + } + + assertThat(correct + tooBig + tooSmall).isEqualTo(innerIterations); + + assertThat(correct / (double) innerIterations) + .as("Correct %d -- too big: %d - too small: %d", correct, tooSmall, tooBig) + // should be about 95% -- but seems to drop lower + .isGreaterThanOrEqualTo(0.89); + } + } + + private class NormalDistributionDisorderSampler + implements IDisorderSampler + { + private final NormalDistribution sn = new NormalDistribution(1, 1); + + @Override + public Double sampleDisorder() + { + return sn.sample(); + } + } +} diff --git a/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/alignment/AlignmentTest.java b/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/alignment/AlignmentTest.java new file mode 100644 index 0000000..f4b9305 --- /dev/null +++ b/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/alignment/AlignmentTest.java @@ -0,0 +1,208 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file 
except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Original source: https://github.com/fab-bar/TextGammaTool.git + */ +package org.dkpro.statistics.agreement.aligning.alignment; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatExceptionOfType; +import static org.assertj.core.api.Assertions.offset; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import org.dkpro.statistics.agreement.aligning.AlignableAnnotationUnit; +import org.dkpro.statistics.agreement.aligning.data.AnnotationSet; +import org.dkpro.statistics.agreement.aligning.data.Rater; +import org.dkpro.statistics.agreement.aligning.dissimilarity.NominalFeatureDissimilarity; +import org.junit.jupiter.api.Test; + +public class AlignmentTest +{ + private static final Rater ANNOTATOR_1 = new Rater("1", 0); + private static final Rater ANNOTATOR_2 = new Rater("2", 1); + private static final Rater ANNOTATOR_3 = new Rater("3", 1); + + @Test + public void testAlignmentWithDifferingCreators() + { + Set uas = new HashSet(); + + List annotations = new ArrayList(4); + + Set creators = new HashSet(); + creators.add(ANNOTATOR_1); + creators.add(ANNOTATOR_2); + + List units = new ArrayList(2); + + units.add(new AlignableAnnotationUnit(ANNOTATOR_1, null, 0, 3, null)); + units.add(new AlignableAnnotationUnit(ANNOTATOR_2, null, 0, 4, null)); + + uas.add(new UnitaryAlignment(units, creators)); + + annotations.addAll(units); + + creators.add(ANNOTATOR_3); + + units = new ArrayList(2); + + 
units.add(new AlignableAnnotationUnit(ANNOTATOR_1, null, 1, 3, null)); + units.add(new AlignableAnnotationUnit(ANNOTATOR_3, null, 0, 4, null)); + + uas.add(new UnitaryAlignment(units, creators)); + + annotations.addAll(units); + + assertThatExceptionOfType(IllegalArgumentException.class).isThrownBy(() -> { + new Alignment(uas, new AnnotationSet(annotations)); + }); + } + + @Test + void testAlignmentWithUnitContainedTwice() + { + + Set uas = new HashSet(); + + List annotations = new ArrayList(4); + + Set creators = new HashSet(); + creators.add(ANNOTATOR_1); + creators.add(ANNOTATOR_2); + + List units = new ArrayList(2); + + units.add(new AlignableAnnotationUnit(ANNOTATOR_1, null, 0, 3, null)); + units.add(new AlignableAnnotationUnit(ANNOTATOR_2, null, 0, 4, null)); + + uas.add(new UnitaryAlignment(units, creators)); + + annotations.addAll(units); + + units = new ArrayList(2); + + units.add(new AlignableAnnotationUnit(ANNOTATOR_1, null, 1, 3, null)); + units.add(new AlignableAnnotationUnit(ANNOTATOR_2, null, 0, 4, null)); + + uas.add(new UnitaryAlignment(units, creators)); + + annotations.addAll(units); + + assertThatExceptionOfType(IllegalArgumentException.class).isThrownBy(() -> { + new Alignment(uas, new AnnotationSet(annotations)); + }); + } + + @Test + public void testAlignmentWithUnitNotContainedInAS() + { + + Set uas = new HashSet(); + + List annotations = new ArrayList(2); + + Set creators = new HashSet(); + creators.add(ANNOTATOR_1); + creators.add(ANNOTATOR_2); + + List units = new ArrayList(2); + + units.add(new AlignableAnnotationUnit(ANNOTATOR_1, null, 0, 3, null)); + units.add(new AlignableAnnotationUnit(ANNOTATOR_2, null, 0, 4, null)); + + uas.add(new UnitaryAlignment(units, creators)); + + annotations.addAll(units); + + units = new ArrayList(2); + + units.add(new AlignableAnnotationUnit(ANNOTATOR_1, null, 1, 3, null)); + + uas.add(new UnitaryAlignment(units, creators)); + + assertThatExceptionOfType(IllegalArgumentException.class).isThrownBy(() -> { 
+ new Alignment(uas, new AnnotationSet(annotations)); + }); + } + + @Test + public void testAlignmentWithUnitNotContainedInUA() + { + + Set uas = new HashSet(); + + List annotations = new ArrayList(3); + + Set creators = new HashSet(); + creators.add(ANNOTATOR_1); + creators.add(ANNOTATOR_2); + + List units = new ArrayList(2); + + units.add(new AlignableAnnotationUnit(ANNOTATOR_1, null, 0, 3, null)); + units.add(new AlignableAnnotationUnit(ANNOTATOR_2, null, 0, 4, null)); + + uas.add(new UnitaryAlignment(units, creators)); + + annotations.addAll(units); + annotations.add(new AlignableAnnotationUnit(ANNOTATOR_1, null, 1, 3, null)); + + assertThatExceptionOfType(IllegalArgumentException.class).isThrownBy(() -> { + new Alignment(uas, new AnnotationSet(annotations)); + }); + } + + @Test + public void testGetDisorder() + { + + Set uas = new HashSet(); + + Set creators = new HashSet(); + creators.add(ANNOTATOR_1); + creators.add(ANNOTATOR_2); + creators.add(ANNOTATOR_3); + + List annotations = new ArrayList(4); + + List units = new ArrayList(3); + + units.add(new AlignableAnnotationUnit(ANNOTATOR_1, null, 0, 3, null)); + units.add(new AlignableAnnotationUnit(ANNOTATOR_2, null, 0, 3, null)); + units.add(new AlignableAnnotationUnit(ANNOTATOR_3, null, 0, 3, null)); + + uas.add(new UnitaryAlignment(units, creators)); + + annotations.addAll(units); + + units = new ArrayList(1); + units.add(new AlignableAnnotationUnit(ANNOTATOR_3, null, 3, 4, null)); + + uas.add(new UnitaryAlignment(units, creators)); + + annotations.addAll(units); + + Alignment a = new Alignment(uas, new AnnotationSet(annotations)); + + double averageAnnotations = 4 / 3.0; + double disorder_1 = 0; + double disorder_2 = 2 / 3.0; + + assertThat(a.getDisorder(new NominalFeatureDissimilarity())) + .isCloseTo((disorder_1 + disorder_2) / averageAnnotations, offset(0.0001)); + } +} diff --git 
a/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/alignment/NominalFeatureDissimilarityTest.java b/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/alignment/NominalFeatureDissimilarityTest.java new file mode 100644 index 0000000..358ed13 --- /dev/null +++ b/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/alignment/NominalFeatureDissimilarityTest.java @@ -0,0 +1,58 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * Original source: https://github.com/fab-bar/TextGammaTool.git + */ +package org.dkpro.statistics.agreement.aligning.alignment; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.dkpro.statistics.agreement.aligning.AlignableAnnotationUnit; +import org.dkpro.statistics.agreement.aligning.data.Rater; +import org.dkpro.statistics.agreement.aligning.dissimilarity.NominalFeatureDissimilarity; +import org.junit.jupiter.api.Test; + +public class NominalFeatureDissimilarityTest +{ + private static final Rater ANNOTATOR_1 = new Rater("1", 0); + private static final Rater ANNOTATOR_2 = new Rater("2", 1); + + @Test + void testDissimilarity() + { + var diss = new NominalFeatureDissimilarity(); + + var u = new AlignableAnnotationUnit(ANNOTATOR_1, null, 0, 3, null); + var v = new AlignableAnnotationUnit(ANNOTATOR_2, null, 0, 3, null); + assertThat(diss.dissimilarity(u, v)).isEqualTo(0.0); + + u = new AlignableAnnotationUnit(ANNOTATOR_1, "textunit", 0, 3, null); + v = new AlignableAnnotationUnit(ANNOTATOR_2, null, 0, 3, null); + assertThat(diss.dissimilarity(u, v)).isEqualTo(1.0); + + u = new AlignableAnnotationUnit(ANNOTATOR_1, null, 3, 4, null); + v = new AlignableAnnotationUnit(ANNOTATOR_2, null, 0, 3, null); + assertThat(diss.dissimilarity(u, v)).isEqualTo(1.0); + + v = u.cloneWithDifferentLabel("pos", "A"); + assertThat(diss.dissimilarity(u, v)).isEqualTo(1.0); + + u = v; + assertThat(diss.dissimilarity(u, v)).isEqualTo(0.0); + + assertThat(diss.dissimilarity(u, null)).isEqualTo(1.0); + assertThat(diss.dissimilarity(null, v)).isEqualTo(1.0); + + assertThat(diss.dissimilarity(null, null)).isEqualTo(0.0); + } +} diff --git a/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/alignment/NominalFeatureTextDissimilarityTest.java b/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/alignment/NominalFeatureTextDissimilarityTest.java new file mode 100644 index 0000000..637724a --- /dev/null +++ 
b/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/alignment/NominalFeatureTextDissimilarityTest.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Original source: https://github.com/fab-bar/TextGammaTool.git
+ */
+package org.dkpro.statistics.agreement.aligning.alignment;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import org.dkpro.statistics.agreement.aligning.AlignableAnnotationUnit;
+import org.dkpro.statistics.agreement.aligning.data.AlignableAnnotationTextUnit;
+import org.dkpro.statistics.agreement.aligning.data.Rater;
+import org.dkpro.statistics.agreement.aligning.dissimilarity.IDissimilarity;
+import org.dkpro.statistics.agreement.aligning.dissimilarity.NominalFeatureTextDissimilarity;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Pins the values {@link NominalFeatureTextDissimilarity} yields for pairs of text units whose
+ * text, numeric arguments and labels agree or differ in various combinations.
+ */
+public class NominalFeatureTextDissimilarityTest
+{
+    private static final Rater ANNOTATOR_1 = new Rater("1", 0);
+    private static final Rater ANNOTATOR_2 = new Rater("2", 1);
+
+    @Test
+    void testDissimilarity()
+    {
+        IDissimilarity measure = new NominalFeatureTextDissimilarity();
+
+        // Everything but the rater is the same.
+        assertThat(measure.dissimilarity( //
+                textUnit(ANNOTATOR_1, 0, 3, "Test"), //
+                textUnit(ANNOTATOR_2, 0, 3, "Test"))).isEqualTo(0.0);
+
+        // Only the text differs.
+        assertThat(measure.dissimilarity( //
+                textUnit(ANNOTATOR_1, 0, 3, "Test"), //
+                textUnit(ANNOTATOR_2, 0, 3, "Tester"))).isEqualTo(1.0);
+
+        // Only the numeric arguments differ.
+        AlignableAnnotationUnit shifted = textUnit(ANNOTATOR_1, 3, 4, "Test");
+        assertThat(measure.dissimilarity( //
+                shifted, //
+                textUnit(ANNOTATOR_2, 0, 3, "Test"))).isEqualTo(1.0);
+
+        // A clone carrying an additional "pos" label, compared against its source ...
+        AlignableAnnotationUnit relabelled = shifted.cloneWithDifferentLabel("pos", "A");
+        assertThat(measure.dissimilarity(shifted, relabelled)).isEqualTo(0.5);
+
+        // ... against a unit with the same numeric arguments but another text ...
+        assertThat(measure.dissimilarity( //
+                textUnit(ANNOTATOR_1, 3, 4, "Tester"), //
+                relabelled)).isEqualTo(1.0);
+
+        // ... and against a unit where both the text and the numeric arguments differ.
+        assertThat(measure.dissimilarity( //
+                textUnit(ANNOTATOR_1, 0, 3, "Tester"), //
+                relabelled)).isEqualTo(2.0);
+
+        // A unit compared against itself.
+        assertThat(measure.dissimilarity(relabelled, relabelled)).isEqualTo(0.0);
+
+        // One side missing.
+        assertThat(measure.dissimilarity(relabelled, null)).isEqualTo(1.0);
+        assertThat(measure.dissimilarity(null, relabelled)).isEqualTo(1.0);
+
+        // Both sides missing.
+        assertThat(measure.dissimilarity(null, null)).isEqualTo(0.0);
+    }
+
+    /**
+     * Creates a text unit, exposed through its base type so that every comparison in the test
+     * goes through the same static types.
+     */
+    private static AlignableAnnotationUnit textUnit(Rater rater, int first, int second,
+            String text)
+    {
+        return new AlignableAnnotationTextUnit(rater, first, second, text);
+    }
+}
diff --git a/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/alignment/PairwiseDPTextAlignmentTest.java b/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/alignment/PairwiseDPTextAlignmentTest.java
new file mode 100644
index 0000000..c783eda
--- /dev/null
+++ b/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/alignment/PairwiseDPTextAlignmentTest.java
@@ -0,0 +1,241 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Original source: https://github.com/fab-bar/TextGammaTool.git
+ */
+package org.dkpro.statistics.agreement.aligning.alignment;
+
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.util.List;
+
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests the alignments and the edit-operation counts reported by
+ * {@link PairwiseDPTextAlignment}.
+ * <p>
+ * All inputs are given as strings and converted with {@link String#toCharArray()}; the three
+ * marker characters handed to the constructor are the same in every test.
+ */
+public class PairwiseDPTextAlignmentTest
+{
+    // Marker characters passed as the 3rd to 5th constructor argument. The alignment comments
+    // below show '-' filling gaps and '{' / '}' enclosing units.
+    private static final char GAP = '-';
+    private static final char UNIT_START = '{';
+    private static final char UNIT_END = '}';
+
+    @Test
+    public void testGetAlignments()
+    {
+        List<?> alignments = align("their", "there").getAlignments();
+        assertThat(alignments).hasSize(1);
+
+        // alignment method excludes alignments where units are not aligned
+        // if an optimal alignment exists where units are aligned.
+        // this is returned:
+        // {a--}{bbccc}
+        // {abb}{--ccc}
+        // this is excluded:
+        // {a}{bb--ccc}
+        // {a--bb}{ccc}
+        alignments = align("{a}{bbccc}", "{abb}{ccc}").getAlignments();
+        assertThat(alignments).hasSize(1);
+
+        alignments = align("{a}{bccc}", "{ab}{ccc}").getAlignments();
+        assertThat(alignments).hasSize(1);
+
+        // Test example with many optimal alignments
+        // alignment method only returns alignments with aligned tokens
+        // (i.e. 4 in the example below)
+        alignments = align("{t}{t}{t}{t}", "{aaaa}").getAlignments();
+        assertThat(alignments).hasSize(4);
+
+        // Same example with source and target swapped
+        alignments = align("{aaaa}", "{t}{t}{t}{t}").getAlignments();
+        assertThat(alignments).hasSize(4);
+    }
+
+    @Test
+    public void testGetInsertions()
+    {
+        assertThat(align("Te", "T").getInsertions()).isEqualTo(0);
+        assertThat(align("Tes", "Test").getInsertions()).isEqualTo(1);
+        assertThat(align("their", "there").getInsertions()).isEqualTo(1);
+    }
+
+    @Test
+    public void testGetDeletions()
+    {
+        assertThat(align("Te", "T").getDeletions()).isEqualTo(1);
+        assertThat(align("Tes", "Test").getDeletions()).isEqualTo(0);
+        assertThat(align("their", "there").getDeletions()).isEqualTo(1);
+    }
+
+    @Test
+    public void testGetSubstitutions()
+    {
+        // substitutions are not allowed - always 0
+        assertThat(align("Te", "T").getSubstitutions()).isEqualTo(0);
+
+        // The target used to be a four-slot array with only "Ter" filled in, so a NUL character
+        // trailed the text; the three-character target is the candidate for a substitution.
+        assertThat(align("Tes", "Ter").getSubstitutions()).isEqualTo(0);
+
+        assertThat(align("their", "there").getSubstitutions()).isEqualTo(0);
+    }
+
+    /**
+     * Builds the alignment under test for the given texts.
+     *
+     * @param source
+     *            text passed as the first constructor argument
+     * @param target
+     *            text passed as the second constructor argument
+     * @return a new alignment using the marker characters shared by all tests
+     */
+    private static ITextAlignment align(String source, String target)
+    {
+        return new PairwiseDPTextAlignment(source.toCharArray(), target.toCharArray(), GAP,
+                UNIT_START, UNIT_END);
+    }
+}
diff --git a/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/alignment/UnitaryAlignmentTest.java b/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/alignment/UnitaryAlignmentTest.java
new file mode 100644
index 0000000..b7a63b6
--- /dev/null
+++ b/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/alignment/UnitaryAlignmentTest.java
@@ -0,0 +1,236 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Original source: https://github.com/fab-bar/TextGammaTool.git
+ */
+package org.dkpro.statistics.agreement.aligning.alignment;
+
+import static java.util.Arrays.asList;
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatExceptionOfType;
+import static org.assertj.core.api.Assertions.offset;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.dkpro.statistics.agreement.aligning.AlignableAnnotationUnit;
+import org.dkpro.statistics.agreement.aligning.data.AnnotationSet;
+import org.dkpro.statistics.agreement.aligning.data.Rater;
+import org.dkpro.statistics.agreement.aligning.dissimilarity.NominalFeatureDissimilarity;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests construction, arity, disorder, equality and ordering of {@link UnitaryAlignment}.
+ * <p>
+ * NOTE(review): the collections in this class are created without type arguments (raw
+ * {@code ArrayList}, {@code HashSet}, {@code List}, {@code Set}) - confirm whether that is
+ * intended or whether the type arguments got lost on the way into this patch.
+ */
+public class UnitaryAlignmentTest
+{
+    private static final Rater ANNOTATOR_1 = new Rater("1", 0);
+    private static final Rater ANNOTATOR_2 = new Rater("2", 1);
+    // NOTE(review): second argument is 1, the same value ANNOTATOR_2 uses - looks like a
+    // copy/paste slip (2 expected?); confirm against how Rater uses this value.
+    private static final Rater ANNOTATOR_3 = new Rater("3", 1);
+
+    /**
+     * Two units that both come from {@code ANNOTATOR_1} must be rejected with an
+     * {@link IllegalArgumentException}.
+     */
+    @Test
+    public void testCreationWithAnnotationsFromOneCreator()
+    {
+        var units = new ArrayList(2);
+
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_1, null, 0, 3, null));
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_1, null, 0, 4, null));
+
+        var as = new AnnotationSet(units);
+
+        assertThatExceptionOfType(IllegalArgumentException.class).isThrownBy(() -> {
+            new UnitaryAlignment(as.getUnits(), new HashSet(as.getRaters()));
+        });
+    }
+
+    /**
+     * Units of two raters combined with a rater set that holds only one of them must be
+     * rejected with an {@link IllegalArgumentException}.
+     */
+    @Test
+    public void testCreationWithAnnotationsFromUnknownCreator()
+    {
+        var units = asList( //
+                new AlignableAnnotationUnit(ANNOTATOR_1, null, 0, 3, null), //
+                new AlignableAnnotationUnit(ANNOTATOR_2, null, 0, 4, null));
+
+        var as = new AnnotationSet(units);
+
+        assertThatExceptionOfType(IllegalArgumentException.class).isThrownBy(() -> {
+            // Only the first rater the set's iterator yields is passed on.
+            new UnitaryAlignment(as.getUnits(),
+                    new HashSet(asList(as.getRaters().iterator().next())));
+        });
+    }
+
+    /**
+     * An alignment built for two raters reports an arity of 2.
+     */
+    @Test
+    public void testArity()
+    {
+        var units = new ArrayList(2);
+
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_1, null, 0, 3, null));
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_2, null, 0, 3, null));
+
+        var as = new AnnotationSet(units);
+
+        var a = new UnitaryAlignment(as.getUnits(), as.getRaters());
+        assertThat(a.arity()).isEqualTo(2);
+    }
+
+    /**
+     * Checks the disorder under {@link NominalFeatureDissimilarity}: 0.0 for two units built
+     * from the same arguments, and 2/3 (within 0.0001) once a third rater is involved, both with
+     * three units and with a single unit.
+     */
+    @Test
+    public void testGetDisorder()
+    {
+        var units = new ArrayList(3);
+
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_1, null, 0, 3, null));
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_2, null, 0, 3, null));
+
+        var as = new AnnotationSet(units);
+
+        var a = new UnitaryAlignment(as.getUnits(), as.getRaters());
+        assertThat(a.getDisorder(new NominalFeatureDissimilarity())).isEqualTo(0.0);
+
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_3, null, 3, 4, null));
+
+        as = new AnnotationSet(units);
+
+        a = new UnitaryAlignment(as.getUnits(), as.getRaters());
+        assertThat(a.getDisorder(new NominalFeatureDissimilarity())).isCloseTo(2 / 3.0,
+                offset(0.0001));
+
+        // Only the third rater's unit is supplied, but the rater set still comes from the
+        // three-unit annotation set created above.
+        units = new ArrayList(1);
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_3, null, 3, 4, null));
+
+        a = new UnitaryAlignment(units, as.getRaters());
+        assertThat(a.getDisorder(new NominalFeatureDissimilarity())).isCloseTo(2 / 3.0,
+                offset(0.0001));
+    }
+
+    /**
+     * Alignments must differ when the rater set, the raters of the units, or the units
+     * themselves differ, and must be equal when rebuilt from equal units and the same raters.
+     * <p>
+     * The {@code units} list is cleared and refilled after alignments have been created from
+     * it; the assertions on {@code a} presumably rely on {@link UnitaryAlignment} not keeping a
+     * live reference to that list - TODO confirm.
+     */
+    @Test
+    public void testEqualsObject()
+    {
+        var units = new ArrayList(2);
+
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_1, null, 0, 3, null));
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_2, null, 0, 3, null));
+
+        var annotators = new HashSet();
+
+        annotators.add(ANNOTATOR_1);
+        annotators.add(ANNOTATOR_2);
+
+        UnitaryAlignment a = new UnitaryAlignment(units, annotators);
+
+        annotators.add(ANNOTATOR_3);
+
+        UnitaryAlignment b = new UnitaryAlignment(units, annotators);
+
+        // different arity
+        assertThat(a).isNotEqualTo(b);
+
+        units.clear();
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_1, null, 0, 3, null));
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_3, null, 0, 3, null));
+
+        annotators = new HashSet();
+
+        annotators.add(ANNOTATOR_1);
+        annotators.add(ANNOTATOR_3);
+
+        UnitaryAlignment c = new UnitaryAlignment(units, annotators);
+
+        // different annotators
+        assertThat(a).isNotEqualTo(c);
+
+        units.clear();
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_1, null, 0, 3, null));
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_2, null, 0, 2, null));
+
+        annotators = new HashSet();
+
+        annotators.add(ANNOTATOR_1);
+        annotators.add(ANNOTATOR_2);
+
+        UnitaryAlignment d = new UnitaryAlignment(units, annotators);
+
+        // different units
+        assertThat(a).isNotEqualTo(d);
+
+        units.clear();
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_1, null, 0, 3, null));
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_2, null, 0, 3, null));
+
+        UnitaryAlignment e = new UnitaryAlignment(units, annotators);
+
+        // equal alignments
+        assertThat(a).isEqualTo(e);
+    }
+
+    /**
+     * Mirrors {@link #testEqualsObject()} for {@code compareTo}: the expected results are
+     * exactly -1 / 1 for alignments that differ and 0 for equal ones.
+     * <p>
+     * As in the equality test, the {@code units} list is cleared and refilled between the
+     * individual alignments.
+     */
+    @Test
+    public void testCompareToUnitaryAlignment()
+    {
+
+        List units = new ArrayList(2);
+
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_1, null, 0, 3, null));
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_2, null, 0, 3, null));
+
+        Set annotators = new HashSet();
+
+        annotators.add(ANNOTATOR_1);
+        annotators.add(ANNOTATOR_2);
+
+        UnitaryAlignment a = new UnitaryAlignment(units, annotators);
+
+        annotators.add(ANNOTATOR_3);
+
+        UnitaryAlignment b = new UnitaryAlignment(units, annotators);
+
+        // different arity
+        assertThat(a.compareTo(b)).isEqualTo(-1);
+        assertThat(b.compareTo(a)).isEqualTo(1);
+
+        units.clear();
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_1, null, 0, 3, null));
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_3, null, 0, 3, null));
+
+        annotators = new HashSet();
+
+        annotators.add(ANNOTATOR_1);
+        annotators.add(ANNOTATOR_3);
+
+        UnitaryAlignment c = new UnitaryAlignment(units, annotators);
+
+        // different annotators
+        assertThat(a.compareTo(c)).isEqualTo(-1);
+        assertThat(c.compareTo(a)).isEqualTo(1);
+
+        units.clear();
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_1, null, 0, 3, null));
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_2, null, 0, 2, null));
+
+        annotators = new HashSet();
+
+        annotators.add(ANNOTATOR_1);
+        annotators.add(ANNOTATOR_2);
+
+        UnitaryAlignment d = new UnitaryAlignment(units, annotators);
+
+        // different units
+        assertThat(d.compareTo(a)).isEqualTo(-1);
+        assertThat(a.compareTo(d)).isEqualTo(1);
+
+        units.clear();
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_1, null, 0, 3, null));
+        units.add(new AlignableAnnotationUnit(ANNOTATOR_2, null, 0, 3, null));
+
+        UnitaryAlignment e = new UnitaryAlignment(units, annotators);
+
+        // equal alignments
+        assertThat(a.compareTo(e)).isEqualTo(0);
+    }
+}
diff --git a/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/data/AnnotatedTextMergeTest.java b/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/data/AnnotatedTextMergeTest.java
new file mode 100644
index 0000000..db2bb4e
--- /dev/null
+++ b/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/data/AnnotatedTextMergeTest.java
@@ -0,0 +1,128 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Original source: https://github.com/fab-bar/TextGammaTool.git
+ */
+package org.dkpro.statistics.agreement.aligning.data;
+
+import static java.util.Arrays.asList;
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.dkpro.statistics.agreement.aligning.data.AnnotatedTextMerge.mergeAnnotatedTextsWithSegmentation;
+
+import org.dkpro.statistics.agreement.aligning.alignment.Alignment;
+import org.dkpro.statistics.agreement.aligning.dissimilarity.NominalFeatureTextDissimilarity;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests {@link AnnotatedTextMerge#mergeAnnotatedTextsWithSegmentation} on two annotated texts,
+ * checking how many alignments come back and, where exactly one does, its disorder under
+ * {@link NominalFeatureTextDissimilarity}.
+ */
+class AnnotatedTextMergeTest
+{
+    private static final Rater ANNOTATOR_A = new Rater("A", 0);
+    private static final Rater ANNOTATOR_B = new Rater("B", 1);
+
+    @Test
+    void testMergeAnnotatedTextsWithSegmentation()
+    {
+        // Case 1: identical texts
+        var annots1 = asList( //
+                new AlignableAnnotationTextUnit(ANNOTATOR_A, 0, 4, "kauf"), //
+                new AlignableAnnotationTextUnit(ANNOTATOR_A, 4, 8, "mann"));
+        var text1 = new AnnotatedText("kaufmann", annots1);
+
+        var annots2 = asList( //
+                new AlignableAnnotationTextUnit(ANNOTATOR_B, 0, 4, "kauf"), //
+                new AlignableAnnotationTextUnit(ANNOTATOR_B, 4, 8, "mann"));
+        var text2 = new AnnotatedText("kaufmann", annots2);
+
+        var harm = mergeAnnotatedTextsWithSegmentation(text1, text2);
+
+        assertThat(harm).hasSize(1);
+        Alignment al = harm.toArray(new Alignment[0])[0];
+        assertThat(al.getDisorder(new NominalFeatureTextDissimilarity())).isEqualTo(0);
+
+        // Case 2: Alignable but different TextUnits
+        annots1 = asList( //
+                new AlignableAnnotationTextUnit(ANNOTATOR_A, 0, 5, "kauff"), //
+                new AlignableAnnotationTextUnit(ANNOTATOR_A, 5, 9, "mann"));
+        text1 = new AnnotatedText("kauffmann", annots1);
+
+        annots2 = asList( //
+                new AlignableAnnotationTextUnit(ANNOTATOR_B, 0, 4, "kauf"), //
+                new AlignableAnnotationTextUnit(ANNOTATOR_B, 4, 9, "mmann"));
+        text2 = new AnnotatedText("kaufmmann", annots2);
+
+        harm = mergeAnnotatedTextsWithSegmentation(text1, text2);
+
+        assertThat(harm).hasSize(1);
+        al = harm.toArray(new Alignment[0])[0];
+        assertThat(al.getDisorder(new NominalFeatureTextDissimilarity())).isEqualTo(1);
+
+        // Case 3: segmentation completely different
+        annots1 = asList( //
+                new AlignableAnnotationTextUnit(ANNOTATOR_A, 0, 1, "k"), //
+                new AlignableAnnotationTextUnit(ANNOTATOR_A, 1, 8, "aufmann"));
+        text1 = new AnnotatedText("kaufmann", annots1);
+
+        annots2 = asList( //
+                new AlignableAnnotationTextUnit(ANNOTATOR_B, 0, 4, "kauf"), //
+                new AlignableAnnotationTextUnit(ANNOTATOR_B, 4, 8, "mann"));
+        text2 = new AnnotatedText("kaufmann", annots2);
+
+        harm = mergeAnnotatedTextsWithSegmentation(text1, text2);
+
+        assertThat(harm).hasSize(1);
+        al = harm.toArray(new Alignment[0])[0];
+        assertThat(al.getDisorder(new NominalFeatureTextDissimilarity())).isEqualTo(2);
+
+        // Case 4: two possible alignments - the shorter text matches either occurrence of
+        // "sonst" in the longer one
+        annots1 = asList( //
+                new AlignableAnnotationTextUnit(ANNOTATOR_A, 0, 5, "a"), //
+                new AlignableAnnotationTextUnit(ANNOTATOR_A, 6, 11, "b"));
+        text1 = new AnnotatedText("sonst sonst", annots1);
+
+        annots2 = asList( //
+                new AlignableAnnotationTextUnit(ANNOTATOR_B, 0, 5, "a"));
+        text2 = new AnnotatedText("sonst", annots2);
+
+        harm = mergeAnnotatedTextsWithSegmentation(text1, text2);
+
+        assertThat(harm).hasSize(2);
+
+        // Case 5: three possible alignments
+        annots1 = asList( //
+                new AlignableAnnotationTextUnit(ANNOTATOR_A, 0, 2, "a"), //
+                new AlignableAnnotationTextUnit(ANNOTATOR_A, 3, 5, "b"), //
+                new AlignableAnnotationTextUnit(ANNOTATOR_A, 6, 8, "c"));
+        text1 = new AnnotatedText("so so so", annots1);
+
+        annots2 = asList( //
+                new AlignableAnnotationTextUnit(ANNOTATOR_B, 0, 2, "b"), //
+                new AlignableAnnotationTextUnit(ANNOTATOR_B, 3, 5, "c"));
+        text2 = new AnnotatedText("so so", annots2);
+
+        harm = mergeAnnotatedTextsWithSegmentation(text1, text2);
+
+        assertThat(harm).hasSize(3);
+    }
+
+}
diff --git a/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/data/AnnotationSetTest.java b/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/data/AnnotationSetTest.java
new file mode 100644
index 0000000..c56a7b6
--- /dev/null
+++ b/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/data/AnnotationSetTest.java
@@ -0,0 +1,150 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Original source: https://github.com/fab-bar/TextGammaTool.git
+ */
+package org.dkpro.statistics.agreement.aligning.data;
+
+import static java.util.Arrays.asList;
+import static org.assertj.core.api.Assertions.assertThat;
+
+import java.util.Arrays;
+
+import org.dkpro.statistics.agreement.aligning.AlignableAnnotationUnit;
+import org.junit.jupiter.api.Test;
+
+/**
+ * Tests the counting, lookup and equality operations of {@link AnnotationSet}.
+ */
+class AnnotationSetTest
+{
+    private static final Rater ANNOTATOR_A = new Rater("A", 0);
+    private static final Rater ANNOTATOR_B = new Rater("B", 1);
+
+    /**
+     * Fixture shared by several tests: two units of rater A and one unit of rater B. Fresh unit
+     * objects are created on every call.
+     */
+    private static AlignableAnnotationUnit[] twoFromAOneFromB()
+    {
+        return new AlignableAnnotationUnit[] { //
+                new AlignableAnnotationUnit(ANNOTATOR_A, 2, 3, null), //
+                new AlignableAnnotationUnit(ANNOTATOR_B, 1, 2, null), //
+                new AlignableAnnotationUnit(ANNOTATOR_A, 1, 2, null) };
+    }
+
+    @Test
+    public void testGetNumberOfAnnotators()
+    {
+        var annotationSet = new AnnotationSet(asList(twoFromAOneFromB()));
+
+        assertThat(annotationSet.getRaterCount()).isEqualTo(2);
+    }
+
+    @Test
+    public void testGetNumberOfAnnotations()
+    {
+        // Rater A contributes the very same unit twice; the expected count is 2, not 3.
+        var annotationSet = new AnnotationSet(asList( //
+                new AlignableAnnotationUnit(ANNOTATOR_A, 2, 3, null), //
+                new AlignableAnnotationUnit(ANNOTATOR_B, 2, 3, null), //
+                new AlignableAnnotationUnit(ANNOTATOR_A, 2, 3, null)));
+
+        assertThat(annotationSet.getUnitCount()).isEqualTo(2);
+    }
+
+    @Test
+    public void testGetAverageNumberOfAnnotations()
+    {
+        var annotationSet = new AnnotationSet(asList(twoFromAOneFromB()));
+
+        // Three units spread over two raters.
+        assertThat(annotationSet.getAverageNumberOfAnnotations()).isEqualTo(1.5);
+    }
+
+    @Test
+    public void testGetAnnotators()
+    {
+        var annotationSet = new AnnotationSet(asList(twoFromAOneFromB()));
+
+        assertThat(annotationSet.getRaters()) //
+                .containsExactlyInAnyOrder(ANNOTATOR_A, ANNOTATOR_B);
+    }
+
+    @Test
+    public void testGetUnits()
+    {
+        AlignableAnnotationUnit[] supplied = { //
+                new AlignableAnnotationUnit(null, 2, 3, null), //
+                new AlignableAnnotationUnit(null, 1, 2, null) };
+
+        var annotationSet = new AnnotationSet(asList(supplied));
+
+        // As supplied, the array does not match what the set hands back ...
+        var stored = annotationSet.getUnits().toArray(AlignableAnnotationUnit[]::new);
+        assertThat(Arrays.equals(stored, supplied)).isFalse();
+
+        // ... but after sorting it does, element by element.
+        Arrays.sort(supplied);
+        assertThat(annotationSet.getUnits()).containsExactly(supplied);
+    }
+
+    @Test
+    public void testGetAnnotationsType()
+    {
+        var annotationSet = new AnnotationSet(asList( //
+                new AlignableAnnotationUnit(null, "A", 2, 3, null), //
+                new AlignableAnnotationUnit(null, "", 1, 2, null), //
+                new AlignableAnnotationUnit(null, null, 2, 3, null)));
+
+        var unitsOfTypeA = annotationSet.getUnitsWithType("A");
+        assertThat(unitsOfTypeA).hasSize(1);
+
+        // The unit created with a null type is expected to be found under "" as well.
+        assertThat(annotationSet.getUnitsWithType("")).hasSize(2);
+        assertThat(annotationSet.getUnitsWithType("A"))
+                .isNotEqualTo(annotationSet.getUnitsWithType(""));
+    }
+
+    @Test
+    public void testGetAnnotationsCreator()
+    {
+        var named = ANNOTATOR_A;
+        var blank = new Rater("", -1);
+
+        var annotationSet = new AnnotationSet(asList( //
+                new AlignableAnnotationUnit(named, null, 2, 3, null), //
+                new AlignableAnnotationUnit(blank, null, 1, 2, null), //
+                new AlignableAnnotationUnit(null, null, 2, 3, null)));
+
+        assertThat(annotationSet.getUnitsWithRater(named)).hasSize(1);
+
+        // The unit created with a null rater is expected to be found under the blank rater too.
+        assertThat(annotationSet.getUnitsWithRater(blank)).hasSize(2);
+        assertThat(annotationSet.getUnitsWithRater(named))
+                .isNotEqualTo(annotationSet.getUnitsWithRater(blank));
+    }
+
+    @Test
+    public void testEquals()
+    {
+        // Both sets are built from one and the same list instance.
+        var shared = asList(twoFromAOneFromB());
+        var reference = new AnnotationSet(shared);
+        var sameContent = new AnnotationSet(shared);
+
+        // Third unit attributed to rater B instead of rater A.
+        var otherRater = new AnnotationSet(asList( //
+                new AlignableAnnotationUnit(ANNOTATOR_A, 2, 3, null), //
+                new AlignableAnnotationUnit(ANNOTATOR_B, 1, 2, null), //
+                new AlignableAnnotationUnit(ANNOTATOR_B, 1, 2, null)));
+
+        // First unit created with (1, 3) instead of (2, 3).
+        var otherExtent = new AnnotationSet(asList( //
+                new AlignableAnnotationUnit(ANNOTATOR_A, 1, 3, null), //
+                new AlignableAnnotationUnit(ANNOTATOR_B, 1, 2, null), //
+                new AlignableAnnotationUnit(ANNOTATOR_A, 1, 2, null)));
+
+        assertThat(reference).isEqualTo(reference);
+        assertThat(reference).isEqualTo(sameContent);
+        assertThat(sameContent).isEqualTo(reference);
+        assertThat(reference).isNotEqualTo(null);
+        assertThat(reference).isNotEqualTo(otherRater);
+        assertThat(otherRater).isNotEqualTo(reference);
+        assertThat(reference).isNotEqualTo(otherExtent);
+        assertThat(otherExtent).isNotEqualTo(reference);
+    }
+}
diff --git a/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/data/TextUnitTest.java b/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/data/TextUnitTest.java
new file mode 100644
index 0000000..f6fe182
--- /dev/null
+++ b/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/data/TextUnitTest.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * + * Original source: https://github.com/fab-bar/TextGammaTool.git + */ +package org.dkpro.statistics.agreement.aligning.data; + +import static org.assertj.core.api.Assertions.assertThat; + +import org.dkpro.statistics.agreement.aligning.AlignableAnnotationUnit; +import org.junit.jupiter.api.Test; + +class TextUnitTest /* Equality checks for AlignableAnnotationTextUnit. */ +{ + @Test + void testEqualsObject() + { + var tok1 = new AlignableAnnotationTextUnit(null, 0, 1, "a"); + var tok2 = new AlignableAnnotationTextUnit(null, 0, 1, "a"); + var tok3 = new AlignableAnnotationTextUnit(null, 0, 1, null); /* differs from tok1/tok2 only in the last argument */ + + assertThat(tok1).isEqualTo(tok2); + assertThat(tok2).isNotEqualTo(tok3); + + var tok4 = new AlignableAnnotationUnit(null, "textunit", 0, 1, null); /* plain unit with the same offsets; expected to be unequal to a text unit in both directions */ + + assertThat(tok1).isNotEqualTo(tok4); + assertThat(tok4).isNotEqualTo(tok1); + } +} diff --git a/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/data/UnitTest.java b/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/data/UnitTest.java new file mode 100644 index 0000000..7c63922 --- /dev/null +++ b/dkpro-statistics-agreement/src/test/java/org/dkpro/statistics/agreement/aligning/data/UnitTest.java @@ -0,0 +1,145 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ * + * Original source: https://github.com/fab-bar/TextGammaTool.git + */ +package org.dkpro.statistics.agreement.aligning.data; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.assertj.core.api.Assertions.assertThatExceptionOfType; + +import java.util.HashMap; + +import org.dkpro.statistics.agreement.aligning.AlignableAnnotationUnit; +import org.junit.jupiter.api.Test; + +public class UnitTest /* Checks construction, equals, overlaps, compareTo and hashCode of AlignableAnnotationUnit. */ +{ + private static final Rater ANNOTATOR_1 = new Rater("1", 0); + private static final Rater ANNOTATOR_2 = new Rater("2", 1); + + @Test + public void createUnitBeginBiggerEnd() /* a begin (1) after the end (0) must raise IllegalArgumentException */ + { + assertThatExceptionOfType(IllegalArgumentException.class).isThrownBy(() -> { + new AlignableAnnotationUnit(null, 1, 0, null); + }); + } + + @Test + public void testEqualsObject() + { + assertThat(new AlignableAnnotationUnit(ANNOTATOR_1, 0, 3, null)) + .isEqualTo(new AlignableAnnotationUnit(ANNOTATOR_1, null, 0, 3, null)); + assertThat(new AlignableAnnotationUnit(ANNOTATOR_1, "Test", 0, 3, null)) + .isEqualTo(new AlignableAnnotationUnit(ANNOTATOR_1, "Test", 0, 3, null)); + assertThat(new AlignableAnnotationUnit(ANNOTATOR_1, "Test", 0, 3, null)) + .isNotEqualTo(new AlignableAnnotationUnit(null, "Test", 0, 3, null)); + assertThat(new AlignableAnnotationUnit(null, null, 0, 3, null)) + .isNotEqualTo(new AlignableAnnotationUnit(null, "Test", 0, 3, null)); + assertThat(new AlignableAnnotationUnit(null, null, 0, 3, null)) + .isNotEqualTo(new AlignableAnnotationUnit(null, null, 1, 3, null)); + } + + @Test + public void testOverlaps() /* shared offsets overlap; spans that only touch (0-3 vs 3-5) or are disjoint do not */ + { + assertThat(new AlignableAnnotationUnit(null, null, 0, 3, null) + .overlaps(new AlignableAnnotationUnit(null, null, 2, 4, null))).isTrue(); + assertThat(new AlignableAnnotationUnit(null, null, 0, 3, null) + .overlaps(new AlignableAnnotationUnit(null, null, 0, 3, null))).isTrue(); + assertThat(new AlignableAnnotationUnit(null, null, 0, 3, null) + .overlaps(new AlignableAnnotationUnit(null, null, 3, 5, null))).isFalse(); + assertThat(new
AlignableAnnotationUnit(null, null, 0, 3, null) + .overlaps(new AlignableAnnotationUnit(null, null, 4, 5, null))).isFalse(); + assertThat(new AlignableAnnotationUnit(null, null, 4, 5, null) + .overlaps(new AlignableAnnotationUnit(null, null, 0, 3, null))).isFalse(); + assertThat(new AlignableAnnotationUnit(null, null, 4, 5, null) + .overlaps(new AlignableAnnotationUnit(null, null, 0, 4, null))).isFalse(); + } + + @Test + public void testCompareToUnit() /* smaller begin sorts first, then smaller end; with equal offsets a null type, rater or feature map sorts before a non-null one */ + { + assertThat(new AlignableAnnotationUnit(null, null, 0, 3, null)) + .isLessThan(new AlignableAnnotationUnit(null, null, 2, 4, null)); + assertThat(new AlignableAnnotationUnit(null, null, 3, 4, null)) + .isGreaterThan(new AlignableAnnotationUnit(null, null, 2, 4, null)); + assertThat(new AlignableAnnotationUnit(null, null, 2, 3, null)) + .isLessThan(new AlignableAnnotationUnit(null, null, 2, 4, null)); + assertThat(new AlignableAnnotationUnit(null, null, 2, 4, null)) + .isGreaterThan(new AlignableAnnotationUnit(null, null, 2, 3, null)); + + assertThat(new AlignableAnnotationUnit(null, "a", 2, 4, null)) + .isLessThan(new AlignableAnnotationUnit(null, "b", 2, 4, null)); + assertThat(new AlignableAnnotationUnit(null, "b", 2, 4, null)) + .isGreaterThan(new AlignableAnnotationUnit(null, "a", 2, 4, null)); + + assertThat(new AlignableAnnotationUnit(null, null, 2, 4, null)) + .isLessThan(new AlignableAnnotationUnit(null, "b", 2, 4, null)); + assertThat(new AlignableAnnotationUnit(null, "a", 2, 4, null)) + .isGreaterThan(new AlignableAnnotationUnit(null, null, 2, 4, null)); + + assertThat(new AlignableAnnotationUnit(ANNOTATOR_1, null, 2, 4, null)) + .isLessThan(new AlignableAnnotationUnit(ANNOTATOR_2, null, 2, 4, null)); + assertThat(new AlignableAnnotationUnit(ANNOTATOR_2, null, 2, 4, null)) + .isGreaterThan(new AlignableAnnotationUnit(ANNOTATOR_1, null, 2, 4, null)); + assertThat(new AlignableAnnotationUnit(null, null, 2, 4, null)) + .isLessThan(new AlignableAnnotationUnit(ANNOTATOR_2, null, 2, 4, null)); +
assertThat(new AlignableAnnotationUnit(ANNOTATOR_2, null, 2, 4, null)) + .isGreaterThan(new AlignableAnnotationUnit(null, null, 2, 4, null)); + + HashMap<String, String> a1 = new HashMap<>(); + HashMap<String, String> a2 = new HashMap<>(); + HashMap<String, String> b = new HashMap<>(); + + a1.put("a", "a"); + a2.put("a", "b"); + b.put("b", "a"); /* a1 and a2 share the key "a" with different values; b uses a different key */ + + assertThat(new AlignableAnnotationUnit(null, null, 2, 4, a1)) + .isLessThan(new AlignableAnnotationUnit(null, null, 2, 4, b)); + assertThat(new AlignableAnnotationUnit(null, null, 2, 4, b)) + .isGreaterThan(new AlignableAnnotationUnit(null, null, 2, 4, a1)); + assertThat(new AlignableAnnotationUnit(null, null, 2, 4, a1)) + .isLessThan(new AlignableAnnotationUnit(null, null, 2, 4, a2)); + assertThat(new AlignableAnnotationUnit(null, null, 2, 4, a2)) + .isGreaterThan(new AlignableAnnotationUnit(null, null, 2, 4, a1)); + assertThat(new AlignableAnnotationUnit(null, null, 2, 4, null)) + .isLessThan(new AlignableAnnotationUnit(null, null, 2, 4, a1)); + assertThat(new AlignableAnnotationUnit(null, null, 2, 4, a1)) + .isGreaterThan(new AlignableAnnotationUnit(null, null, 2, 4, null)); + + assertThat(new AlignableAnnotationUnit(null, null, 2, 4, null) + .compareTo(new AlignableAnnotationUnit(null, null, 2, 4, null))).isZero(); + + } + + @Test + public void testHash() /* equal constructor arguments, including equal-content feature maps, must give equal hash codes */ + { + assertThat(new AlignableAnnotationUnit(ANNOTATOR_1, 0, 3, null).hashCode()) + .isEqualTo(new AlignableAnnotationUnit(ANNOTATOR_1, null, 0, 3, null).hashCode()); + assertThat(new AlignableAnnotationUnit(ANNOTATOR_1, "Test", 0, 3, null).hashCode()) + .isEqualTo(new AlignableAnnotationUnit(ANNOTATOR_1, "Test", 0, 3, null).hashCode()); + + var a1 = new HashMap<String, String>(); + var a2 = new HashMap<String, String>(); + + a1.put("a", "b"); + a2.put("a", "b"); + + assertThat(new AlignableAnnotationUnit(null, null, 2, 4, a1).hashCode()) + .isEqualTo(new AlignableAnnotationUnit(null, null, 2, 4, a2).hashCode()); + } +} diff --git a/dkpro-statistics-agreement/src/test/resources/simplelogger.properties
b/dkpro-statistics-agreement/src/test/resources/simplelogger.properties new file mode 100644 index 0000000..ed7e3ff --- /dev/null +++ b/dkpro-statistics-agreement/src/test/resources/simplelogger.properties @@ -0,0 +1,34 @@ +# SLF4J's SimpleLogger configuration file +# Simple implementation of Logger that sends all enabled log messages, for all defined loggers, to System.err. + +# Default logging detail level for all instances of SimpleLogger. +# Must be one of ("trace", "debug", "info", "warn", or "error"). +# If not specified, defaults to "info". +# org.slf4j.simpleLogger.defaultLogLevel=TRACE + +# Logging detail level for a SimpleLogger instance named "xxxxx". +# Must be one of ("trace", "debug", "info", "warn", or "error"). +# If not specified, the default logging detail level is used. +#org.slf4j.simpleLogger.log.xxxxx= + +# Set to true if you want the current date and time to be included in output messages. +# Default is false, and will output the number of milliseconds elapsed since startup. +#org.slf4j.simpleLogger.showDateTime=false + +# The date and time format to be used in the output messages. +# The pattern describing the date and time format is the same that is used in java.text.SimpleDateFormat. +# If the format is not specified or is invalid, the default format is used. +# The default format is yyyy-MM-dd HH:mm:ss:SSS Z. +#org.slf4j.simpleLogger.dateTimeFormat=yyyy-MM-dd HH:mm:ss:SSS Z + +# Set to true if you want to output the current thread name. +# Defaults to true. +#org.slf4j.simpleLogger.showThreadName=true + +# Set to true if you want the Logger instance name to be included in output messages. +# Defaults to true. +#org.slf4j.simpleLogger.showLogName=true + +# Set to true if you want the last component of the name to be included in output messages. +# Defaults to false. 
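+#org.slf4j.simpleLogger.showShortLogName=false diff --git a/pom.xml b/pom.xml index eec3d55..a7b13c3 100644 --- a/pom.xml +++ b/pom.xml @@ -88,7 +88,7 @@ <dependency> <groupId>org.apache.commons</groupId> <artifactId>commons-math3</artifactId> - <version>3.2</version> + <version>3.6.1</version> </dependency> <dependency> <groupId>org.apache.commons</groupId>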