Skip to content

Commit

Permalink
Refs #647: refactors exact match into only one object
Browse files Browse the repository at this point in the history
  • Loading branch information
janehmueller committed Jun 6, 2018
1 parent 8d2faf4 commit 241c7f3
Show file tree
Hide file tree
Showing 9 changed files with 18 additions and 31 deletions.
4 changes: 2 additions & 2 deletions src/main/resources/configs/deduplication_dbpedia.xml
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@
<attribute>
<key>id_dbpedia</key>
<feature>
<similarityMeasure>ExactMatchString</similarityMeasure>
<similarityMeasure>ExactMatch</similarityMeasure>
<scale>1</scale>
</feature>
</attribute>
<attribute>
<key>id_wikidata</key>
<feature>
<similarityMeasure>ExactMatchString</similarityMeasure>
<similarityMeasure>ExactMatch</similarityMeasure>
<scale>1</scale>
</feature>
</attribute>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,25 +18,14 @@ package de.hpi.ingestion.deduplication.similarity

/**
* A binary similarity measure for exact matching
* @tparam T the type of data to be compared
*/
abstract class ExactMatch[T] extends SimilarityMeasure[T] {
object ExactMatch extends SimilarityMeasure[Any] {
/**
* Compares the given objects for an exact match
* @param x object to be compared to y
* @param y object to be compared to x
* @param u not used by this measure
* @return 1.0 if given objects match exactly, 0.0 otherwise
*/
override def compare(x: T, y: T, u: Int = 1) = if(x == y) 1.0 else 0.0
override def compare(x: Any, y: Any, u: Int = 1) = if(x == y) 1.0 else 0.0
}

/**
* A specific exact match similarity measure comparing strings
*/
object ExactMatchString extends ExactMatch[String]

/**
* A specific exact match similarity measure comparing Doubles
*/
object ExactMatchDouble extends ExactMatch[Double]
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ package de.hpi.ingestion.deduplication.similarity
* Provides a method to measure the similarity of two objects
* @tparam T the type of the objects to be compared
*/
trait SimilarityMeasure[T] extends Serializable {
trait SimilarityMeasure[-T] extends Serializable {

/**
* Calculates a similarity score for two objects
Expand All @@ -38,8 +38,7 @@ trait SimilarityMeasure[T] extends Serializable {
*/
object SimilarityMeasure {
val dataTypes: Map[String, SimilarityMeasure[_]] = Map(
"ExactMatchString" -> ExactMatchString,
"ExactMatchDouble" -> ExactMatchDouble,
"ExactMatch" -> ExactMatch,
"MongeElkan" -> MongeElkan,
"Jaccard" -> Jaccard,
"DiceSorensen" -> DiceSorensen,
Expand All @@ -60,6 +59,6 @@ object SimilarityMeasure {
* @return the requested Similarity Measure if it exists, or else ExactMatch as the default
*/
def get[T](similarityMeasure: String): SimilarityMeasure[T] = {
dataTypes.getOrElse(similarityMeasure, ExactMatchString).asInstanceOf[SimilarityMeasure[T]]
dataTypes.getOrElse(similarityMeasure, ExactMatch).asInstanceOf[SimilarityMeasure[T]]
}
}
2 changes: 1 addition & 1 deletion src/test/resources/defaultDeduplication
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
</item>
<item>
<attribute>name</attribute>
<similartyMeasure>ExactMatchString</similartyMeasure>
<similartyMeasure>ExactMatch</similartyMeasure>
<weight>0.2</weight>
<scale>1</scale>
</item>
Expand Down
2 changes: 1 addition & 1 deletion src/test/resources/framework/test3.xml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
<attribute>
<key>category</key>
<feature>
<similarityMeasure>ExactMatchString</similarityMeasure>
<similarityMeasure>ExactMatch</similarityMeasure>
<scale>1</scale>
</feature>
</attribute>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,12 @@ package de.hpi.ingestion.deduplication
import com.holdenkarau.spark.testing.{RDDComparisons, SharedSparkContext}
import de.hpi.ingestion.deduplication.models.FeatureEntry
import de.hpi.ingestion.deduplication.models.config.SimilarityMeasureConfig
import de.hpi.ingestion.deduplication.similarity.{ExactMatchString, SimilarityMeasure}
import de.hpi.ingestion.deduplication.similarity.{ExactMatch, SimilarityMeasure}
import org.scalatest.{FlatSpec, Matchers}

class FeatureCalculationTest extends FlatSpec with Matchers with SharedSparkContext with RDDComparisons {
"compare" should "calculate a similarity score of two subjects from a given config" in {
val config = SimilarityMeasureConfig[String, SimilarityMeasure[String]](ExactMatchString, 1.0)
val config = SimilarityMeasureConfig[String, SimilarityMeasure[String]](ExactMatch, 1.0)
val attribute = "geo_city"
val subject = TestData.subjects.head.get(attribute)
val staging = TestData.stagings.head.get(attribute)
Expand All @@ -35,7 +35,7 @@ class FeatureCalculationTest extends FlatSpec with Matchers with SharedSparkCont
}

it should "return 0.0 if one of the given subjects doesn't hold a property" in {
val config = SimilarityMeasureConfig[String, SimilarityMeasure[String]](ExactMatchString, 1.0)
val config = SimilarityMeasureConfig[String, SimilarityMeasure[String]](ExactMatch, 1.0)
val attribute = "geo_city"
val subject = TestData.subjects.head.get(attribute)
val staging = TestData.subjects.last.get(attribute)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ class ExactMatchUnitTest extends FlatSpec with Matchers {
("context", "context", 1.0))

testData.foreach(tuple =>
ExactMatchString.compare(tuple._1, tuple._2) shouldEqual tuple._3)
ExactMatch.compare(tuple._1, tuple._2) shouldEqual tuple._3)
}

it should "return 1.0 or 0.0 for given doubles" in {
Expand All @@ -35,6 +35,6 @@ class ExactMatchUnitTest extends FlatSpec with Matchers {
(0.2, 0.4, 0.0))

testData.foreach(tuple =>
ExactMatchDouble.compare(tuple._1, tuple._2) shouldEqual tuple._3)
ExactMatch.compare(tuple._1, tuple._2) shouldEqual tuple._3)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,7 @@ import org.scalatest.{FlatSpec, Matchers}
class SimilarityMeasureTest extends FlatSpec with Matchers {

"Similarity Measure" should "be returned given its name" in {
SimilarityMeasure.get[String]("ExactMatchString") shouldEqual ExactMatchString
SimilarityMeasure.get[Double]("ExactMatchDouble") shouldEqual ExactMatchDouble
SimilarityMeasure.get[String]("ExactMatch") shouldEqual ExactMatch
SimilarityMeasure.get[String]("MongeElkan") shouldEqual MongeElkan
SimilarityMeasure.get[String]("Jaccard") shouldEqual Jaccard
SimilarityMeasure.get[String]("DiceSorensen") shouldEqual DiceSorensen
Expand All @@ -32,6 +31,6 @@ class SimilarityMeasureTest extends FlatSpec with Matchers {
SimilarityMeasure.get[String]("Overlap") shouldEqual Overlap
SimilarityMeasure.get[String]("EuclidianDistance") shouldEqual EuclidianDistance
SimilarityMeasure.get[String]("RelativeNumbersSimilarity") shouldEqual RelativeNumbersSimilarity
SimilarityMeasure.get[String]("Not existing") shouldEqual ExactMatchString
SimilarityMeasure.get[Any]("Not existing") shouldEqual ExactMatch
}
}
4 changes: 2 additions & 2 deletions src/test/scala/de/hpi/ingestion/framework/TestData.scala
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ limitations under the License.
package de.hpi.ingestion.framework

import de.hpi.ingestion.deduplication.models.config.{AttributeConfig, SimilarityMeasureConfig}
import de.hpi.ingestion.deduplication.similarity.{ExactMatchString, JaroWinkler, MongeElkan}
import de.hpi.ingestion.deduplication.similarity.{ExactMatch, JaroWinkler, MongeElkan, SimilarityMeasure}

import scala.xml.{Node, XML}

Expand Down Expand Up @@ -48,7 +48,7 @@ object TestData {
"category",
0.5,
List(
SimilarityMeasureConfig(similarityMeasure = ExactMatchString, weight = 1.0)
SimilarityMeasureConfig(similarityMeasure = ExactMatch, weight = 1.0)
)
)
)
Expand Down

0 comments on commit 241c7f3

Please sign in to comment.