Skip to content

Commit

Permalink
Repaired Dice Coefficient (issue #603) (#605)
Browse files Browse the repository at this point in the history
* Repaired Dice Coefficient (issue #603)

* Lint corrections

* Lint corrections
  • Loading branch information
Hugo-ter-Doest authored Jul 26, 2021
1 parent 173a4a8 commit 8b17010
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 48 deletions.
77 changes: 29 additions & 48 deletions lib/natural/distance/dice_coefficient.js
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
Copyright (c) 2011, John Crepezzi, Chris Umbel
Copyright (c) 2021, Hugo W.L. ter Doest
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
Expand All @@ -22,63 +22,44 @@ THE SOFTWARE.

'use strict'

// Get all of the pairs of letters for a string
const letterPairs = function (str) {
if (str.length === 0) {
return []
function getBigrams (str) {
let str1 = str
// pad with a space if str consists of one character
if (str.length === 1) {
str1 = str + ' '
}
const numPairs = str.length - 1
const pairs = new Array(numPairs)
for (let i = 0; i < numPairs; i++) {
pairs[i] = str.substring(i, i + 2)
const bigrams = new Set()
const length = str1.length
for (let i = 0; i < length - 1; i++) {
const bigram = str1.slice(i, i + 2)
bigrams.add(bigram)
}
return pairs
return bigrams
}

// Get all of the pairs in all of the words for a string
const wordLetterPairs = function (str) {
const allPairs = []; let pairs
const words = str.split(/\s+/)
for (let i = 0; i < words.length; i++) {
pairs = letterPairs(words[i])
allPairs.push.apply(allPairs, pairs)
}
return allPairs
/**
 * Compute the intersection of two sets.
 *
 * @param {Set} set1 - Set whose members are walked in insertion order.
 * @param {Set} set2 - Set used for membership checks via `has`.
 * @returns {Set} A new Set holding every member of `set1` that is also in
 *   `set2`; neither input is modified.
 */
function intersect (set1, set2) {
  const shared = new Set()
  for (const member of set1) {
    // Skip anything the second set does not contain
    if (!set2.has(member)) {
      continue
    }
    shared.add(member)
  }
  return shared
}

// Perform some sanitization steps
const sanitize = function (str) {
return str.toLowerCase().replace(/^\s+|\s+$/g, '')
/**
 * Normalize a string for comparison.
 *
 * Converts the input to lower case, removes whitespace at the beginning and
 * end, and replaces every run of whitespace in the middle by a single space.
 *
 * @param {string} str - Raw input text.
 * @returns {string} The normalized text.
 */
function sanitize (str) {
  // The pattern must be the whitespace class `\s`; a bare `s` would match the
  // letter "s" and turn e.g. "mississippi" into "mi i ippi".
  return str.toLowerCase().trim().replace(/\s+/g, ' ')
}

// Compare two strings, and spit out a number from 0-1
const compare = function (str1, str2) {
function diceCoefficient (str1, str2) {
const sanitizedStr1 = sanitize(str1)
const sanitizedStr2 = sanitize(str2)
const pairs1 = wordLetterPairs(sanitizedStr1)
const pairs2 = wordLetterPairs(sanitizedStr2)
let intersection = 0; const union = pairs1.length + pairs2.length
if (union === 0) {
if (sanitizedStr1 === sanitizedStr2) {
return 1
} else {
return 0
}
} else {
let i, j, pair1, pair2
for (i = 0; i < pairs1.length; i++) {
pair1 = pairs1[i]
for (j = 0; j < pairs2.length; j++) {
pair2 = pairs2[j]
if (pair1 === pair2) {
intersection++
delete pairs2[j]
break
}
}
}
return 2 * intersection / union
}
const bigrams1 = getBigrams(sanitizedStr1)
const bigrams2 = getBigrams(sanitizedStr2)
return (2 * intersect(bigrams1, bigrams2).size) / (bigrams1.size + bigrams2.size)
}

module.exports = compare
module.exports = diceCoefficient
6 changes: 6 additions & 0 deletions spec/dice_coefficient_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,4 +28,10 @@ describe('dice', function () {
it('should sanitize spacing', function () {
  // NOTE(review): both arguments are identical as shown here; presumably one
  // of them originally contained extra whitespace - confirm against the repo.
  const score = dice('the space', 'the space')
  expect(score).toBe(1)
})

it('should compare complete texts', function () {
  // Load the `text` field of two JSON fixtures and compare them in full;
  // the expected score is pinned to an exact value.
  const revolution = require('./test_data/Wikipedia_EN_FrenchRevolution.json')
  const influence = require('./test_data/Wikipedia_EN_InfluenceOfTheFrenchRevolution.json')
  const score = dice(revolution.text, influence.text)
  expect(score).toBe(0.7897503285151117)
})
})
3 changes: 3 additions & 0 deletions spec/test_data/Wikipedia_EN_FrenchRevolution.json

Large diffs are not rendered by default.

Large diffs are not rendered by default.

0 comments on commit 8b17010

Please sign in to comment.