Skip to content

Commit

Permalink
Score repeated bigrams correctly, fixes #1959 (#1994)
Browse files Browse the repository at this point in the history
The current Sorensen-Dice coefficient algorithm does not correctly score strings with repeating bigrams. The score can end up being greater than 1 (the max possible score). This is because the algorithm does not consume bigrams as it matches them. The match count ends up being a count of the cartesian join of the matching bigrams. The revised algorithm in this change will consume bigrams as they are matched, preventing the cartesian join situation and providing correct scores.

Co-authored-by: Tom Larsen <[email protected]>
  • Loading branch information
2 people authored and conker84 committed Jun 15, 2021
1 parent c2be0bc commit 285c21b
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 9 deletions.
44 changes: 35 additions & 9 deletions core/src/main/java/apoc/text/SorensenDiceCoefficient.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,19 +32,33 @@ private static double compute(String input1, String input2, Locale locale) {
return HIGHEST_SCORE;
}

List<Bigram> bigrams1 = allSortedBigrams(words1);
List<Bigram> bigrams2 = allSortedBigrams(words2);
int index1 = 0, index2 = 0, matches = 0;

while (index1 < bigrams1.size() && index2 < bigrams2.size()) {
Bigram bigram1 = bigrams1.get(index1);
Bigram bigram2 = bigrams2.get(index2);
if (bigram1.equals(bigram2)) {
matches++;
index1++;
index2++;
continue;
}
if (bigram1.lessThan(bigram2)) {
index1++;
continue;
}
index2++;
}

List<Bigram> bigrams1 = allBigrams(words1);
List<Bigram> bigrams2 = allBigrams(words2);
long count = bigrams2.stream()
.filter(bigrams1::contains)
.count();

return 2.0 * count / (bigrams1.size() + bigrams2.size());
return 2.0 * matches / (bigrams1.size() + bigrams2.size());
}

private static List<Bigram> allBigrams(List<String> words) {
private static List<Bigram> allSortedBigrams(List<String> words) {
return words.stream()
.flatMap(s -> toStream(s.toCharArray()))
.sorted()
.collect(toList());
}

Expand All @@ -57,7 +71,7 @@ private static List<String> normalizedWords(String text1, Locale locale) {
}


private static class Bigram {
private static class Bigram implements Comparable<Bigram> {
private final char first;
private final char second;

Expand Down Expand Up @@ -86,8 +100,20 @@ public char getSecond() {
&& Objects.equals(this.second, other.second);
}

public boolean lessThan(Bigram other) {
if (this.first < other.first) {return true;}
return this.first == other.first && this.second < other.second;
}

@Override public String toString() {
return String.format("[%c,%c]", first, second);
}

@Override public int compareTo(Bigram other) {
if (this == other || other == null) {return 0;}
if (this.first == other.first && this.second == other.second) {return 0;}
if (this.lessThan(other)) {return -1;}
return 1;
}
}
}
10 changes: 10 additions & 0 deletions core/src/test/java/apoc/text/SorensenDiceCoefficientTest.java
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
package apoc.text;

import org.junit.Ignore;
import org.junit.Test;

import static org.hamcrest.number.IsCloseTo.closeTo;
Expand Down Expand Up @@ -50,6 +51,15 @@ public void testScoreIsProperlyComputedWithCustomLanguageTag() {
assertThat(score, closeTo((expectedScore), 0.00001));
}

@Test
public void testScoreRepeatingCharactersCorrectly() {
double score = SorensenDiceCoefficient.compute("aa", "aaaaaa");
assertThat(score, closeTo(0.333333, 0.00001));

score = SorensenDiceCoefficient.compute("aaaaaa", "aa");
assertThat(score, closeTo(0.333333, 0.00001));
}

private int countOf(String... pairs) {
return pairs.length;
}
Expand Down

0 comments on commit 285c21b

Please sign in to comment.