Score repeated bigrams correctly, fixes #1959 (#1994)

The current Sorensen-Dice coefficient algorithm does not correctly score strings with repeating bigrams. The score can end up being greater than 1 (the max possible score). This is because the algorithm does not consume bigrams as it matches them. The match count ends up being a count of the cartesian join of the matching bigrams. The revised algorithm in this change will consume bigrams as they are matched, preventing the cartesian join situation and providing correct scores. Co-authored-by: Tom Larsen <[email protected]>
neo4j-contrib · Jun 15, 2021 · 285c21b · 285c21b
1 parent c2be0bc
commit 285c21b
Show file tree

Hide file tree

Showing 2 changed files with 45 additions and 9 deletions.
diff --git a/core/src/main/java/apoc/text/SorensenDiceCoefficient.java b/core/src/main/java/apoc/text/SorensenDiceCoefficient.java
@@ -32,19 +32,33 @@ private static double compute(String input1, String input2, Locale locale) {
       return HIGHEST_SCORE;
     }
 
+    List<Bigram> bigrams1 = allSortedBigrams(words1);
+    List<Bigram> bigrams2 = allSortedBigrams(words2);
+    int index1 = 0, index2 = 0, matches = 0;
+
+    while (index1 < bigrams1.size() && index2 < bigrams2.size()) {
+      Bigram bigram1 = bigrams1.get(index1);
+      Bigram bigram2 = bigrams2.get(index2);
+      if (bigram1.equals(bigram2)) {
+        matches++;
+        index1++;
+        index2++;
+        continue;
+      }
+      if (bigram1.lessThan(bigram2)) {
+        index1++;
+        continue;
+      }
+      index2++;
+    }
 
-    List<Bigram> bigrams1 = allBigrams(words1);
-    List<Bigram> bigrams2 = allBigrams(words2);
-    long count = bigrams2.stream()
-        .filter(bigrams1::contains)
-        .count();
-
-    return 2.0 * count / (bigrams1.size() + bigrams2.size());
+    return 2.0 * matches / (bigrams1.size() + bigrams2.size());
   }
 
-  private static List<Bigram> allBigrams(List<String> words) {
+  private static List<Bigram> allSortedBigrams(List<String> words) {
     return words.stream()
                 .flatMap(s -> toStream(s.toCharArray()))
+                .sorted()
                 .collect(toList());
   }
 
@@ -57,7 +71,7 @@ private static List<String> normalizedWords(String text1, Locale locale) {
   }
 
 
-  private static class Bigram {
+  private static class Bigram implements Comparable<Bigram> {
     private final char first;
     private final char second;
 
@@ -86,8 +100,20 @@ public char getSecond() {
              && Objects.equals(this.second, other.second);
     }
 
+    public boolean lessThan(Bigram other) {
+      if (this.first < other.first) {return true;}
+      return this.first == other.first && this.second < other.second;
+    }
+
     @Override public String toString() {
       return String.format("[%c,%c]", first, second);
     }
+
+    @Override public int compareTo(Bigram other) {
+      if (this == other || other == null) {return 0;}
+      if (this.first == other.first && this.second == other.second) {return 0;}
+      if (this.lessThan(other)) {return -1;}
+      return 1;
+    }
   }
 }
diff --git a/core/src/test/java/apoc/text/SorensenDiceCoefficientTest.java b/core/src/test/java/apoc/text/SorensenDiceCoefficientTest.java
@@ -1,5 +1,6 @@
 package apoc.text;
 
+import org.junit.Ignore;
 import org.junit.Test;
 
 import static org.hamcrest.number.IsCloseTo.closeTo;
@@ -50,6 +51,15 @@ public void testScoreIsProperlyComputedWithCustomLanguageTag() {
     assertThat(score, closeTo((expectedScore), 0.00001));
   }
 
+  @Test
+  public void testScoreRepeatingCharactersCorrectly() {
+    double score = SorensenDiceCoefficient.compute("aa", "aaaaaa");
+    assertThat(score, closeTo(0.333333, 0.00001));
+
+    score = SorensenDiceCoefficient.compute("aaaaaa", "aa");
+    assertThat(score, closeTo(0.333333, 0.00001));
+  }
+
   private int countOf(String... pairs) {
     return pairs.length;
   }