From 285c21b42de342f052c196dd79e7fe3f3e4faed1 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Tue, 15 Jun 2021 16:37:18 +0200
Subject: [PATCH] Score repeated bigrams correctly, fixes #1959 (#1994)

The current Sorensen-Dice coefficient algorithm does not correctly score strings with repeating bigrams: the score can end up greater than 1 (the maximum possible score). This happens because the algorithm does not consume bigrams as it matches them. The match count therefore over-counts: every bigram of the second input that occurs anywhere in the first is counted, regardless of how many times that bigram has already been matched (similar to a Cartesian join of the matching bigrams). The revised algorithm in this change consumes bigrams as they are matched, which prevents the over-counting and yields correct scores.

Co-authored-by: Tom Larsen <larsenthomasj@gmail.com>
---
 .../apoc/text/SorensenDiceCoefficient.java    | 44 +++++++++++++++----
 .../text/SorensenDiceCoefficientTest.java     | 10 +++++
 2 files changed, 45 insertions(+), 9 deletions(-)
diff --git a/core/src/main/java/apoc/text/SorensenDiceCoefficient.java b/core/src/main/java/apoc/text/SorensenDiceCoefficient.java
index 5783d890e1..22c458eef5 100644
--- a/core/src/main/java/apoc/text/SorensenDiceCoefficient.java
+++ b/core/src/main/java/apoc/text/SorensenDiceCoefficient.java
@@ -32,19 +32,33 @@ private static double compute(String input1, String input2, Locale locale) {
       return HIGHEST_SCORE;
     }
 
+    List<Bigram> bigrams1 = allSortedBigrams(words1);
+    List<Bigram> bigrams2 = allSortedBigrams(words2);
+    int index1 = 0, index2 = 0, matches = 0;
+
+    while (index1 < bigrams1.size() && index2 < bigrams2.size()) {
+      Bigram bigram1 = bigrams1.get(index1);
+      Bigram bigram2 = bigrams2.get(index2);
+      if (bigram1.equals(bigram2)) {
+        matches++;
+        index1++;
+        index2++;
+        continue;
+      }
+      if (bigram1.lessThan(bigram2)) {
+        index1++;
+        continue;
+      }
+      index2++;
+    }
 
-    List<Bigram> bigrams1 = allBigrams(words1);
-    List<Bigram> bigrams2 = allBigrams(words2);
-    long count = bigrams2.stream()
-        .filter(bigrams1::contains)
-        .count();
-
-    return 2.0 * count / (bigrams1.size() + bigrams2.size());
+    return 2.0 * matches / (bigrams1.size() + bigrams2.size());
   }
 
-  private static List<Bigram> allBigrams(List<String> words) {
+  private static List<Bigram> allSortedBigrams(List<String> words) {
     return words.stream()
                 .flatMap(s -> toStream(s.toCharArray()))
+                .sorted()
                 .collect(toList());
   }
 
@@ -57,7 +71,7 @@ private static List<String> normalizedWords(String text1, Locale locale) {
   }
 
 
-  private static class Bigram {
+  private static class Bigram implements Comparable<Bigram> {
     private final char first;
     private final char second;
 
@@ -86,8 +100,20 @@ public char getSecond() {
              && Objects.equals(this.second, other.second);
     }
 
+    public boolean lessThan(Bigram other) {
+      if (this.first < other.first) {return true;}
+      return this.first == other.first && this.second < other.second;
+    }
+
     @Override public String toString() {
       return String.format("[%c,%c]", first, second);
     }
+
+    @Override public int compareTo(Bigram other) {
+      if (this == other || other == null) {return 0;}
+      if (this.first == other.first && this.second == other.second) {return 0;}
+      if (this.lessThan(other)) {return -1;}
+      return 1;
+    }
   }
 }
\ No newline at end of file
diff --git a/core/src/test/java/apoc/text/SorensenDiceCoefficientTest.java b/core/src/test/java/apoc/text/SorensenDiceCoefficientTest.java
index 34bd496a66..3382b007ad 100644
--- a/core/src/test/java/apoc/text/SorensenDiceCoefficientTest.java
+++ b/core/src/test/java/apoc/text/SorensenDiceCoefficientTest.java
@@ -1,5 +1,6 @@
 package apoc.text;
 
+import org.junit.Ignore;
 import org.junit.Test;
 
 import static org.hamcrest.number.IsCloseTo.closeTo;
@@ -50,6 +51,15 @@ public void testScoreIsProperlyComputedWithCustomLanguageTag() {
     assertThat(score, closeTo((expectedScore), 0.00001));
   }
 
+  @Test
+  public void testScoreRepeatingCharactersCorrectly() {
+    double score = SorensenDiceCoefficient.compute("aa", "aaaaaa");
+    assertThat(score, closeTo(0.333333, 0.00001));
+
+    score = SorensenDiceCoefficient.compute("aaaaaa", "aa");
+    assertThat(score, closeTo(0.333333, 0.00001));
+  }
+
   private int countOf(String... pairs) {
     return pairs.length;
   }