From 285c21b42de342f052c196dd79e7fe3f3e4faed1 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Tue, 15 Jun 2021 16:37:18 +0200 Subject: [PATCH] Score repeated bigrams correctly, fixes #1959 (#1994) The current Sorensen-Dice coefficient algorithm does not correctly score strings with repeating bigrams. The score can end up being greater than 1 (the max possible score). This is because the algorithm does not consume bigrams as it matches them. The match count ends up being a count of the cartesian join of the matching bigrams. The revised algorithm in this change will consume bigrams as they are matched, preventing the cartesian join situation and providing correct scores. Co-authored-by: Tom Larsen --- .../apoc/text/SorensenDiceCoefficient.java | 44 +++++++++++++++---- .../text/SorensenDiceCoefficientTest.java | 10 +++++ 2 files changed, 45 insertions(+), 9 deletions(-) diff --git a/core/src/main/java/apoc/text/SorensenDiceCoefficient.java b/core/src/main/java/apoc/text/SorensenDiceCoefficient.java index 5783d890e1..22c458eef5 100644 --- a/core/src/main/java/apoc/text/SorensenDiceCoefficient.java +++ b/core/src/main/java/apoc/text/SorensenDiceCoefficient.java @@ -32,19 +32,33 @@ private static double compute(String input1, String input2, Locale locale) { return HIGHEST_SCORE; } + List bigrams1 = allSortedBigrams(words1); + List bigrams2 = allSortedBigrams(words2); + int index1 = 0, index2 = 0, matches = 0; + + while (index1 < bigrams1.size() && index2 < bigrams2.size()) { + Bigram bigram1 = bigrams1.get(index1); + Bigram bigram2 = bigrams2.get(index2); + if (bigram1.equals(bigram2)) { + matches++; + index1++; + index2++; + continue; + } + if (bigram1.lessThan(bigram2)) { + index1++; + continue; + } + index2++; + } - List bigrams1 = allBigrams(words1); - List bigrams2 = allBigrams(words2); - long count = bigrams2.stream() - .filter(bigrams1::contains) - .count(); - - return 2.0 * count / (bigrams1.size() + bigrams2.size()); + return 2.0 * matches / (bigrams1.size() + bigrams2.size()); } - private static List allBigrams(List words) { + private static List allSortedBigrams(List words) { return words.stream() .flatMap(s -> toStream(s.toCharArray())) + .sorted() .collect(toList()); } @@ -57,7 +71,7 @@ private static List normalizedWords(String text1, Locale locale) { } - private static class Bigram { + private static class Bigram implements Comparable { private final char first; private final char second; @@ -86,8 +100,20 @@ public char getSecond() { && Objects.equals(this.second, other.second); } + public boolean lessThan(Bigram other) { + if (this.first < other.first) {return true;} + return this.first == other.first && this.second < other.second; + } + @Override public String toString() { return String.format("[%c,%c]", first, second); } + + @Override public int compareTo(Bigram other) { + if (this == other || other == null) {return 0;} + if (this.first == other.first && this.second == other.second) {return 0;} + if (this.lessThan(other)) {return -1;} + return 1; + } } } \ No newline at end of file diff --git a/core/src/test/java/apoc/text/SorensenDiceCoefficientTest.java b/core/src/test/java/apoc/text/SorensenDiceCoefficientTest.java index 34bd496a66..3382b007ad 100644 --- a/core/src/test/java/apoc/text/SorensenDiceCoefficientTest.java +++ b/core/src/test/java/apoc/text/SorensenDiceCoefficientTest.java @@ -1,5 +1,6 @@ package apoc.text; +import org.junit.Ignore; import org.junit.Test; import static org.hamcrest.number.IsCloseTo.closeTo; @@ -50,6 +51,15 @@ public void testScoreIsProperlyComputedWithCustomLanguageTag() { assertThat(score, closeTo((expectedScore), 0.00001)); } + @Test + public void testScoreRepeatingCharactersCorrectly() { + double score = SorensenDiceCoefficient.compute("aa", "aaaaaa"); + assertThat(score, closeTo(0.333333, 0.00001)); + + score = SorensenDiceCoefficient.compute("aaaaaa", "aa"); + assertThat(score, closeTo(0.333333, 0.00001)); + } + private int countOf(String... pairs) { return pairs.length; }