From 814ec8884fb392d92c015d6080bd1e783a085876 Mon Sep 17 00:00:00 2001 From: zacharymorn Date: Thu, 18 Nov 2021 21:36:38 -0800 Subject: [PATCH 1/2] LUCENE-10236: Update field-weight used in CombinedFieldQuery scoring calculation (#444) (cherry picked from commit 07ee3ba83a4c9f3abc24bf9d3fbb3c3102c4a102) --- .../lucene/search/CombinedFieldQuery.java | 2 +- .../search/MultiNormsLeafSimScorer.java | 8 ++ .../lucene/search/TestCombinedFieldQuery.java | 78 +++++++++++++++++++ 3 files changed, 87 insertions(+), 1 deletion(-) diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/CombinedFieldQuery.java b/lucene/sandbox/src/java/org/apache/lucene/search/CombinedFieldQuery.java index 2d6bf93a6aac..651aa77c0179 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/CombinedFieldQuery.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/CombinedFieldQuery.java @@ -409,7 +409,7 @@ public Scorer scorer(LeafReaderContext context) throws IOException { } MultiNormsLeafSimScorer scoringSimScorer = - new MultiNormsLeafSimScorer(simWeight, context.reader(), fields, true); + new MultiNormsLeafSimScorer(simWeight, context.reader(), fieldAndWeights.values(), true); LeafSimScorer nonScoringSimScorer = new LeafSimScorer(simWeight, context.reader(), "pseudo_field", false); // we use termscorers + disjunction as an impl detail diff --git a/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java b/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java index 478360485300..1d652e9da6d3 100644 --- a/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java +++ b/lucene/sandbox/src/java/org/apache/lucene/search/MultiNormsLeafSimScorer.java @@ -21,8 +21,10 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collection; +import java.util.HashSet; import java.util.List; import java.util.Objects; +import java.util.Set; import org.apache.lucene.index.LeafReader; import 
org.apache.lucene.index.NumericDocValues; import org.apache.lucene.search.similarities.Similarity.SimScorer; @@ -59,7 +61,13 @@ final class MultiNormsLeafSimScorer { if (needsScores) { final List<NumericDocValues> normsList = new ArrayList<>(); final List<Float> weightList = new ArrayList<>(); + final Set<String> duplicateCheckingSet = new HashSet<>(); for (FieldAndWeight field : normFields) { + assert duplicateCheckingSet.add(field.field) + : "There is a duplicated field [" + + field.field + + "] used to construct MultiNormsLeafSimScorer"; + NumericDocValues norms = reader.getNormValues(field.field); if (norms != null) { normsList.add(norms); diff --git a/lucene/sandbox/src/test/org/apache/lucene/search/TestCombinedFieldQuery.java b/lucene/sandbox/src/test/org/apache/lucene/search/TestCombinedFieldQuery.java index 147f4475c941..2c7820e09178 100644 --- a/lucene/sandbox/src/test/org/apache/lucene/search/TestCombinedFieldQuery.java +++ b/lucene/sandbox/src/test/org/apache/lucene/search/TestCombinedFieldQuery.java @@ -16,6 +16,10 @@ */ package org.apache.lucene.search; +import static com.carrotsearch.randomizedtesting.RandomizedTest.atMost; +import static com.carrotsearch.randomizedtesting.RandomizedTest.randomBoolean; +import static com.carrotsearch.randomizedtesting.RandomizedTest.randomIntBetween; + import com.carrotsearch.randomizedtesting.generators.RandomPicks; import java.io.IOException; import java.util.Arrays; @@ -154,6 +158,80 @@ public void testSameScore() throws IOException { dir.close(); } + public void testScoringWithMultipleFieldTermsMatch() throws IOException { + int numMatchDoc = randomIntBetween(100, 500); + int numHits = Math.max(1, atMost(100)); // atMost may return 0; the collector requires numHits > 0 + int boost1 = Math.max(1, random().nextInt(5)); + int boost2 = Math.max(1, random().nextInt(5)); + + Directory dir = newDirectory(); + Similarity similarity = randomCompatibleSimilarity(); + + IndexWriterConfig iwc = new IndexWriterConfig(); + iwc.setSimilarity(similarity); + RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc); + + // 
adding potentially matching doc + for (int i = 0; i < numMatchDoc; i++) { + Document doc = new Document(); + + int freqA = random().nextInt(20) + 1; + for (int j = 0; j < freqA; j++) { + doc.add(new TextField("a", "foo", Store.NO)); + } + + freqA = random().nextInt(20) + 1; + if (randomBoolean()) { + for (int j = 0; j < freqA; j++) { + doc.add(new TextField("a", "foo" + j, Store.NO)); + } + } + + freqA = random().nextInt(20) + 1; + for (int j = 0; j < freqA; j++) { + doc.add(new TextField("a", "zoo", Store.NO)); + } + + int freqB = random().nextInt(20) + 1; + for (int j = 0; j < freqB; j++) { + doc.add(new TextField("b", "zoo", Store.NO)); + } + + freqB = random().nextInt(20) + 1; + if (randomBoolean()) { + for (int j = 0; j < freqB; j++) { + doc.add(new TextField("b", "zoo" + j, Store.NO)); + } + } + + int freqC = random().nextInt(20) + 1; + for (int j = 0; j < freqC; j++) { + doc.add(new TextField("c", "bla" + j, Store.NO)); + } + w.addDocument(doc); + } + + IndexReader reader = w.getReader(); + IndexSearcher searcher = newSearcher(reader); + searcher.setSimilarity(similarity); + + CombinedFieldQuery query = + new CombinedFieldQuery.Builder() + .addField("a", (float) boost1) + .addField("b", (float) boost2) + .addTerm(new BytesRef("foo")) + .addTerm(new BytesRef("zoo")) + .build(); + + TopScoreDocCollector completeCollector = + TopScoreDocCollector.create(numHits, null, Integer.MAX_VALUE); + searcher.search(query, completeCollector); + + reader.close(); + w.close(); + dir.close(); + } + public void testNormsDisabled() throws IOException { Directory dir = newDirectory(); Similarity similarity = randomCompatibleSimilarity(); From 614cdf6a7698bf8a4b7d1d1983cdf054d22dcbb8 Mon Sep 17 00:00:00 2001 From: Zach Chen Date: Wed, 5 Jan 2022 21:58:47 -0800 Subject: [PATCH 2/2] LUCENE-10236: add change entry to 8.11.2 --- lucene/CHANGES.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 
40a5f82c6a6c..38efba41621a 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -7,7 +7,9 @@ http://s.apache.org/luceneversions Bug Fixes --------------------- -(No changes) + +* LUCENE-10236: Stop duplicating norms when scoring in CombinedFieldQuery. + (Zach Chen, Jim Ferenczi, Julie Tibshirani) ======================= Lucene 8.11.1 =======================