From fb14fb200d38ba9a5d763e88099376c78ae48868 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Tue, 17 Dec 2019 15:43:11 +0000 Subject: [PATCH 1/6] Add IntervalIterator.width(); change Ordered to use width to calculate gaps; IS.matches() now returns IntervalMatchesIterator to propagate width correctly --- .../intervals/CachingMatchesIterator.java | 13 +- .../intervals/ConjunctionIntervalsSource.java | 24 +- .../intervals/DifferenceIntervalsSource.java | 6 +- .../intervals/DisjunctionIntervalsSource.java | 96 ++++- .../intervals/ExtendedIntervalsSource.java | 4 +- .../intervals/FilteredIntervalsSource.java | 4 +- .../intervals/FixedFieldIntervalsSource.java | 2 +- .../queries/intervals/IntervalIterator.java | 4 + .../queries/intervals/IntervalMatches.java | 37 +- .../intervals/IntervalMatchesIterator.java | 2 + .../queries/intervals/IntervalsSource.java | 2 +- .../MinimizingConjunctionMatchesIterator.java | 7 +- .../MinimumShouldMatchIntervalsSource.java | 9 +- .../intervals/MultiTermIntervalsSource.java | 54 ++- .../intervals/OffsetIntervalsSource.java | 4 +- .../intervals/OrderedIntervalsSource.java | 327 +++++++++++++++++- .../PayloadFilteredTermIntervalsSource.java | 16 +- .../intervals/TermIntervalsSource.java | 16 +- .../queries/intervals/TestIntervals.java | 38 +- 19 files changed, 607 insertions(+), 58 deletions(-) diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/CachingMatchesIterator.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/CachingMatchesIterator.java index b27224492eda..cedf955351bb 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/CachingMatchesIterator.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/CachingMatchesIterator.java @@ -24,14 +24,14 @@ import org.apache.lucene.search.Query; import org.apache.lucene.util.ArrayUtil; -class CachingMatchesIterator extends FilterMatchesIterator { +class CachingMatchesIterator extends FilterMatchesIterator implements 
IntervalMatchesIterator { private boolean positioned = false; private int[] posAndOffsets = new int[4*4]; private Query[] matchingQueries = new Query[4]; private int count = 0; - CachingMatchesIterator(MatchesIterator in) { + CachingMatchesIterator(IntervalMatchesIterator in) { super(in); } @@ -133,4 +133,13 @@ public Query getQuery() { }; } + @Override + public int gaps() { + return ((IntervalMatchesIterator)in).gaps(); + } + + @Override + public int width() { + return ((IntervalMatchesIterator)in).width(); + } } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/ConjunctionIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/ConjunctionIntervalsSource.java index bdfb55c9b541..1cd2c95dd8fb 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/ConjunctionIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/ConjunctionIntervalsSource.java @@ -65,10 +65,10 @@ public final IntervalIterator intervals(String field, LeafReaderContext ctx) thr protected abstract IntervalIterator combine(List iterators); @Override - public final MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { - List subs = new ArrayList<>(); + public final IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { + List subs = new ArrayList<>(); for (IntervalsSource source : subSources) { - MatchesIterator mi = source.matches(field, ctx, doc); + IntervalMatchesIterator mi = source.matches(field, ctx, doc); if (mi == null) { return null; } @@ -87,13 +87,13 @@ public final MatchesIterator matches(String field, LeafReaderContext ctx, int do return isMinimizing ? 
new MinimizingConjunctionMatchesIterator(it, subs) : new ConjunctionMatchesIterator(it, subs); } - private static class ConjunctionMatchesIterator implements MatchesIterator { + private static class ConjunctionMatchesIterator implements IntervalMatchesIterator { final IntervalIterator iterator; - final List subs; + final List subs; boolean cached = true; - private ConjunctionMatchesIterator(IntervalIterator iterator, List subs) { + private ConjunctionMatchesIterator(IntervalIterator iterator, List subs) { this.iterator = iterator; this.subs = subs; } @@ -152,9 +152,19 @@ public MatchesIterator getSubMatches() throws IOException { public Query getQuery() { throw new UnsupportedOperationException(); } + + @Override + public int gaps() { + return iterator.gaps(); + } + + @Override + public int width() { + return iterator.width(); + } } - private static class SingletonMatchesIterator extends FilterMatchesIterator { + static class SingletonMatchesIterator extends FilterMatchesIterator { boolean exhausted = false; diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/DifferenceIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/DifferenceIntervalsSource.java index 7ca3cb78a1e5..ad13667c94e5 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/DifferenceIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/DifferenceIntervalsSource.java @@ -48,12 +48,12 @@ public final IntervalIterator intervals(String field, LeafReaderContext ctx) thr } @Override - public final MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { - MatchesIterator minIt = minuend.matches(field, ctx, doc); + public final IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { + IntervalMatchesIterator minIt = minuend.matches(field, ctx, doc); if (minIt == null) { return null; } - MatchesIterator subIt = 
subtrahend.matches(field, ctx, doc); + IntervalMatchesIterator subIt = subtrahend.matches(field, ctx, doc); if (subIt == null) { return minIt; } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/DisjunctionIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/DisjunctionIntervalsSource.java index 67d6f6fbf6b2..9e74838648f1 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/DisjunctionIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/DisjunctionIntervalsSource.java @@ -82,15 +82,24 @@ public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IO } @Override - public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { - List subMatches = new ArrayList<>(); + public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { + List subMatches = new ArrayList<>(); for (IntervalsSource subSource : subSources) { - MatchesIterator mi = subSource.matches(field, ctx, doc); + IntervalMatchesIterator mi = subSource.matches(field, ctx, doc); if (mi != null) { subMatches.add(mi); } } - return MatchesUtils.disjunction(subMatches); + if (subMatches.size() == 0) { + return null; + } + DisjunctionIntervalIterator it = new DisjunctionIntervalIterator( + subMatches.stream().map(m -> IntervalMatches.wrapMatches(m, doc)).collect(Collectors.toList()) + ); + if (it.advance(doc) != doc) { + return null; + } + return new DisjunctionMatchesIterator(it, subMatches); } @Override @@ -196,6 +205,21 @@ private void reset() throws IOException { current = EMPTY; } + int currentOrd() { + if (current == EMPTY) { + return -1; + } + if (current == EXHAUSTED) { + return NO_MORE_INTERVALS; + } + for (int i = 0; i < iterators.size(); i++) { + if (iterators.get(i) == current) { + return i; + } + } + throw new IllegalStateException(); + } + @Override public int nextInterval() throws IOException { if 
(current == EMPTY || current == EXHAUSTED) { @@ -344,4 +368,68 @@ public float matchCost() { } }; + private static class DisjunctionMatchesIterator implements IntervalMatchesIterator { + + final DisjunctionIntervalIterator it; + final List subs; + + private DisjunctionMatchesIterator(DisjunctionIntervalIterator it, List subs) { + this.it = it; + this.subs = subs; + } + + @Override + public boolean next() throws IOException { + return it.nextInterval() != IntervalIterator.NO_MORE_INTERVALS; + } + + @Override + public int startPosition() { + return it.start(); + } + + @Override + public int endPosition() { + return it.end(); + } + + @Override + public int startOffset() throws IOException { + int ord = it.currentOrd(); + assert ord != -1 && ord != IntervalIterator.NO_MORE_INTERVALS; + return subs.get(ord).startOffset(); + } + + @Override + public int endOffset() throws IOException { + int ord = it.currentOrd(); + assert ord != -1 && ord != IntervalIterator.NO_MORE_INTERVALS; + return subs.get(ord).endOffset(); + } + + @Override + public MatchesIterator getSubMatches() throws IOException { + int ord = it.currentOrd(); + assert ord != -1 && ord != IntervalIterator.NO_MORE_INTERVALS; + return subs.get(ord).getSubMatches(); + } + + @Override + public Query getQuery() { + int ord = it.currentOrd(); + assert ord != -1 && ord != IntervalIterator.NO_MORE_INTERVALS; + return subs.get(ord).getQuery(); + } + + @Override + public int gaps() { + return it.gaps(); + } + + @Override + public int width() { + return it.width(); + } + } + } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/ExtendedIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/ExtendedIntervalsSource.java index 00eb86680581..4a67aa14a72e 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/ExtendedIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/ExtendedIntervalsSource.java @@ -49,8 +49,8 @@ public 
IntervalIterator intervals(String field, LeafReaderContext ctx) throws IO } @Override - public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { - MatchesIterator in = source.matches(field, ctx, doc); + public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { + IntervalMatchesIterator in = source.matches(field, ctx, doc); if (in == null) { return null; } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/FilteredIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/FilteredIntervalsSource.java index 05ecc4c143fa..3118095af4ca 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/FilteredIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/FilteredIntervalsSource.java @@ -108,8 +108,8 @@ protected boolean accept() { } @Override - public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { - MatchesIterator mi = in.matches(field, ctx, doc); + public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { + IntervalMatchesIterator mi = in.matches(field, ctx, doc); if (mi == null) { return null; } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/FixedFieldIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/FixedFieldIntervalsSource.java index 94cba9a2b827..c8ce33a36a9a 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/FixedFieldIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/FixedFieldIntervalsSource.java @@ -43,7 +43,7 @@ public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IO } @Override - public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { + public IntervalMatchesIterator matches(String field, LeafReaderContext 
ctx, int doc) throws IOException { return source.matches(this.field, ctx, doc); } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalIterator.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalIterator.java index b97ce05231e7..a1d8da4d4011 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalIterator.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalIterator.java @@ -72,6 +72,10 @@ public abstract class IntervalIterator extends DocIdSetIterator { */ public abstract int gaps(); + public int width() { + return end() - start() + 1; + } + /** * Advance the iterator to the next interval * diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalMatches.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalMatches.java index 2bc10f66531b..99c79c302300 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalMatches.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalMatches.java @@ -24,7 +24,7 @@ final class IntervalMatches { - static MatchesIterator asMatches(IntervalIterator iterator, MatchesIterator source, int doc) throws IOException { + static IntervalMatchesIterator asMatches(IntervalIterator iterator, IntervalMatchesIterator source, int doc) throws IOException { if (source == null) { return null; } @@ -34,7 +34,7 @@ static MatchesIterator asMatches(IntervalIterator iterator, MatchesIterator sour if (iterator.nextInterval() == IntervalIterator.NO_MORE_INTERVALS) { return null; } - return new MatchesIterator() { + return new IntervalMatchesIterator() { boolean cached = true; @@ -67,6 +67,16 @@ public int endOffset() throws IOException { return source.endOffset(); } + @Override + public int gaps() { + return iterator.gaps(); + } + + @Override + public int width() { + return iterator.width(); + } + @Override public MatchesIterator getSubMatches() throws IOException 
{ return source.getSubMatches(); @@ -79,21 +89,27 @@ public Query getQuery() { }; } - enum State { UNPOSITIONED, ITERATING, EXHAUSTED } + enum State { UNPOSITIONED, ITERATING, NO_MORE_INTERVALS, EXHAUSTED } - static IntervalIterator wrapMatches(MatchesIterator mi, int doc) { + static IntervalIterator wrapMatches(IntervalMatchesIterator mi, int doc) { return new IntervalIterator() { State state = State.UNPOSITIONED; @Override public int start() { + if (state == State.NO_MORE_INTERVALS) { + return NO_MORE_INTERVALS; + } assert state == State.ITERATING; return mi.startPosition(); } @Override public int end() { + if (state == State.NO_MORE_INTERVALS) { + return NO_MORE_INTERVALS; + } assert state == State.ITERATING; return mi.endPosition(); } @@ -101,10 +117,13 @@ public int end() { @Override public int gaps() { assert state == State.ITERATING; - if (mi instanceof IntervalMatchesIterator) { - return ((IntervalMatchesIterator)mi).gaps(); - } - return 0; + return mi.gaps(); + } + + @Override + public int width() { + assert state == State.ITERATING; + return mi.width(); } @Override @@ -113,6 +132,7 @@ public int nextInterval() throws IOException { if (mi.next()) { return mi.startPosition(); } + state = State.NO_MORE_INTERVALS; return NO_MORE_INTERVALS; } @@ -127,6 +147,7 @@ public int docID() { case UNPOSITIONED: return -1; case ITERATING: + case NO_MORE_INTERVALS: return doc; case EXHAUSTED: } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalMatchesIterator.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalMatchesIterator.java index 2c3d31d81c3d..06f057515181 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalMatchesIterator.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalMatchesIterator.java @@ -35,4 +35,6 @@ interface IntervalMatchesIterator extends MatchesIterator { */ int gaps(); + int width(); + } diff --git 
a/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalsSource.java index ae23978f1c62..f1c89d3faff4 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalsSource.java @@ -53,7 +53,7 @@ public abstract class IntervalsSource { * @param ctx the document's context * @param doc the document to return matches for */ - public abstract MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException; + public abstract IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException; /** * Expert: visit the tree of sources diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/MinimizingConjunctionMatchesIterator.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/MinimizingConjunctionMatchesIterator.java index 2a7490c996de..4477991315a4 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/MinimizingConjunctionMatchesIterator.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/MinimizingConjunctionMatchesIterator.java @@ -31,7 +31,7 @@ class MinimizingConjunctionMatchesIterator implements IntervalMatchesIterator { private final List subs = new ArrayList<>(); private boolean cached = true; - MinimizingConjunctionMatchesIterator(IntervalIterator iterator, List subs) { + MinimizingConjunctionMatchesIterator(IntervalIterator iterator, List subs) { this.iterator = iterator; for (MatchesIterator mi : subs) { assert mi instanceof CachingMatchesIterator; @@ -83,6 +83,11 @@ public int gaps() { return iterator.gaps(); } + @Override + public int width() { + return iterator.width(); + } + @Override public MatchesIterator getSubMatches() throws IOException { List mis = new ArrayList<>(); diff --git 
a/lucene/queries/src/java/org/apache/lucene/queries/intervals/MinimumShouldMatchIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/MinimumShouldMatchIntervalsSource.java index 4ceb8f23b6d4..08e80f034baa 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/MinimumShouldMatchIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/MinimumShouldMatchIntervalsSource.java @@ -63,10 +63,10 @@ public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IO } @Override - public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { + public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { Map lookup = new IdentityHashMap<>(); for (IntervalsSource source : sources) { - MatchesIterator mi = source.matches(field, ctx, doc); + IntervalMatchesIterator mi = source.matches(field, ctx, doc); if (mi != null) { CachingMatchesIterator cmi = new CachingMatchesIterator(mi); lookup.put(IntervalMatches.wrapMatches(cmi, doc), cmi); @@ -389,6 +389,11 @@ public int gaps() { return iterator.gaps(); } + @Override + public int width() { + return iterator.width(); + } + @Override public MatchesIterator getSubMatches() throws IOException { List mis = new ArrayList<>(); diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/MultiTermIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/MultiTermIntervalsSource.java index 589f9c6e3fc2..ff895397c2dc 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/MultiTermIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/MultiTermIntervalsSource.java @@ -30,6 +30,7 @@ import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchesIterator; import org.apache.lucene.search.MatchesUtils; +import org.apache.lucene.search.Query; import 
org.apache.lucene.search.QueryVisitor; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.automaton.CompiledAutomaton; @@ -73,7 +74,7 @@ public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IO } @Override - public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { + public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { Terms terms = ctx.reader().terms(field); if (terms == null) { return null; @@ -91,7 +92,56 @@ public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) thr } } } - return MatchesUtils.disjunction(subMatches); + MatchesIterator mi = MatchesUtils.disjunction(subMatches); + if (mi == null) { + return null; + } + return new IntervalMatchesIterator() { + @Override + public int gaps() { + return 0; + } + + @Override + public int width() { + return 1; + } + + @Override + public boolean next() throws IOException { + return mi.next(); + } + + @Override + public int startPosition() { + return mi.startPosition(); + } + + @Override + public int endPosition() { + return mi.endPosition(); + } + + @Override + public int startOffset() throws IOException { + return mi.startOffset(); + } + + @Override + public int endOffset() throws IOException { + return mi.endOffset(); + } + + @Override + public MatchesIterator getSubMatches() throws IOException { + return mi.getSubMatches(); + } + + @Override + public Query getQuery() { + return mi.getQuery(); + } + }; } @Override diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OffsetIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OffsetIntervalsSource.java index 62e56673efd2..370d59b532a4 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OffsetIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OffsetIntervalsSource.java @@ -137,8 +137,8 @@ public long cost() { } 
@Override - public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { - MatchesIterator mi = in.matches(field, ctx, doc); + public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { + IntervalMatchesIterator mi = in.matches(field, ctx, doc); if (mi == null) { return null; } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java index e736a59be59f..90a82c537f0b 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java @@ -19,18 +19,30 @@ import java.io.IOException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collection; +import java.util.Collections; import java.util.List; import java.util.Objects; import java.util.stream.Collectors; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.MatchesIterator; +import org.apache.lucene.search.MatchesUtils; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.QueryVisitor; + class OrderedIntervalsSource extends ConjunctionIntervalsSource { static IntervalsSource build(List sources) { if (sources.size() == 1) { return sources.get(0); } - return new OrderedIntervalsSource(flatten(sources)); + List rewritten = deduplicate(flatten(sources)); + if (rewritten.size() == 1) { + return rewritten.get(0); + } + return new OrderedIntervalsSource(rewritten); } private static List flatten(List sources) { @@ -46,6 +58,32 @@ private static List flatten(List sources) { return flattened; } + private static List deduplicate(List sources) { + List deduplicated = new ArrayList<>(); + List current = new ArrayList<>(); + for (IntervalsSource source : sources) { + if (current.size() == 0 || 
current.get(0).equals(source)) { + current.add(source); + } + else { + if (current.size() == 1) { + deduplicated.add(current.get(0)); + } + else { + deduplicated.add(new DuplicateIntervalsSource(current.get(0), current.size())); + } + current.clear(); + current.add(source); + } + } + if (current.size() == 1) { + deduplicated.add(current.get(0)); + } else { + deduplicated.add(new DuplicateIntervalsSource(current.get(0), current.size())); + } + return deduplicated; + } + private OrderedIntervalsSource(List sources) { super(sources, true); } @@ -89,7 +127,7 @@ public String toString() { private static class OrderedIntervalIterator extends ConjunctionIntervalIterator { int start = -1, end = -1, i; - int firstEnd; + int slop; private OrderedIntervalIterator(List subIntervals) { super(subIntervals); @@ -107,17 +145,17 @@ public int end() { @Override public int nextInterval() throws IOException { - start = end = IntervalIterator.NO_MORE_INTERVALS; - int b = Integer.MAX_VALUE; + start = end = slop = IntervalIterator.NO_MORE_INTERVALS; + int lastStart = Integer.MAX_VALUE; i = 1; while (true) { while (true) { - if (subIterators.get(i - 1).end() >= b) + if (subIterators.get(i - 1).end() >= lastStart) return start; if (i == subIterators.size() || subIterators.get(i).start() > subIterators.get(i - 1).end()) break; do { - if (subIterators.get(i).end() >= b || subIterators.get(i).nextInterval() == IntervalIterator.NO_MORE_INTERVALS) + if (subIterators.get(i).end() >= lastStart || subIterators.get(i).nextInterval() == IntervalIterator.NO_MORE_INTERVALS) return start; } while (subIterators.get(i).start() <= subIterators.get(i - 1).end()); @@ -127,9 +165,12 @@ public int nextInterval() throws IOException { if (start == NO_MORE_INTERVALS) { return end = NO_MORE_INTERVALS; } - firstEnd = subIterators.get(0).end(); end = subIterators.get(subIterators.size() - 1).end(); - b = subIterators.get(subIterators.size() - 1).start(); + slop = end - start + 1; + for (IntervalIterator subIterator 
: subIterators) { + slop -= subIterator.width(); + } + lastStart = subIterators.get(subIterators.size() - 1).start(); i = 1; if (subIterators.get(0).nextInterval() == IntervalIterator.NO_MORE_INTERVALS) return start; @@ -138,18 +179,276 @@ public int nextInterval() throws IOException { @Override public int gaps() { - int gaps = subIterators.get(1).start() - firstEnd - 1; - for (int i = 2; i < subIterators.size(); i++) { - gaps += (subIterators.get(i).start() - subIterators.get(i - 1).end() - 1); - } - return gaps; + return slop; } @Override protected void reset() throws IOException { subIterators.get(0).nextInterval(); i = 1; - start = end = firstEnd = -1; + start = end = slop = -1; + } + } + + private static class DuplicateIntervalsSource extends IntervalsSource { + + final IntervalsSource in; + final int childCount; + + private DuplicateIntervalsSource(IntervalsSource in, int childCount) { + this.in = in; + this.childCount = childCount; + } + + @Override + public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException { + IntervalIterator it = in.intervals(field, ctx); + if (it == null) { + return null; + } + return new DuplicateIntervalIterator(it, childCount); + } + + @Override + public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { + List subs = new ArrayList<>(); + for (int i = 0; i < childCount; i++) { + IntervalMatchesIterator mi = in.matches(field, ctx, doc); + if (mi == null) { + return null; + } + subs.add(mi); + } + return DuplicateMatchesIterator.build(subs); + } + + @Override + public void visit(String field, QueryVisitor visitor) { + in.visit(field, visitor); + } + + @Override + public int minExtent() { + return in.minExtent(); + } + + @Override + public Collection pullUpDisjunctions() { + return Collections.singleton(this); + } + + @Override + public int hashCode() { + return Objects.hash(in, childCount); + } + + @Override + public boolean equals(Object other) { + if 
(other instanceof DuplicateIntervalsSource == false) return false; + DuplicateIntervalsSource o = (DuplicateIntervalsSource) other; + return Objects.equals(this.in, o.in) && Objects.equals(this.childCount, o.childCount); + } + + @Override + public String toString() { + String s = in.toString(); + StringBuilder out = new StringBuilder(s); + for (int i = 1; i < childCount; i++) { + out.append(",").append(s); + } + return out.toString(); + } + } + + private static class DuplicateIntervalIterator extends IntervalIterator { + + private final IntervalIterator in; + final int[] cache; + final int cacheLength; + int cacheBase; + boolean started = false; + boolean exhausted = false; + + private DuplicateIntervalIterator(IntervalIterator primary, int copies) { + this.in = primary; + this.cacheLength = copies; + this.cache = new int[this.cacheLength * 2]; + } + + @Override + public int start() { + return exhausted ? NO_MORE_INTERVALS : cache[(cacheBase % cacheLength) * 2]; + } + + @Override + public int end() { + return exhausted ? 
NO_MORE_INTERVALS : cache[(((cacheBase + cacheLength - 1) % cacheLength) * 2) + 1];
+    }
+
+    @Override
+    public int width() { // combined width of all cached copies: sum of (end - start + 1) over every slot
+      int width = 0;
+      for (int i = 0; i < cacheLength; i++) {
+        int pos = (cacheBase + i) % cacheLength;
+        width += cache[pos * 2 + 1] - cache[pos * 2] + 1; // slot layout is [start, end], as written by cacheNextInterval
+      }
+      return width;
+    }
+
+    @Override
+    public int gaps() {
+      throw new UnsupportedOperationException();
+    }
+
+    @Override
+    public int nextInterval() throws IOException {
+      if (exhausted) {
+        return NO_MORE_INTERVALS;
+      }
+      if (started == false) { // first call on this document: fill every slot of the circular cache
+        for (int i = 0; i < cacheLength; i++) {
+          if (cacheNextInterval(i) == NO_MORE_INTERVALS) {
+            return NO_MORE_INTERVALS;
+          }
+        }
+        cacheBase = 0;
+        started = true;
+        return start();
+      }
+      else {
+        int insert = (cacheBase + cacheLength) % cacheLength; // same index as cacheBase: the oldest slot is overwritten
+        cacheBase = (cacheBase + 1) % cacheLength;
+        return cacheNextInterval(insert);
+      }
+    }
+
+    private int cacheNextInterval(int linePos) throws IOException { // pulls one interval from 'in' into slot linePos
+      if (in.nextInterval() == NO_MORE_INTERVALS) {
+        exhausted = true;
+        return NO_MORE_INTERVALS;
+      }
+      cache[linePos * 2] = in.start();
+      cache[linePos * 2 + 1] = in.end();
+      return start();
+    }
+
+    @Override
+    public float matchCost() {
+      return in.matchCost();
+    }
+
+    @Override
+    public int docID() {
+      return in.docID();
+    }
+
+    @Override
+    public int nextDoc() throws IOException {
+      started = exhausted = false;
+      Arrays.fill(cache, -1); // discard positions cached for the previous document
+      return in.nextDoc();
+    }
+
+    @Override
+    public int advance(int target) throws IOException {
+      started = exhausted = false;
+      Arrays.fill(cache, -1); // discard positions cached for the previous document
+      return in.advance(target);
+    }
+
+    @Override
+    public long cost() {
+      return in.cost();
+    }
+  }
+
+  private static class DuplicateMatchesIterator implements IntervalMatchesIterator {
+
+    List subs;
+    boolean cached = false;
+
+    static IntervalMatchesIterator build(List subs) throws IOException {
+      int count = subs.size();
+      while (count > 0) {
+        for (int i = 0; i < count; i++) {
+          if (subs.get(count - 1).next() == false) {
+            return null;
+          }
+        }
+        count--;
+      }
+
return new DuplicateMatchesIterator(subs); + } + + private DuplicateMatchesIterator(List subs) throws IOException { + this.subs = subs; + } + + @Override + public boolean next() throws IOException { + if (cached == false) { + return cached = true; + } + if (subs.get(subs.size() - 1).next() == false) { + return false; + } + for (int i = 0; i < subs.size() - 1; i++) { + subs.get(i).next(); + } + return true; + } + + @Override + public int startPosition() { + return subs.get(0).startPosition(); + } + + @Override + public int endPosition() { + return subs.get(subs.size() - 1).endPosition(); + } + + @Override + public int startOffset() throws IOException { + return subs.get(0).startOffset(); + } + + @Override + public int endOffset() throws IOException { + return subs.get(subs.size() - 1).endOffset(); + } + + @Override + public MatchesIterator getSubMatches() throws IOException { + List subMatches = new ArrayList<>(); + for (MatchesIterator mi : subs) { + MatchesIterator sub = mi.getSubMatches(); + if (sub == null) { + sub = new SingletonMatchesIterator(mi); + } + subMatches.add(sub); + } + return MatchesUtils.disjunction(subMatches); + } + + @Override + public Query getQuery() { + throw new UnsupportedOperationException(); + } + + @Override + public int gaps() { + return 0; + } + + @Override + public int width() { + int width = endPosition() - startPosition() + 1; + for (MatchesIterator mi : subs) { + width = width - (mi.endPosition() - mi.startPosition() + 1); + } + return width; } } + } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/PayloadFilteredTermIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/PayloadFilteredTermIntervalsSource.java index 25e4da5d9c4a..1ec20e013063 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/PayloadFilteredTermIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/PayloadFilteredTermIntervalsSource.java @@ -143,7 +143,7 @@ 
public String toString() { } @Override - public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { + public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { Terms terms = ctx.reader().terms(field); if (terms == null) return null; @@ -165,12 +165,22 @@ public void visit(String field, QueryVisitor visitor) { visitor.consumeTerms(new IntervalQuery(field, this), new Term(field, term)); } - private MatchesIterator matches(TermsEnum te, int doc) throws IOException { + private IntervalMatchesIterator matches(TermsEnum te, int doc) throws IOException { PostingsEnum pe = te.postings(null, PostingsEnum.ALL); if (pe.advance(doc) != doc) { return null; } - return new MatchesIterator() { + return new IntervalMatchesIterator() { + + @Override + public int gaps() { + return 0; + } + + @Override + public int width() { + return 1; + } int upto = pe.freq(); int pos = -1; diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java index ac98b255c77e..16c886b88d0f 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/TermIntervalsSource.java @@ -139,7 +139,7 @@ public String toString() { } @Override - public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { + public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { Terms terms = ctx.reader().terms(field); if (terms == null) return null; @@ -153,13 +153,23 @@ public MatchesIterator matches(String field, LeafReaderContext ctx, int doc) thr return matches(te, doc, field); } - static MatchesIterator matches(TermsEnum te, int doc, String field) throws IOException { + static IntervalMatchesIterator matches(TermsEnum te, int doc, 
String field) throws IOException { TermQuery query = new TermQuery(new Term(field, te.term())); PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS); if (pe.advance(doc) != doc) { return null; } - return new MatchesIterator() { + return new IntervalMatchesIterator() { + + @Override + public int gaps() { + return 0; + } + + @Override + public int width() { + return 1; + } int upto = pe.freq(); int pos = -1; diff --git a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java index f98b3195b451..194576662de5 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java @@ -120,6 +120,7 @@ private void checkIntervals(IntervalsSource source, String field, int expectedMa continue; for (int doc = 0; doc < ctx.reader().maxDoc(); doc++) { ids.advance(doc); + MatchesIterator mi = source.matches(field, ctx, doc); int id = (int) ids.longValue(); if (intervals.docID() == doc || (intervals.docID() < doc && intervals.advance(doc) == doc)) { @@ -134,15 +135,23 @@ private void checkIntervals(IntervalsSource source, String field, int expectedMa assertEquals("start() != pos returned from nextInterval()", expected[id][i], intervals.start()); assertEquals("Wrong end value in doc " + id, expected[id][i + 1], intervals.end()); i += 2; + assertTrue(mi.next()); + assertEquals(source + ": wrong start value in match in doc " + id, intervals.start(), mi.startPosition()); + assertEquals(source + ": wrong end value in match in doc " + id, intervals.end(), mi.endPosition()); } assertEquals(source + ": wrong number of endpoints in doc " + id, expected[id].length, i); assertEquals(IntervalIterator.NO_MORE_INTERVALS, intervals.start()); assertEquals(IntervalIterator.NO_MORE_INTERVALS, intervals.end()); - if (i > 0) + if (i > 0) { matchedDocs++; + assertFalse(mi.next()); + } 
else { + assertNull("Expected null matches iterator on doc " + id, mi); + } } else { assertEquals(0, expected[id].length); + assertNull(mi); } } } @@ -272,6 +281,24 @@ public void testOrderedNearIntervals() throws IOException { checkVisits(source, 3, "pease", "hot"); } + public void testOrderedNearWithDuplicates() throws IOException { + IntervalsSource source = Intervals.ordered(Intervals.term("pease"), Intervals.term("pease"), Intervals.term("porridge")); + checkIntervals(source, "field1", 3, new int[][]{ + {}, { 0, 4, 3, 7 }, { 0, 4, 3, 7 }, {}, { 0, 4, 3, 7 }, {} + }); + assertGaps(source, 1, "field1", new int[]{ 2, 2 }); + + MatchesIterator mi = getMatches(source, 1, "field1"); + assertMatch(mi, 0, 4, 0, 34); + MatchesIterator sub = mi.getSubMatches(); + assertNotNull(sub); + assertMatch(sub, 0, 0, 0, 5); + assertMatch(sub, 3, 3, 20, 25); + assertMatch(sub, 4, 4, 26, 34); + assertMatch(mi, 3, 7, 20, 55); + assertFalse(mi.next()); + } + public void testPhraseIntervals() throws IOException { IntervalsSource source = Intervals.phrase("pease", "porridge"); checkIntervals(source, "field1", 3, new int[][]{ @@ -624,6 +651,15 @@ public void testMaxGaps() throws IOException { } + public void testMaxGapsWithRepeats() throws IOException { + IntervalsSource source = Intervals.maxgaps(11, + Intervals.ordered(Intervals.term("pease"), Intervals.term("pease"), Intervals.term("hot"))); + checkIntervals(source, "field1", 1, new int[][]{ + {}, {}, { 0, 5 }, {}, {}, {} + }); + assertGaps(source, 2, "field1", new int[]{ 3 }); + } + public void testNestedMaxGaps() throws IOException { IntervalsSource source = Intervals.maxgaps(1, Intervals.unordered( From aa0bdcc279665998bedd516bd0e41f4f032af649 Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Wed, 18 Dec 2019 09:20:43 +0000 Subject: [PATCH 2/6] handle duplicates in unordered query --- .../intervals/DuplicateIntervalsSource.java | 301 ++++++++++++++++++ .../intervals/OrderedIntervalsSource.java | 277 +--------------- 
.../intervals/UnorderedIntervalsSource.java | 42 +-- .../queries/intervals/TestIntervals.java | 22 ++ 4 files changed, 346 insertions(+), 296 deletions(-) create mode 100644 lucene/queries/src/java/org/apache/lucene/queries/intervals/DuplicateIntervalsSource.java diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/DuplicateIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/DuplicateIntervalsSource.java new file mode 100644 index 000000000000..1244f5262eea --- /dev/null +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/DuplicateIntervalsSource.java @@ -0,0 +1,301 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.lucene.queries.intervals; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.List; +import java.util.Objects; + +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.search.MatchesIterator; +import org.apache.lucene.search.MatchesUtils; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.QueryVisitor; + +class DuplicateIntervalsSource extends IntervalsSource { + + static IntervalsSource build(IntervalsSource in, int childCount) { + if (childCount == 1) { + return in; + } + assert childCount > 0; + return new DuplicateIntervalsSource(in, childCount); + } + + final IntervalsSource in; + final int childCount; + + private DuplicateIntervalsSource(IntervalsSource in, int childCount) { + this.in = in; + this.childCount = childCount; + } + + @Override + public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException { + IntervalIterator it = in.intervals(field, ctx); + if (it == null) { + return null; + } + return new DuplicateIntervalIterator(it, childCount); + } + + @Override + public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { + List subs = new ArrayList<>(); + for (int i = 0; i < childCount; i++) { + IntervalMatchesIterator mi = in.matches(field, ctx, doc); + if (mi == null) { + return null; + } + subs.add(mi); + } + return DuplicateMatchesIterator.build(subs); + } + + @Override + public void visit(String field, QueryVisitor visitor) { + in.visit(field, visitor); + } + + @Override + public int minExtent() { + return in.minExtent(); + } + + @Override + public Collection pullUpDisjunctions() { + return Collections.singleton(this); + } + + @Override + public int hashCode() { + return Objects.hash(in, childCount); + } + + @Override + public boolean equals(Object other) { + if (other instanceof 
DuplicateIntervalsSource == false) return false; + DuplicateIntervalsSource o = (DuplicateIntervalsSource) other; + return Objects.equals(this.in, o.in) && Objects.equals(this.childCount, o.childCount); + } + + @Override + public String toString() { + String s = in.toString(); + StringBuilder out = new StringBuilder(s); + for (int i = 1; i < childCount; i++) { + out.append(",").append(s); + } + return out.toString(); + } + + private static class DuplicateIntervalIterator extends IntervalIterator { + + private final IntervalIterator in; + final int[] cache; + final int cacheLength; + int cacheBase; + boolean started = false; + boolean exhausted = false; + + private DuplicateIntervalIterator(IntervalIterator primary, int copies) { + this.in = primary; + this.cacheLength = copies; + this.cache = new int[this.cacheLength * 2]; + } + + @Override + public int start() { + return exhausted ? NO_MORE_INTERVALS : cache[(cacheBase % cacheLength) * 2]; + } + + @Override + public int end() { + return exhausted ? 
NO_MORE_INTERVALS : cache[(((cacheBase + cacheLength - 1) % cacheLength) * 2) + 1]; + } + + @Override + public int width() { + int width = 0; + for (int i = 0; i < cacheLength; i++) { + int pos = (cacheBase + i) % cacheLength; + width += cache[pos * 2] - cache[pos * 2 + 1] + 1; + } + return width; + } + + @Override + public int gaps() { + throw new UnsupportedOperationException(); + } + + @Override + public int nextInterval() throws IOException { + if (exhausted) { + return NO_MORE_INTERVALS; + } + if (started == false) { + for (int i = 0; i < cacheLength; i++) { + if (cacheNextInterval(i) == NO_MORE_INTERVALS) { + return NO_MORE_INTERVALS; + } + } + cacheBase = 0; + started = true; + return start(); + } + else { + int insert = (cacheBase + cacheLength) % cacheLength; + cacheBase = (cacheBase + 1) % cacheLength; + return cacheNextInterval(insert); + } + } + + private int cacheNextInterval(int linePos) throws IOException { + if (in.nextInterval() == NO_MORE_INTERVALS) { + exhausted = true; + return NO_MORE_INTERVALS; + } + cache[linePos * 2] = in.start(); + cache[linePos * 2 + 1] = in.end(); + return start(); + } + + @Override + public float matchCost() { + return in.matchCost(); + } + + @Override + public int docID() { + return in.docID(); + } + + @Override + public int nextDoc() throws IOException { + started = exhausted = false; + Arrays.fill(cache, -1); + return in.nextDoc(); + } + + @Override + public int advance(int target) throws IOException { + started = exhausted = false; + Arrays.fill(cache, -1); + return in.advance(target); + } + + @Override + public long cost() { + return in.cost(); + } + } + + private static class DuplicateMatchesIterator implements IntervalMatchesIterator { + + List subs; + boolean cached = false; + + static IntervalMatchesIterator build(List subs) throws IOException { + int count = subs.size(); + while (count > 0) { + for (int i = 0; i < count; i++) { + if (subs.get(count - 1).next() == false) { + return null; + } + } + count--; + } + 
return new DuplicateMatchesIterator(subs); + } + + private DuplicateMatchesIterator(List subs) throws IOException { + this.subs = subs; + } + + @Override + public boolean next() throws IOException { + if (cached == false) { + return cached = true; + } + if (subs.get(subs.size() - 1).next() == false) { + return false; + } + for (int i = 0; i < subs.size() - 1; i++) { + subs.get(i).next(); + } + return true; + } + + @Override + public int startPosition() { + return subs.get(0).startPosition(); + } + + @Override + public int endPosition() { + return subs.get(subs.size() - 1).endPosition(); + } + + @Override + public int startOffset() throws IOException { + return subs.get(0).startOffset(); + } + + @Override + public int endOffset() throws IOException { + return subs.get(subs.size() - 1).endOffset(); + } + + @Override + public MatchesIterator getSubMatches() throws IOException { + List subMatches = new ArrayList<>(); + for (MatchesIterator mi : subs) { + MatchesIterator sub = mi.getSubMatches(); + if (sub == null) { + sub = new ConjunctionIntervalsSource.SingletonMatchesIterator(mi); + } + subMatches.add(sub); + } + return MatchesUtils.disjunction(subMatches); + } + + @Override + public Query getQuery() { + throw new UnsupportedOperationException(); + } + + @Override + public int gaps() { + return 0; + } + + @Override + public int width() { + int width = endPosition() - startPosition() + 1; + for (MatchesIterator mi : subs) { + width = width - (mi.endPosition() - mi.startPosition() + 1); + } + return width; + } + } +} diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java index 90a82c537f0b..1bf6c37f52c1 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java @@ -21,16 +21,13 @@ import 
java.util.ArrayList; import java.util.Arrays; import java.util.Collection; -import java.util.Collections; import java.util.List; import java.util.Objects; import java.util.stream.Collectors; -import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.MatchesIterator; import org.apache.lucene.search.MatchesUtils; import org.apache.lucene.search.Query; -import org.apache.lucene.search.QueryVisitor; class OrderedIntervalsSource extends ConjunctionIntervalsSource { @@ -66,21 +63,12 @@ private static List deduplicate(List sources) current.add(source); } else { - if (current.size() == 1) { - deduplicated.add(current.get(0)); - } - else { - deduplicated.add(new DuplicateIntervalsSource(current.get(0), current.size())); - } + deduplicated.add(DuplicateIntervalsSource.build(current.get(0), current.size())); current.clear(); current.add(source); } } - if (current.size() == 1) { - deduplicated.add(current.get(0)); - } else { - deduplicated.add(new DuplicateIntervalsSource(current.get(0), current.size())); - } + deduplicated.add(DuplicateIntervalsSource.build(current.get(0), current.size())); return deduplicated; } @@ -190,265 +178,4 @@ protected void reset() throws IOException { } } - private static class DuplicateIntervalsSource extends IntervalsSource { - - final IntervalsSource in; - final int childCount; - - private DuplicateIntervalsSource(IntervalsSource in, int childCount) { - this.in = in; - this.childCount = childCount; - } - - @Override - public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException { - IntervalIterator it = in.intervals(field, ctx); - if (it == null) { - return null; - } - return new DuplicateIntervalIterator(it, childCount); - } - - @Override - public IntervalMatchesIterator matches(String field, LeafReaderContext ctx, int doc) throws IOException { - List subs = new ArrayList<>(); - for (int i = 0; i < childCount; i++) { - IntervalMatchesIterator mi = in.matches(field, ctx, doc); - if (mi == null) 
{ - return null; - } - subs.add(mi); - } - return DuplicateMatchesIterator.build(subs); - } - - @Override - public void visit(String field, QueryVisitor visitor) { - in.visit(field, visitor); - } - - @Override - public int minExtent() { - return in.minExtent(); - } - - @Override - public Collection pullUpDisjunctions() { - return Collections.singleton(this); - } - - @Override - public int hashCode() { - return Objects.hash(in, childCount); - } - - @Override - public boolean equals(Object other) { - if (other instanceof DuplicateIntervalsSource == false) return false; - DuplicateIntervalsSource o = (DuplicateIntervalsSource) other; - return Objects.equals(this.in, o.in) && Objects.equals(this.childCount, o.childCount); - } - - @Override - public String toString() { - String s = in.toString(); - StringBuilder out = new StringBuilder(s); - for (int i = 1; i < childCount; i++) { - out.append(",").append(s); - } - return out.toString(); - } - } - - private static class DuplicateIntervalIterator extends IntervalIterator { - - private final IntervalIterator in; - final int[] cache; - final int cacheLength; - int cacheBase; - boolean started = false; - boolean exhausted = false; - - private DuplicateIntervalIterator(IntervalIterator primary, int copies) { - this.in = primary; - this.cacheLength = copies; - this.cache = new int[this.cacheLength * 2]; - } - - @Override - public int start() { - return exhausted ? NO_MORE_INTERVALS : cache[(cacheBase % cacheLength) * 2]; - } - - @Override - public int end() { - return exhausted ? 
NO_MORE_INTERVALS : cache[(((cacheBase + cacheLength - 1) % cacheLength) * 2) + 1]; - } - - @Override - public int width() { - int width = 0; - for (int i = 0; i < cacheLength; i++) { - int pos = (cacheBase + i) % cacheLength; - width += cache[pos * 2] - cache[pos * 2 + 1] + 1; - } - return width; - } - - @Override - public int gaps() { - throw new UnsupportedOperationException(); - } - - @Override - public int nextInterval() throws IOException { - if (exhausted) { - return NO_MORE_INTERVALS; - } - if (started == false) { - for (int i = 0; i < cacheLength; i++) { - if (cacheNextInterval(i) == NO_MORE_INTERVALS) { - return NO_MORE_INTERVALS; - } - } - cacheBase = 0; - started = true; - return start(); - } - else { - int insert = (cacheBase + cacheLength) % cacheLength; - cacheBase = (cacheBase + 1) % cacheLength; - return cacheNextInterval(insert); - } - } - - private int cacheNextInterval(int linePos) throws IOException { - if (in.nextInterval() == NO_MORE_INTERVALS) { - exhausted = true; - return NO_MORE_INTERVALS; - } - cache[linePos * 2] = in.start(); - cache[linePos * 2 + 1] = in.end(); - return start(); - } - - @Override - public float matchCost() { - return in.matchCost(); - } - - @Override - public int docID() { - return in.docID(); - } - - @Override - public int nextDoc() throws IOException { - started = exhausted = false; - Arrays.fill(cache, -1); - return in.nextDoc(); - } - - @Override - public int advance(int target) throws IOException { - started = exhausted = false; - Arrays.fill(cache, -1); - return in.advance(target); - } - - @Override - public long cost() { - return in.cost(); - } - } - - private static class DuplicateMatchesIterator implements IntervalMatchesIterator { - - List subs; - boolean cached = false; - - static IntervalMatchesIterator build(List subs) throws IOException { - int count = subs.size(); - while (count > 0) { - for (int i = 0; i < count; i++) { - if (subs.get(count - 1).next() == false) { - return null; - } - } - count--; - } - 
return new DuplicateMatchesIterator(subs); - } - - private DuplicateMatchesIterator(List subs) throws IOException { - this.subs = subs; - } - - @Override - public boolean next() throws IOException { - if (cached == false) { - return cached = true; - } - if (subs.get(subs.size() - 1).next() == false) { - return false; - } - for (int i = 0; i < subs.size() - 1; i++) { - subs.get(i).next(); - } - return true; - } - - @Override - public int startPosition() { - return subs.get(0).startPosition(); - } - - @Override - public int endPosition() { - return subs.get(subs.size() - 1).endPosition(); - } - - @Override - public int startOffset() throws IOException { - return subs.get(0).startOffset(); - } - - @Override - public int endOffset() throws IOException { - return subs.get(subs.size() - 1).endOffset(); - } - - @Override - public MatchesIterator getSubMatches() throws IOException { - List subMatches = new ArrayList<>(); - for (MatchesIterator mi : subs) { - MatchesIterator sub = mi.getSubMatches(); - if (sub == null) { - sub = new SingletonMatchesIterator(mi); - } - subMatches.add(sub); - } - return MatchesUtils.disjunction(subMatches); - } - - @Override - public Query getQuery() { - throw new UnsupportedOperationException(); - } - - @Override - public int gaps() { - return 0; - } - - @Override - public int width() { - int width = endPosition() - startPosition() + 1; - for (MatchesIterator mi : subs) { - width = width - (mi.endPosition() - mi.startPosition() + 1); - } - return width; - } - } - } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java index 2af850ee7f51..15eeb7dc1905 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java @@ -21,7 +21,9 @@ import java.util.ArrayList; import 
java.util.Arrays; import java.util.Collection; +import java.util.HashMap; import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.stream.Collectors; @@ -33,7 +35,19 @@ static IntervalsSource build(List sources) { if (sources.size() == 1) { return sources.get(0); } - return new UnorderedIntervalsSource(flatten(sources)); + return new UnorderedIntervalsSource(deduplicate(flatten(sources))); + } + + private static List deduplicate(List sources) { + Map counts = new HashMap<>(); + for (IntervalsSource source : sources) { + counts.compute(source, (k, v) -> v == null ? 1 : v + 1); + } + List deduplicated = new ArrayList<>(); + for (IntervalsSource source : counts.keySet()) { + deduplicated.add(DuplicateIntervalsSource.build(source, counts.get(source))); + } + return deduplicated; } private static List flatten(List sources) { @@ -94,9 +108,8 @@ private static class UnorderedIntervalIterator extends ConjunctionIntervalIterat private final PriorityQueue queue; private final IntervalIterator[] subIterators; - private final int[] innerPositions; - int start = -1, end = -1, firstEnd, queueEnd; + int start = -1, end = -1, slop, queueEnd; UnorderedIntervalIterator(List subIterators) { super(subIterators); @@ -107,7 +120,6 @@ protected boolean lessThan(IntervalIterator a, IntervalIterator b) { } }; this.subIterators = new IntervalIterator[subIterators.size()]; - this.innerPositions = new int[subIterators.size() * 2]; for (int i = 0; i < subIterators.size(); i++) { this.subIterators[i] = subIterators.get(i); @@ -146,8 +158,11 @@ public int nextInterval() throws IOException { // then, minimize it do { start = queue.top().start(); - firstEnd = queue.top().end(); end = queueEnd; + slop = width(); + for (IntervalIterator it : subIterators) { + slop -= it.width(); + } if (queue.top().end() == end) return start; IntervalIterator it = queue.pop(); @@ -161,22 +176,7 @@ public int nextInterval() throws IOException { @Override public int gaps() { - for (int 
i = 0; i < subIterators.length; i++) { - if (subIterators[i].end() > end) { - innerPositions[i * 2] = start; - innerPositions[i * 2 + 1] = firstEnd; - } - else { - innerPositions[i * 2] = subIterators[i].start(); - innerPositions[i * 2 + 1] = subIterators[i].end(); - } - } - Arrays.sort(innerPositions); - int gaps = 0; - for (int i = 1; i < subIterators.length; i++) { - gaps += (innerPositions[i * 2] - innerPositions[i * 2 - 1] - 1); - } - return gaps; + return slop; } @Override diff --git a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java index 194576662de5..5ab52f671bc5 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java @@ -363,6 +363,28 @@ public void testUnorderedNearIntervals() throws IOException { checkVisits(source, 3, "pease", "hot"); } + public void testUnorderedWithRepeats() throws IOException { + IntervalsSource source = Intervals.unordered(Intervals.term("pease"), Intervals.term("pease"), Intervals.term("hot")); + checkIntervals(source, "field1", 3, new int[][]{ + {}, { 0, 3, 2, 6, 3, 17 }, { 0, 5, 3, 6 }, {}, { 0, 3, 2, 6, 3, 17 }, {} + }); + MatchesIterator mi = getMatches(source, 1, "field1"); + assertMatch(mi, 0, 3, 0, 25); + MatchesIterator sub = mi.getSubMatches(); + assertNotNull(sub); + assertMatch(sub, 0, 0, 0, 5); + assertMatch(sub, 2, 2, 15, 18); + assertMatch(sub, 3, 3, 20, 25); + } + + public void testUnorderedWithRepeatsAndMaxGaps() throws IOException { + IntervalsSource source = Intervals.maxgaps(2, + Intervals.unordered(Intervals.term("pease"), Intervals.term("pease"), Intervals.term("hot"))); + checkIntervals(source, "field1", 3, new int[][]{ + {}, { 0, 3, 2, 6 }, { 3, 6 }, {}, { 0, 3, 2, 6 }, {} + }); + } + public void testIntervalDisjunction() throws IOException { IntervalsSource source = 
Intervals.or(Intervals.term("pease"), Intervals.term("hot"), Intervals.term("notMatching")); checkIntervals(source, "field1", 4, new int[][]{ From 7c805feaf3cd0c7b8385f34e08adaf136ee4743a Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Wed, 18 Dec 2019 09:54:34 +0000 Subject: [PATCH 3/6] javadocs, cleanups --- .../queries/intervals/IntervalIterator.java | 3 ++ .../intervals/IntervalMatchesIterator.java | 17 ++++---- .../MinimumShouldMatchIntervalsSource.java | 33 +++------------ .../intervals/OrderedIntervalsSource.java | 12 +++--- ...rce.java => RepeatingIntervalsSource.java} | 41 +++++++++++++++---- .../intervals/UnorderedIntervalsSource.java | 15 +++++-- .../queries/intervals/TestIntervals.java | 10 +++++ .../intervals/TestSimplifications.java | 14 +++++++ 8 files changed, 91 insertions(+), 54 deletions(-) rename lucene/queries/src/java/org/apache/lucene/queries/intervals/{DuplicateIntervalsSource.java => RepeatingIntervalsSource.java} (87%) diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalIterator.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalIterator.java index a1d8da4d4011..c5fbf2ace032 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalIterator.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalIterator.java @@ -72,6 +72,9 @@ public abstract class IntervalIterator extends DocIdSetIterator { */ public abstract int gaps(); + /** + * The width of the current interval + */ public int width() { return end() - start() + 1; } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalMatchesIterator.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalMatchesIterator.java index 06f057515181..63e5b5ec418e 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalMatchesIterator.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalMatchesIterator.java @@ 
-20,21 +20,24 @@ import org.apache.lucene.search.MatchesIterator; /** - * An extension of MatchesIterator that allows the gaps from a wrapped - * IntervalIterator to be reported. + * An extension of MatchesIterator that allows it to be treated as + * an IntervalIterator * - * This is necessary because {@link MatchesIterator#getSubMatches()} returns - * the submatches of all nested matches as a flat iterator, but - * {@link IntervalIterator#gaps()} only returns the gaps between its immediate - * sub-matches, so we can't calculate the latter using the former. + * This is necessary to get access to {@link IntervalIterator#gaps()} + * and {@link IntervalIterator#width()} when constructing matches */ -interface IntervalMatchesIterator extends MatchesIterator { +public interface IntervalMatchesIterator extends MatchesIterator { /** * The number of top-level gaps inside the current match + * @see IntervalIterator#gaps() */ int gaps(); + /** + * The width of the current match + * @see IntervalIterator#width() + */ int width(); } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/MinimumShouldMatchIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/MinimumShouldMatchIntervalsSource.java index 08e80f034baa..72ff1eabee3c 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/MinimumShouldMatchIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/MinimumShouldMatchIntervalsSource.java @@ -155,10 +155,9 @@ static class MinimumShouldMatchIntervalIterator extends IntervalIterator { private final PriorityQueue backgroundQueue; private final float matchCost; private final int minShouldMatch; - private final int[] innerPositions; private final Collection currentIterators = new ArrayList<>(); - private int start, end, queueEnd, firstEnd; + private int start, end, queueEnd, slop; private IntervalIterator lead; MinimumShouldMatchIntervalIterator(Collection subs, int minShouldMatch) { 
@@ -171,7 +170,6 @@ static class MinimumShouldMatchIntervalIterator extends IntervalIterator { this.approximation = new DisjunctionDISIApproximation(disiQueue); this.matchCost = mc; this.minShouldMatch = minShouldMatch; - this.innerPositions = new int[minShouldMatch * 2]; this.proximityQueue = new PriorityQueue(minShouldMatch) { @Override @@ -199,29 +197,7 @@ public int end() { @Override public int gaps() { - int i = 0; - for (IntervalIterator it : proximityQueue) { - if (it.end() > end) { - innerPositions[i * 2] = start; - innerPositions[i * 2 + 1] = firstEnd; - } - else { - innerPositions[i * 2] = it.start(); - innerPositions[i * 2 + 1] = it.end(); - } - i++; - } - if (proximityQueue.size() < minShouldMatch) { - // the leading iterator has been exhausted and removed from the queue - innerPositions[i * 2] = start; - innerPositions[i * 2 + 1] = firstEnd; - } - Arrays.sort(innerPositions); - int gaps = 0; - for (int j = 1; j < minShouldMatch; j++) { - gaps += (innerPositions[j * 2] - innerPositions[j * 2 - 1] - 1); - } - return gaps; + return slop; } @Override @@ -242,8 +218,11 @@ public int nextInterval() throws IOException { // then, minimize it do { start = proximityQueue.top().start(); - firstEnd = proximityQueue.top().end(); end = queueEnd; + slop = width(); + for (IntervalIterator it : proximityQueue) { + slop -= it.width(); + } if (proximityQueue.top().end() == end) return start; lead = proximityQueue.pop(); diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java index 1bf6c37f52c1..6331688c0c82 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OrderedIntervalsSource.java @@ -19,16 +19,11 @@ import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collection; import 
java.util.List; import java.util.Objects; import java.util.stream.Collectors; -import org.apache.lucene.search.MatchesIterator; -import org.apache.lucene.search.MatchesUtils; -import org.apache.lucene.search.Query; - class OrderedIntervalsSource extends ConjunctionIntervalsSource { static IntervalsSource build(List sources) { @@ -63,12 +58,15 @@ private static List deduplicate(List sources) current.add(source); } else { - deduplicated.add(DuplicateIntervalsSource.build(current.get(0), current.size())); + deduplicated.add(RepeatingIntervalsSource.build(current.get(0), current.size())); current.clear(); current.add(source); } } - deduplicated.add(DuplicateIntervalsSource.build(current.get(0), current.size())); + deduplicated.add(RepeatingIntervalsSource.build(current.get(0), current.size())); + if (deduplicated.size() == 1 && deduplicated.get(0) instanceof RepeatingIntervalsSource) { + ((RepeatingIntervalsSource)deduplicated.get(0)).setName("ORDERED"); + } return deduplicated; } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/DuplicateIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/RepeatingIntervalsSource.java similarity index 87% rename from lucene/queries/src/java/org/apache/lucene/queries/intervals/DuplicateIntervalsSource.java rename to lucene/queries/src/java/org/apache/lucene/queries/intervals/RepeatingIntervalsSource.java index 1244f5262eea..50593dc7d647 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/DuplicateIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/RepeatingIntervalsSource.java @@ -31,24 +31,40 @@ import org.apache.lucene.search.Query; import org.apache.lucene.search.QueryVisitor; -class DuplicateIntervalsSource extends IntervalsSource { +/** + * Generates an iterator that spans repeating instances of a sub-iterator, + * avoiding minimization. 
This is useful for repeated terms within an + * unordered interval, for example, ensuring that multiple iterators do + * not match on a single term. + * + * The generated iterators have a specialized {@link IntervalIterator#width()} + * implementation that sums up the widths of the individual sub-iterators, + * rather than just returning the full span of the iterator. + */ +class RepeatingIntervalsSource extends IntervalsSource { static IntervalsSource build(IntervalsSource in, int childCount) { if (childCount == 1) { return in; } assert childCount > 0; - return new DuplicateIntervalsSource(in, childCount); + return new RepeatingIntervalsSource(in, childCount); } final IntervalsSource in; final int childCount; - private DuplicateIntervalsSource(IntervalsSource in, int childCount) { + String name; + + private RepeatingIntervalsSource(IntervalsSource in, int childCount) { this.in = in; this.childCount = childCount; } + public void setName(String name) { + this.name = name; + } + @Override public IntervalIterator intervals(String field, LeafReaderContext ctx) throws IOException { IntervalIterator it = in.intervals(field, ctx); @@ -93,8 +109,8 @@ public int hashCode() { @Override public boolean equals(Object other) { - if (other instanceof DuplicateIntervalsSource == false) return false; - DuplicateIntervalsSource o = (DuplicateIntervalsSource) other; + if (other instanceof RepeatingIntervalsSource == false) return false; + RepeatingIntervalsSource o = (RepeatingIntervalsSource) other; return Objects.equals(this.in, o.in) && Objects.equals(this.childCount, o.childCount); } @@ -105,6 +121,9 @@ public String toString() { for (int i = 1; i < childCount; i++) { out.append(",").append(s); } + if (name != null) { + return name + "(" + out.toString() + ")"; + } return out.toString(); } @@ -145,7 +164,7 @@ public int width() { @Override public int gaps() { - throw new UnsupportedOperationException(); + return super.width() - width(); } @Override @@ -286,14 +305,18 @@ public 
Query getQuery() { @Override public int gaps() { - return 0; + int width = endPosition() - startPosition() + 1; + for (MatchesIterator mi : subs) { + width = width - (mi.endPosition() - mi.startPosition() + 1); + } + return width; } @Override public int width() { - int width = endPosition() - startPosition() + 1; + int width = 0; for (MatchesIterator mi : subs) { - width = width - (mi.endPosition() - mi.startPosition() + 1); + width += (mi.endPosition() - mi.startPosition() + 1); } return width; } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java index 15eeb7dc1905..cf1aeaa00a12 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java @@ -19,9 +19,9 @@ import java.io.IOException; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collection; import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.List; import java.util.Map; import java.util.Objects; @@ -35,17 +35,24 @@ static IntervalsSource build(List sources) { if (sources.size() == 1) { return sources.get(0); } - return new UnorderedIntervalsSource(deduplicate(flatten(sources))); + List rewritten = deduplicate(flatten(sources)); + if (rewritten.size() == 1) { + return rewritten.get(0); + } + return new UnorderedIntervalsSource(rewritten); } private static List deduplicate(List sources) { - Map counts = new HashMap<>(); + Map counts = new LinkedHashMap<>(); // preserve order for testing for (IntervalsSource source : sources) { counts.compute(source, (k, v) -> v == null ? 
1 : v + 1); } List deduplicated = new ArrayList<>(); for (IntervalsSource source : counts.keySet()) { - deduplicated.add(DuplicateIntervalsSource.build(source, counts.get(source))); + deduplicated.add(RepeatingIntervalsSource.build(source, counts.get(source))); + } + if (deduplicated.size() == 1 && deduplicated.get(0) instanceof RepeatingIntervalsSource) { + ((RepeatingIntervalsSource)deduplicated.get(0)).setName("UNORDERED"); } return deduplicated; } diff --git a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java index 5ab52f671bc5..a2d783ebbf8d 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestIntervals.java @@ -682,6 +682,16 @@ public void testMaxGapsWithRepeats() throws IOException { assertGaps(source, 2, "field1", new int[]{ 3 }); } + public void testMaxGapsWithOnlyRepeats() throws IOException { + IntervalsSource source = Intervals.maxgaps(1, Intervals.ordered( + Intervals.or(Intervals.term("pease"), Intervals.term("hot")), Intervals.or(Intervals.term("pease"), Intervals.term("hot")) + )); + checkIntervals(source, "field1", 3, new int[][]{ + {}, { 0, 2, 2, 3 }, { 3, 5, 5, 6 }, {}, { 0, 2, 2, 3 }, {} + }); + assertGaps(source, 1, "field1", new int[]{ 1, 0 }); + } + public void testNestedMaxGaps() throws IOException { IntervalsSource source = Intervals.maxgaps(1, Intervals.unordered( diff --git a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestSimplifications.java b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestSimplifications.java index b54c2a1b56bb..76a4857da386 100644 --- a/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestSimplifications.java +++ b/lucene/queries/src/test/org/apache/lucene/queries/intervals/TestSimplifications.java @@ -38,12 +38,26 @@ public void testOrdered() { 
assertEquals(Intervals.term("term"), actual); } + public void testOrderedWithDuplicates() { + IntervalsSource actual = Intervals.ordered(Intervals.term("term"), Intervals.term("term")); + assertEquals("ORDERED(term,term)", actual.toString()); + actual = Intervals.ordered(Intervals.term("term"), Intervals.term("term"), Intervals.term("bar")); + assertEquals("ORDERED(term,term,bar)", actual.toString()); + } + public void testUnordered() { // UNORDERED(term) => term IntervalsSource actual = Intervals.unordered(Intervals.term("term")); assertEquals(Intervals.term("term"), actual); } + public void testUnorderedWithDuplicates() { + IntervalsSource actual = Intervals.unordered(Intervals.term("term"), Intervals.term("term")); + assertEquals("UNORDERED(term,term)", actual.toString()); + actual = Intervals.unordered(Intervals.term("term"), Intervals.term("term"), Intervals.term("bar")); + assertEquals("UNORDERED(term,term,bar)", actual.toString()); + } + public void testUnorderedOverlaps() { // UNORDERED_NO_OVERLAPS(term, term) => ORDERED(term, term) IntervalsSource actual = Intervals.unorderedNoOverlaps(Intervals.term("term"), Intervals.term("term")); From 39745c329f7d24897d1fe1cc655d8bd7c4b4a40a Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Thu, 6 Feb 2020 12:30:44 +0000 Subject: [PATCH 4/6] imports --- .../lucene/queries/intervals/DifferenceIntervalsSource.java | 1 - .../lucene/queries/intervals/DisjunctionIntervalsSource.java | 1 - .../apache/lucene/queries/intervals/ExtendedIntervalsSource.java | 1 - .../apache/lucene/queries/intervals/FilteredIntervalsSource.java | 1 - .../lucene/queries/intervals/FixedFieldIntervalsSource.java | 1 - .../org/apache/lucene/queries/intervals/IntervalMatches.java | 1 + .../apache/lucene/queries/intervals/OffsetIntervalsSource.java | 1 - .../lucene/queries/intervals/UnorderedIntervalsSource.java | 1 - 8 files changed, 1 insertion(+), 7 deletions(-) diff --git 
a/lucene/queries/src/java/org/apache/lucene/queries/intervals/DifferenceIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/DifferenceIntervalsSource.java index ad13667c94e5..41149f991635 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/DifferenceIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/DifferenceIntervalsSource.java @@ -21,7 +21,6 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.MatchesIterator; import org.apache.lucene.search.QueryVisitor; abstract class DifferenceIntervalsSource extends IntervalsSource { diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/DisjunctionIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/DisjunctionIntervalsSource.java index 9e74838648f1..5a1e6728d90d 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/DisjunctionIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/DisjunctionIntervalsSource.java @@ -31,7 +31,6 @@ import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.DocIdSetIterator; import org.apache.lucene.search.MatchesIterator; -import org.apache.lucene.search.MatchesUtils; import org.apache.lucene.search.Query; import org.apache.lucene.search.QueryVisitor; import org.apache.lucene.util.PriorityQueue; diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/ExtendedIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/ExtendedIntervalsSource.java index 4a67aa14a72e..27f57735ba85 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/ExtendedIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/ExtendedIntervalsSource.java @@ -24,7 +24,6 @@ import java.util.stream.Collectors; import 
org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.search.MatchesIterator; import org.apache.lucene.search.QueryVisitor; class ExtendedIntervalsSource extends IntervalsSource { diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/FilteredIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/FilteredIntervalsSource.java index 3118095af4ca..adf147e0e26e 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/FilteredIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/FilteredIntervalsSource.java @@ -24,7 +24,6 @@ import java.util.stream.Collectors; import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.search.MatchesIterator; import org.apache.lucene.search.QueryVisitor; /** diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/FixedFieldIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/FixedFieldIntervalsSource.java index c8ce33a36a9a..95dea3a97669 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/FixedFieldIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/FixedFieldIntervalsSource.java @@ -24,7 +24,6 @@ import java.util.stream.Collectors; import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.search.MatchesIterator; import org.apache.lucene.search.QueryVisitor; class FixedFieldIntervalsSource extends IntervalsSource { diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalMatches.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalMatches.java index 99c79c302300..82d9d093a293 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalMatches.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/IntervalMatches.java @@ -161,6 +161,7 @@ public int nextDoc() { state = State.ITERATING; return doc; case ITERATING: + 
case NO_MORE_INTERVALS: state = State.EXHAUSTED; case EXHAUSTED: } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OffsetIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OffsetIntervalsSource.java index 370d59b532a4..1f96b40a5b28 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/OffsetIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/OffsetIntervalsSource.java @@ -24,7 +24,6 @@ import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.BooleanClause; -import org.apache.lucene.search.MatchesIterator; import org.apache.lucene.search.QueryVisitor; /** diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java index cf1aeaa00a12..e5dfeac96083 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java @@ -20,7 +20,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collection; -import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; From 91c257964afa72aaf49626c9e85a144ba21c969a Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Thu, 6 Feb 2020 13:59:13 +0000 Subject: [PATCH 5/6] changes --- lucene/CHANGES.txt | 7 +++++++ .../queries/intervals/DisjunctionIntervalsSource.java | 11 +---------- .../queries/intervals/RepeatingIntervalsSource.java | 1 - .../queries/intervals/UnorderedIntervalsSource.java | 1 + 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 269e3dda81da..32281ed0ac69 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -173,6 +173,13 @@ Bug Fixes * LUCENE-9200: Fix TieredMergePolicy to use double (not float) 
math to make its merging decisions, fixing a corner-case bug uncovered by fun randomized tests (Robert Muir, Mike McCandless) +* LUCENE-9099: Unordered and Ordered interval queries now correctly handle + repeated subterms - ordered intervals could supply an 'extra' minimized + interval, resulting in odd matches when combined with e.g. CONTAINS queries; + and unordered intervals would match duplicate subterms on the same position, + so a query for UNORDERED(foo, foo) would match a document containing 'foo' + only once. (Alan Woodward) + Other --------------------- diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/DisjunctionIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/DisjunctionIntervalsSource.java index 5a1e6728d90d..f4ee1970823d 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/DisjunctionIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/DisjunctionIntervalsSource.java @@ -205,12 +205,7 @@ private void reset() throws IOException { } int currentOrd() { - if (current == EMPTY) { - return -1; - } - if (current == EXHAUSTED) { - return NO_MORE_INTERVALS; - } + assert current != EMPTY && current != EXHAUSTED; for (int i = 0; i < iterators.size(); i++) { if (iterators.get(i) == current) { return i; @@ -395,28 +390,24 @@ public int endPosition() { @Override public int startOffset() throws IOException { int ord = it.currentOrd(); - assert ord != -1 && ord != IntervalIterator.NO_MORE_INTERVALS; return subs.get(ord).startOffset(); } @Override public int endOffset() throws IOException { int ord = it.currentOrd(); - assert ord != -1 && ord != IntervalIterator.NO_MORE_INTERVALS; return subs.get(ord).endOffset(); } @Override public MatchesIterator getSubMatches() throws IOException { int ord = it.currentOrd(); - assert ord != -1 && ord != IntervalIterator.NO_MORE_INTERVALS; return subs.get(ord).getSubMatches(); } @Override public Query getQuery() { int ord =
it.currentOrd(); - assert ord != -1 && ord != IntervalIterator.NO_MORE_INTERVALS; return subs.get(ord).getQuery(); } diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/RepeatingIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/RepeatingIntervalsSource.java index 50593dc7d647..1a5c891b3d94 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/RepeatingIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/RepeatingIntervalsSource.java @@ -53,7 +53,6 @@ static IntervalsSource build(IntervalsSource in, int childCount) { final IntervalsSource in; final int childCount; - String name; private RepeatingIntervalsSource(IntervalsSource in, int childCount) { diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java index e5dfeac96083..cf1aeaa00a12 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java +++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collection; +import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map; From 4186a20d4098869e0e747f160985846e25e5f53c Mon Sep 17 00:00:00 2001 From: Alan Woodward Date: Thu, 6 Feb 2020 14:13:01 +0000 Subject: [PATCH 6/6] imports *again*, sake --- .../lucene/queries/intervals/UnorderedIntervalsSource.java | 1 - 1 file changed, 1 deletion(-) diff --git a/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java b/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java index cf1aeaa00a12..e5dfeac96083 100644 --- a/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java 
+++ b/lucene/queries/src/java/org/apache/lucene/queries/intervals/UnorderedIntervalsSource.java @@ -20,7 +20,6 @@ import java.io.IOException; import java.util.ArrayList; import java.util.Collection; -import java.util.HashMap; import java.util.LinkedHashMap; import java.util.List; import java.util.Map;