Skip to content

Commit

Permalink
LUCENE-10078: Enable merge-on-refresh by default. (#921)
Browse files Browse the repository at this point in the history
This gives implementations of `findFullFlushMerges` to `LogMergePolicy` and
`TieredMergePolicy` and enables merge-on-refresh with a default timeout of
500ms.

The idea behind the 500ms default is that it felt both high enough to have time
to run merges of small segments, and low enough that the freshness of the data
wouldn't be badly affected for users who have high refresh rates (e.g.
refreshing every second).

In both cases, `findFullFlushMerges` delegates to `findMerges` and filters
merges whose segments are all below the min/floor size.
  • Loading branch information
jpountz authored Jun 7, 2022
1 parent 7e9d5ab commit b5795db
Show file tree
Hide file tree
Showing 15 changed files with 192 additions and 25 deletions.
3 changes: 3 additions & 0 deletions lucene/CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ New Features
Improvements
---------------------

* LUCENE-10078: Merge on full flush is now enabled by default with a timeout of
500ms. (Adrien Grand)

* LUCENE-10585: Facet module code cleanup (copy/paste scrubbing, simplification and some very minor
optimization tweaks). (Greg Miller)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -124,4 +124,9 @@ public int numDeletesToMerge(
public MergePolicy unwrap() {
return in;
}

// Delegate the full-flush merge size threshold to the wrapped policy so that
// wrapping a policy does not silently disable its merge-on-refresh behavior.
@Override
protected long maxFullFlushMergeSize() {
return in.maxFullFlushMergeSize();
}
}
8 changes: 8 additions & 0 deletions lucene/core/src/java/org/apache/lucene/index/IndexWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -2372,6 +2372,10 @@ public synchronized Set<SegmentCommitInfo> getMergingSegments() {
* @lucene.experimental
*/
private synchronized MergePolicy.OneMerge getNextMerge() {
if (tragedy.get() != null) {
throw new IllegalStateException(
"this writer hit an unrecoverable error; cannot merge", tragedy.get());
}
if (pendingMerges.size() == 0) {
return null;
} else {
Expand All @@ -2388,6 +2392,10 @@ private synchronized MergePolicy.OneMerge getNextMerge() {
* @lucene.experimental
*/
// Returns true if there are queued merges that have not yet been executed.
// Fails fast if the writer previously hit an unrecoverable error ("tragedy"),
// since no further merging is allowed in that state.
public synchronized boolean hasPendingMerges() {
if (tragedy.get() != null) {
throw new IllegalStateException(
"this writer hit an unrecoverable error; cannot merge", tragedy.get());
}
return pendingMerges.size() != 0;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ public enum OpenMode {
* Default value for time to wait for merges on commit or getReader (when using a {@link
* MergePolicy} that implements {@link MergePolicy#findFullFlushMerges}).
*/
public static final long DEFAULT_MAX_FULL_FLUSH_MERGE_WAIT_MILLIS = 0;
public static final long DEFAULT_MAX_FULL_FLUSH_MERGE_WAIT_MILLIS = 500;

// indicates whether this config instance is already attached to a writer.
// not final so that it can be cloned properly.
Expand Down Expand Up @@ -457,9 +457,14 @@ public IndexWriterConfig setCommitOnClose(boolean commitOnClose) {
* call, like natural segment merges. The default is <code>
* {@value IndexWriterConfig#DEFAULT_MAX_FULL_FLUSH_MERGE_WAIT_MILLIS}</code>.
*
* <p>Note: This settings has no effect unless {@link
* MergePolicy#findFullFlushMerges(MergeTrigger, SegmentInfos, MergePolicy.MergeContext)} has an
* implementation that actually returns merges which by default doesn't return any merges.
* <p>Note: Which segments would get merged depends on the implementation of {@link
* MergePolicy#findFullFlushMerges(MergeTrigger, SegmentInfos, MergePolicy.MergeContext)}
*
* <p>Note: Set to 0 to disable merging on full flush.
*
* <p>Note: If {@link SerialMergeScheduler} is used and a non-zero timeout is configured,
* full-flush merges will always wait for the merge to finish without honoring the configured
* timeout.
*/
public IndexWriterConfig setMaxFullFlushMergeWaitMillis(long maxFullFlushMergeWaitMillis) {
this.maxFullFlushMergeWaitMillis = maxFullFlushMergeWaitMillis;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,9 +96,10 @@ public double getMaxMergeMBForForcedMerge() {
}

/**
* Sets the minimum size for the lowest level segments. Any segments below this size will be
* merged more aggressively in order to avoid having a long tail of small segments. Large values
* of this parameter increase the merging cost during indexing if you flush small segments.
* Sets the minimum size for the lowest level segments. Any segments below this size are
* candidates for full-flush merges and will be merged more aggressively in order to avoid
* having a long tail of small segments. Large values of this parameter increase the merging
* cost during indexing if you flush small segments.
*/
public void setMinMergeMB(double mb) {
minMergeSize = (long) (mb * 1024 * 1024);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -43,9 +43,10 @@ protected long size(SegmentCommitInfo info, MergeContext mergeContext) throws IO
}

/**
* Sets the minimum size for the lowest level segments. Any segments below this size will be
* merged more aggressively in order to avoid having a long tail of small segments. Large values
* of this parameter increase the merging cost during indexing if you flush small segments.
* Sets the minimum size for the lowest level segments. Any segments below this size are
* candidates for full-flush merges and merged more aggressively in order to avoid having a long
* tail of small segments. Large values of this parameter increase the merging cost during
* indexing if you flush small segments.
*/
public void setMinMergeDocs(int minMergeDocs) {
minMergeSize = minMergeDocs;
Expand Down
13 changes: 12 additions & 1 deletion lucene/core/src/java/org/apache/lucene/index/LogMergePolicy.java
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,9 @@
* specifies how a segment's size is determined. {@link LogDocMergePolicy} is one subclass that
* measures size by document count in the segment. {@link LogByteSizeMergePolicy} is another
* subclass that measures size as the total byte size of the file(s) for the segment.
*
* <p><b>NOTE</b>: This policy returns natural merges whose size is below the {@link #minMergeSize
* minimum merge size} for {@link #findFullFlushMerges full-flush merges}.
*/
public abstract class LogMergePolicy extends MergePolicy {

Expand Down Expand Up @@ -64,7 +67,10 @@ public abstract class LogMergePolicy extends MergePolicy {
/** How many segments to merge at a time. */
protected int mergeFactor = DEFAULT_MERGE_FACTOR;

/** Any segments whose size is smaller than this value will be merged more aggressively. */
/**
* Any segments whose size is smaller than this value will be candidates for full-flush merges and
* merged more aggressively.
*/
protected long minMergeSize;

/** If the size of a segment exceeds this value then it will never be merged. */
Expand Down Expand Up @@ -178,6 +184,11 @@ protected boolean isMerged(
&& (numToMerge != 1 || !segmentIsOriginal || isMerged(infos, mergeInfo, mergeContext));
}

@Override
protected long maxFullFlushMergeSize() {
// Reuse the policy's minimum merge size as the full-flush threshold: segments
// smaller than the lowest level are the ones worth merging before a refresh/commit.
return minMergeSize;
}

/**
* Returns the merges necessary to merge the index, taking the max merge size or max merge docs
* into consideration. This method attempts to respect the {@code maxNumSegments} parameter,
Expand Down
37 changes: 33 additions & 4 deletions lucene/core/src/java/org/apache/lucene/index/MergePolicy.java
Original file line number Diff line number Diff line change
Expand Up @@ -601,9 +601,9 @@ public abstract MergeSpecification findForcedDeletesMerges(
SegmentInfos segmentInfos, MergeContext mergeContext) throws IOException;

/**
* Identifies merges that we want to execute (synchronously) on commit. By default, this will do
* no merging on commit. If you implement this method in your {@code MergePolicy} you must also
* set a non-zero timeout using {@link IndexWriterConfig#setMaxFullFlushMergeWaitMillis}.
* Identifies merges that we want to execute (synchronously) on commit. By default, this will
* return {@link #findMerges natural merges} whose segments are all less than the {@link
* #maxFullFlushMergeSize() max segment size for full flushes}.
*
* <p>Any merges returned here will make {@link IndexWriter#commit()}, {@link
* IndexWriter#prepareCommit()} or {@link IndexWriter#getReader(boolean, boolean)} block until the
Expand All @@ -628,7 +628,28 @@ public abstract MergeSpecification findForcedDeletesMerges(
public MergeSpecification findFullFlushMerges(
MergeTrigger mergeTrigger, SegmentInfos segmentInfos, MergeContext mergeContext)
throws IOException {
// Default behavior: start from the natural merges, then only keep those whose
// segments are all strictly below the full-flush size threshold.
final MergeSpecification naturalMerges = findMerges(mergeTrigger, segmentInfos, mergeContext);
if (naturalMerges == null) {
return null;
}
MergeSpecification filteredMerges = null;
for (OneMerge candidate : naturalMerges.merges) {
boolean allSegmentsBelowThreshold = true;
for (SegmentCommitInfo segment : candidate.segments) {
if (size(segment, mergeContext) >= maxFullFlushMergeSize()) {
// At least one segment reaches the threshold: this merge is too large to
// run synchronously on a full flush.
allSegmentsBelowThreshold = false;
break;
}
}
if (allSegmentsBelowThreshold == false) {
continue;
}
// Lazily allocate the result so that we can return null when nothing qualifies.
if (filteredMerges == null) {
filteredMerges = new MergeSpecification();
}
filteredMerges.add(candidate);
}
return filteredMerges;
}

/**
Expand Down Expand Up @@ -671,6 +692,14 @@ protected long size(SegmentCommitInfo info, MergeContext mergeContext) throws IO
return (info.info.maxDoc() <= 0 ? byteSize : (long) (byteSize * (1.0 - delRatio)));
}

/**
* Return the maximum size of segments to be included in full-flush merges by the default
* implementation of {@link #findFullFlushMerges}. The default of {@code 0} makes no segment
* eligible, effectively disabling merging on full flush unless a subclass overrides this
* method to return a positive threshold.
*/
protected long maxFullFlushMergeSize() {
return 0L;
}

/** Asserts that the delCount for this SegmentCommitInfo is valid */
protected final boolean assertDelCount(int delCount, SegmentCommitInfo info) {
assert delCount >= 0 : "delCount must be positive: " + delCount;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,10 @@
*
* <p>findForcedDeletesMerges should never produce segments greater than maxSegmentSize.
*
* <p><b>NOTE</b>: This policy returns natural merges whose size is below the {@link
* #setFloorSegmentMB(double) floor segment size} for {@link #findFullFlushMerges full-flush
* merges}.
*
* @lucene.experimental
*/

Expand Down Expand Up @@ -168,9 +172,16 @@ public double getDeletesPctAllowed() {
}

/**
* Segments smaller than this are "rounded up" to this size, ie treated as equal (floor) size for
* merge selection. This is to prevent frequent flushing of tiny segments from allowing a long
* tail in the index. Default is 2 MB.
* Segments smaller than this size are merged more aggressively:
*
* <ul>
* <li>They are candidates for full-flush merges, in order to reduce the number of segments in
* the index prior to opening a new point-in-time view of the index.
* <li>For background merges, smaller segments are "rounded up" to this size.
* </ul>
*
* In both cases, this helps prevent frequent flushing of tiny segments to create a long tail of
* small segments in the index. Default is 2MB.
*/
public TieredMergePolicy setFloorSegmentMB(double v) {
if (v <= 0.0) {
Expand All @@ -190,6 +201,11 @@ public double getFloorSegmentMB() {
return floorSegmentBytes / (1024 * 1024.);
}

@Override
protected long maxFullFlushMergeSize() {
// Segments below the floor size are considered "too small" and are the ones we
// want to merge away synchronously on full flush (commit/refresh).
return floorSegmentBytes;
}

/**
* When forceMergeDeletes is called, we only merge away a segment if its delete percentage is over
* this threshold. Default is 10%.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -815,7 +815,8 @@ public void testNPEAfterInvalidReindex2() throws Exception {
/** test reopening backwards from a non-NRT reader (with document deletes) */
public void testNRTMdeletes() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
IndexWriterConfig iwc =
new IndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(NoMergePolicy.INSTANCE);
SnapshotDeletionPolicy snapshotter =
new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy());
iwc.setIndexDeletionPolicy(snapshotter);
Expand Down Expand Up @@ -865,7 +866,8 @@ public void testNRTMdeletes() throws Exception {
/** test reopening backwards from an NRT reader (with document deletes) */
public void testNRTMdeletes2() throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
IndexWriterConfig iwc =
new IndexWriterConfig(new MockAnalyzer(random())).setMergePolicy(NoMergePolicy.INSTANCE);
SnapshotDeletionPolicy snapshotter =
new SnapshotDeletionPolicy(new KeepOnlyLastCommitDeletionPolicy());
iwc.setIndexDeletionPolicy(snapshotter);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
public class TestLogMergePolicy extends BaseMergePolicyTestCase {

@Override
public MergePolicy mergePolicy() {
public LogMergePolicy mergePolicy() {
return newLogMergePolicy(random());
}

Expand Down Expand Up @@ -187,4 +187,31 @@ public void testRejectUnbalancedMerges() throws IOException {
assertEquals(100, segmentInfos.info(0).info.maxDoc());
assertEquals(10, segmentInfos.info(1).info.maxDoc());
}

/** Verifies that mergeFactor tiny flushed segments are merged down to a single segment. */
public void testFullFlushMerges() throws IOException {
AtomicLong names = new AtomicLong();
IOStats ioStats = new IOStats();
MergeContext context = new MockMergeContext(SegmentCommitInfo::getDelCount);
SegmentInfos infos = new SegmentInfos(Version.LATEST.major);

LogMergePolicy policy = mergePolicy();

// Simulate flushing exactly mergeFactor one-document segments of minimal size.
final int segmentCount = policy.getMergeFactor();
for (int i = 0; i < segmentCount; i++) {
String segmentName = "_" + names.getAndIncrement();
infos.add(
makeSegmentCommitInfo(segmentName, 1, 0, Double.MIN_VALUE, IndexWriter.SOURCE_FLUSH));
}

MergeSpecification spec = policy.findFullFlushMerges(MergeTrigger.FULL_FLUSH, infos, context);
assertNotNull(spec);
for (OneMerge merge : spec.merges) {
infos = applyMerge(infos, merge, "_" + names.getAndIncrement(), ioStats);
}
assertEquals(1, infos.size());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,11 @@
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.index.MergePolicy.MergeContext;
import org.apache.lucene.index.MergePolicy.MergeSpecification;
import org.apache.lucene.index.MergePolicy.OneMerge;
import org.apache.lucene.store.Directory;
Expand Down Expand Up @@ -917,4 +919,31 @@ public void testSimulateUpdates() throws IOException {
int numDocs = TEST_NIGHTLY ? atLeast(10_000_000) : atLeast(1_000_000);
doTestSimulateUpdates(mergePolicy, numDocs, 2500);
}

/** Checks that full-flush merges reduce many tiny flushed segments to a small count. */
public void testFullFlushMerges() throws IOException {
AtomicLong segNameGenerator = new AtomicLong();
IOStats stats = new IOStats();
MergeContext mergeContext = new MockMergeContext(SegmentCommitInfo::getDelCount);
SegmentInfos segmentInfos = new SegmentInfos(Version.LATEST.major);

TieredMergePolicy mp = new TieredMergePolicy();

// Simulate flushing 11 one-document segments of minimal size; all are below the
// floor segment size and therefore candidates for full-flush merging.
for (int i = 0; i < 11; ++i) {
segmentInfos.add(
makeSegmentCommitInfo(
"_" + segNameGenerator.getAndIncrement(),
1,
0,
Double.MIN_VALUE,
IndexWriter.SOURCE_FLUSH));
}
MergeSpecification spec =
mp.findFullFlushMerges(MergeTrigger.FULL_FLUSH, segmentInfos, mergeContext);
assertNotNull(spec);
for (OneMerge merge : spec.merges) {
segmentInfos =
applyMerge(segmentInfos, merge, "_" + segNameGenerator.getAndIncrement(), stats);
}
// 11 segments end up as 2 — presumably 11 exceeds the policy's per-merge segment
// limit so one leftover group remains; TODO confirm against TieredMergePolicy defaults.
assertEquals(2, segmentInfos.size());
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -104,8 +104,17 @@ public void run() {

// LUCENE-7570
/** Runs the LUCENE-7570 stalled-merge deadlock scenario with merge-on-flush disabled. */
public void testDeadlockStalledMerges() throws Exception {
doTestDeadlockStalledMerges(false);
}

/** Runs the same deadlock scenario with merge-on-flush enabled (non-zero full-flush wait). */
public void testDeadlockStalledFullFlushMerges() throws Exception {
doTestDeadlockStalledMerges(true);
}

private void doTestDeadlockStalledMerges(boolean mergeOnFlush) throws Exception {
Directory dir = newDirectory();
IndexWriterConfig iwc = new IndexWriterConfig();
IndexWriterConfig iwc =
new IndexWriterConfig().setMaxFullFlushMergeWaitMillis(mergeOnFlush ? 1000 : 0);

// so we merge every 2 segments:
LogMergePolicy mp = new LogDocMergePolicy();
Expand Down Expand Up @@ -163,7 +172,8 @@ protected void mergeSuccess(MergePolicy.OneMerge merge) {
w.addDocument(new Document());
// w writes third segment
w.addDocument(new Document());
w.commit();
IllegalStateException e = expectThrows(IllegalStateException.class, () -> w.commit());
assertTrue(e.getMessage(), e.getMessage().startsWith("this writer hit an unrecoverable error"));
// w writes fourth segment, and commit flushes and kicks off merge that stalls
w.close();
dir.close();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -428,7 +428,10 @@ protected void doTestSimulateAppendOnly(
IndexWriter.SOURCE_FLUSH));

MergeSpecification merges =
mergePolicy.findMerges(MergeTrigger.SEGMENT_FLUSH, segmentInfos, mergeContext);
mergePolicy.findFullFlushMerges(MergeTrigger.SEGMENT_FLUSH, segmentInfos, mergeContext);
if (merges == null) {
merges = mergePolicy.findMerges(MergeTrigger.SEGMENT_FLUSH, segmentInfos, mergeContext);
}
while (merges != null) {
assertTrue(merges.merges.size() > 0);
assertMerge(mergePolicy, merges);
Expand Down Expand Up @@ -490,7 +493,10 @@ protected void doTestSimulateUpdates(MergePolicy mergePolicy, int totalDocs, int
flushSize,
IndexWriter.SOURCE_FLUSH));
MergeSpecification merges =
mergePolicy.findMerges(MergeTrigger.SEGMENT_FLUSH, segmentInfos, mergeContext);
mergePolicy.findFullFlushMerges(MergeTrigger.SEGMENT_FLUSH, segmentInfos, mergeContext);
if (merges == null) {
merges = mergePolicy.findMerges(MergeTrigger.SEGMENT_FLUSH, segmentInfos, mergeContext);
}
while (merges != null) {
assertMerge(mergePolicy, merges);
for (OneMerge oneMerge : merges.merges) {
Expand Down
Loading

0 comments on commit b5795db

Please sign in to comment.