Create peer-recovery retention leases
This creates a peer-recovery retention lease for every shard during recovery,
ensuring that the replication group retains history for future peer recoveries.
It also ensures that leases for active shard copies do not expire, and that
leases for inactive shard copies expire immediately if the shard is fully allocated.

Relates elastic#41536
DaveCTurner committed Jun 13, 2019
1 parent 4757870 commit 01ad532
Showing 21 changed files with 588 additions and 144 deletions.
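
The heart of the change is the expiry rule the primary applies when it recalculates retention leases (see the ReplicationTracker hunk below): a peer-recovery lease held by a current shard copy never expires, a peer-recovery lease for a copy that no longer exists expires immediately once the shard is fully allocated, and any other lease falls back to the ordinary timeout. Here is a minimal standalone sketch of that rule; Lease, LeaseExpiryRule, and isExpired are hypothetical names introduced for illustration, not the actual Elasticsearch types.

import java.util.Set;

// Simplified stand-in for org.elasticsearch.index.seqno.RetentionLease; illustrative only.
final class Lease {
    final String id;
    final String source;
    final long timestampMillis;

    Lease(String id, String source, long timestampMillis) {
        this.id = id;
        this.source = source;
        this.timestampMillis = timestampMillis;
    }
}

final class LeaseExpiryRule {
    static final String PEER_RECOVERY_RETENTION_LEASE_SOURCE = "peer recovery";

    // Mirrors the classifier added to ReplicationTracker.getRetentionLeases below;
    // returns true if the lease should be treated as expired.
    static boolean isExpired(Lease lease, Set<String> leaseIdsForCurrentPeers,
                             boolean allShardsStarted, long currentTimeMillis, long retentionLeaseMillis) {
        if (PEER_RECOVERY_RETENTION_LEASE_SOURCE.equals(lease.source)) {
            if (leaseIdsForCurrentPeers.contains(lease.id)) {
                return false; // held by a current shard copy: never expires
            }
            if (allShardsStarted) {
                return true; // the copy no longer exists and the shard is fully allocated: expire now
            }
        }
        // all other leases expire by the ordinary timeout
        return currentTimeMillis - lease.timestampMillis > retentionLeaseMillis;
    }
}

In the diff this classifier appears inline as the Collectors.groupingBy predicate, partitioning the leases into retained (false) and expired (true).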
@@ -24,6 +24,7 @@
import org.elasticsearch.Version;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.support.replication.ReplicationResponse;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.routing.AllocationId;
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
import org.elasticsearch.cluster.routing.ShardRouting;
@@ -217,10 +218,22 @@ public synchronized Tuple<Boolean, RetentionLeases> getRetentionLeases(final boo
// the primary calculates the non-expired retention leases and syncs them to replicas
final long currentTimeMillis = currentTimeMillisSupplier.getAsLong();
final long retentionLeaseMillis = indexSettings.getRetentionLeaseMillis();
final Set<String> leaseIdsForCurrentPeers
= routingTable.assignedShards().stream().map(ReplicationTracker::getPeerRecoveryRetentionLeaseId).collect(Collectors.toSet());
final Map<Boolean, List<RetentionLease>> partitionByExpiration = retentionLeases
.leases()
.stream()
.collect(Collectors.groupingBy(lease -> currentTimeMillis - lease.timestamp() > retentionLeaseMillis));
.collect(Collectors.groupingBy(lease -> {
if (lease.source().equals(PEER_RECOVERY_RETENTION_LEASE_SOURCE)) {
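// a peer-recovery lease held by a current shard copy never expires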
if (leaseIdsForCurrentPeers.contains(lease.id())) {
return false;
}
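// a peer-recovery lease for a departed copy expires immediately once the shard is fully allocated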
if (routingTable.allShardsStarted()) {
return true;
}
}
return currentTimeMillis - lease.timestamp() > retentionLeaseMillis;
}));
final Collection<RetentionLease> expiredLeases = partitionByExpiration.get(true);
if (expiredLeases == null) {
// early out as no retention leases have expired
@@ -242,7 +255,7 @@ public synchronized Tuple<Boolean, RetentionLeases> getRetentionLeases(final boo
* @param source the source of the retention lease
* @param listener the callback when the retention lease is successfully added and synced to replicas
* @return the new retention lease
* @throws IllegalArgumentException if the specified retention lease already exists
* @throws RetentionLeaseAlreadyExistsException if the specified retention lease already exists
*/
public RetentionLease addRetentionLease(
final String id,
@@ -253,30 +266,46 @@ public RetentionLease addRetentionLease(
final RetentionLease retentionLease;
final RetentionLeases currentRetentionLeases;
synchronized (this) {
assert primaryMode;
if (retentionLeases.contains(id)) {
throw new RetentionLeaseAlreadyExistsException(id);
}
retentionLease = new RetentionLease(id, retainingSequenceNumber, currentTimeMillisSupplier.getAsLong(), source);
logger.debug("adding new retention lease [{}] to current retention leases [{}]", retentionLease, retentionLeases);
retentionLeases = new RetentionLeases(
operationPrimaryTerm,
retentionLeases.version() + 1,
Stream.concat(retentionLeases.leases().stream(), Stream.of(retentionLease)).collect(Collectors.toList()));
retentionLease = innerAddRetentionLease(id, retainingSequenceNumber, source);
currentRetentionLeases = retentionLeases;
}
onSyncRetentionLeases.accept(currentRetentionLeases, listener);
return retentionLease;
}

/**
* Adds a new retention lease, but does not synchronise it with the rest of the replication group.
*
* @param id the identifier of the retention lease
* @param retainingSequenceNumber the retaining sequence number
* @param source the source of the retention lease
* @return the new retention lease
* @throws RetentionLeaseAlreadyExistsException if the specified retention lease already exists
*/
private RetentionLease innerAddRetentionLease(String id, long retainingSequenceNumber, String source) {
assert Thread.holdsLock(this);
assert primaryMode : id + "/" + retainingSequenceNumber + "/" + source;
if (retentionLeases.contains(id)) {
throw new RetentionLeaseAlreadyExistsException(id);
}
final RetentionLease retentionLease
= new RetentionLease(id, retainingSequenceNumber, currentTimeMillisSupplier.getAsLong(), source);
logger.debug("adding new retention lease [{}] to current retention leases [{}]", retentionLease, retentionLeases);
retentionLeases = new RetentionLeases(
operationPrimaryTerm,
retentionLeases.version() + 1,
Stream.concat(retentionLeases.leases().stream(), Stream.of(retentionLease)).collect(Collectors.toList()));
return retentionLease;
}

/**
* Renews an existing retention lease.
*
* @param id the identifier of the retention lease
* @param retainingSequenceNumber the retaining sequence number
* @param source the source of the retention lease
* @return the renewed retention lease
* @throws IllegalArgumentException if the specified retention lease does not exist
* @throws RetentionLeaseNotFoundException if the specified retention lease does not exist
*/
public synchronized RetentionLease renewRetentionLease(final String id, final long retainingSequenceNumber, final String source) {
assert primaryMode;
@@ -390,6 +419,45 @@ public boolean assertRetentionLeasesPersisted(final Path path) throws IOExceptio
return true;
}


/**
* Retention leases for peer recovery have source {@link ReplicationTracker#PEER_RECOVERY_RETENTION_LEASE_SOURCE}, a lease ID
* containing the persistent node ID calculated by {@link ReplicationTracker#getPeerRecoveryRetentionLeaseId}, and retain operations
* with sequence numbers strictly greater than the given global checkpoint.
*/
public void addPeerRecoveryRetentionLease(String nodeId, long globalCheckpoint, ActionListener<ReplicationResponse> listener) {
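// retaining globalCheckpoint + 1 keeps all operations with seqno strictly greater than the global checkpoint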
addRetentionLease(getPeerRecoveryRetentionLeaseId(nodeId), globalCheckpoint + 1, PEER_RECOVERY_RETENTION_LEASE_SOURCE, listener);
}

/**
* Source for peer recovery retention leases; see {@link ReplicationTracker#addPeerRecoveryRetentionLease}.
*/
public static final String PEER_RECOVERY_RETENTION_LEASE_SOURCE = "peer recovery";

/**
* Id for a peer recovery retention lease for the given node. See {@link ReplicationTracker#addPeerRecoveryRetentionLease}.
*/
static String getPeerRecoveryRetentionLeaseId(String nodeId) {
return "peer_recovery/" + nodeId;
}

/**
* Id for a peer recovery retention lease for the given {@link ShardRouting}.
* See {@link ReplicationTracker#addPeerRecoveryRetentionLease}.
*/
public static String getPeerRecoveryRetentionLeaseId(ShardRouting shardRouting) {
return getPeerRecoveryRetentionLeaseId(shardRouting.currentNodeId());
}

/**
* Renew the peer-recovery retention lease for the given shard, advancing the retained sequence number to discard operations up to the
* given global checkpoint. See {@link ReplicationTracker#addPeerRecoveryRetentionLease}.
*/
public void renewPeerRecoveryRetentionLease(ShardRouting shardRouting, long globalCheckpoint) {
assert primaryMode;
renewRetentionLease(getPeerRecoveryRetentionLeaseId(shardRouting), globalCheckpoint + 1, PEER_RECOVERY_RETENTION_LEASE_SOURCE);
}

public static class CheckpointState implements Writeable {

/**
@@ -616,6 +684,22 @@ private boolean invariant() {
assert checkpoints.get(aId) != null : "aId [" + aId + "] is pending in sync but isn't tracked";
}

if (primaryMode
&& indexSettings.isSoftDeleteEnabled()
&& indexSettings.getIndexMetaData().getState() == IndexMetaData.State.OPEN
&& indexSettings.getIndexVersionCreated().onOrAfter(Version.V_8_0_0)) {
// all tracked shard copies have a corresponding peer-recovery retention lease
for (final ShardRouting shardRouting : routingTable.assignedShards()) {
assert checkpoints.get(shardRouting.allocationId().getId()).tracked == false
|| retentionLeases.contains(getPeerRecoveryRetentionLeaseId(shardRouting)) :
"no retention lease for tracked shard " + shardRouting + " in " + retentionLeases;
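// a relocation target is tracked under the relocation allocation ID and must hold a lease of its own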
assert shardRouting.relocating() == false
|| checkpoints.get(shardRouting.allocationId().getRelocationId()).tracked == false
|| retentionLeases.contains(getPeerRecoveryRetentionLeaseId(shardRouting.getTargetRelocatingShard()))
: "no retention lease for relocation target " + shardRouting + " in " + retentionLeases;
}
}

return true;
}

@@ -669,6 +753,7 @@ public ReplicationTracker(
this.pendingInSync = new HashSet<>();
this.routingTable = null;
this.replicationGroup = null;
assert Version.V_EMPTY.equals(indexSettings.getIndexVersionCreated()) == false;
assert invariant();
}

@@ -772,6 +857,31 @@ public synchronized void activatePrimaryMode(final long localCheckpoint) {
primaryMode = true;
updateLocalCheckpoint(shardAllocationId, checkpoints.get(shardAllocationId), localCheckpoint);
updateGlobalCheckpointOnPrimary();

if (indexSettings.isSoftDeleteEnabled()) {
final ShardRouting primaryShard = routingTable.primaryShard();
final String leaseId = getPeerRecoveryRetentionLeaseId(primaryShard);
if (retentionLeases.get(leaseId) == null) {
/*
* We might have got here via a rolling upgrade from an older version that doesn't create peer recovery retention
* leases for every shard copy, in which case we do not expect any leases to exist.
*/
if (indexSettings.getIndexVersionCreated().onOrAfter(Version.V_8_0_0)) {
// We are starting up the whole replication group from scratch: if we were not (i.e. this is a replica promotion) then
// this copy must already be in-sync and active and therefore holds a retention lease for itself.
assert routingTable.activeShards().equals(Collections.singletonList(primaryShard)) : routingTable.activeShards();
assert primaryShard.allocationId().getId().equals(shardAllocationId)
: routingTable.activeShards() + " vs " + shardAllocationId;
assert replicationGroup.getReplicationTargets().equals(Collections.singletonList(primaryShard));

// Safe to call innerAddRetentionLease() without a subsequent sync since there are no other members of this replication
// group.
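// The global checkpoint may still be unassigned (negative) at this point, hence the clamp to zero.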
innerAddRetentionLease(leaseId, Math.max(0L, checkpoints.get(shardAllocationId).globalCheckpoint + 1),
PEER_RECOVERY_RETENTION_LEASE_SOURCE);
}
}
}

assert invariant();
}

@@ -211,4 +211,7 @@ public String toString() {
'}';
}

public boolean isNotPeerRecoveryRetentionLease() {
return ReplicationTracker.PEER_RECOVERY_RETENTION_LEASE_SOURCE.equals(source) == false;
}
}
@@ -44,7 +44,7 @@ public interface RetentionLeaseSyncer {
RetentionLeaseSyncer EMPTY = new RetentionLeaseSyncer() {
@Override
public void sync(final ShardId shardId, final RetentionLeases retentionLeases, final ActionListener<ReplicationResponse> listener) {

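// the no-op syncer must still complete the listener so that callers chained on the sync make progress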
listener.onResponse(new ReplicationResponse());
}

@Override
@@ -275,13 +275,20 @@ private static Map<String, RetentionLease> toMap(final Collection<RetentionLease
}

/**
* A utility method to convert a retention lease collection to a map from retention lease ID to retention lease.
A utility method to convert a retention lease collection to a map from retention lease ID to retention lease, excluding
the automatically-added peer-recovery retention leases.
*
* @param retentionLeases the retention lease collection
* @return the map from retention lease ID to retention lease
*/
static Map<String, RetentionLease> toMap(final RetentionLeases retentionLeases) {
return retentionLeases.leases;
public static Map<String, RetentionLease> toMapExcludingPeerRecoveryRetentionLeases(final RetentionLeases retentionLeases) {
return retentionLeases.leases.values().stream()
.filter(l -> ReplicationTracker.PEER_RECOVERY_RETENTION_LEASE_SOURCE.equals(l.source()) == false)
.collect(Collectors.toMap(RetentionLease::id, Function.identity(),
(o1, o2) -> {
throw new AssertionError("unexpectedly merging " + o1 + " and " + o2);
},
LinkedHashMap::new));
}

}
14 changes: 14 additions & 0 deletions server/src/main/java/org/elasticsearch/index/shard/IndexShard.java
@@ -2415,6 +2415,20 @@ public boolean isRelocatedPrimary() {
return replicationTracker.isRelocated();
}

public void addPeerRecoveryRetentionLease(String nodeId, long globalCheckpoint, ActionListener<ReplicationResponse> listener) {
assert assertPrimaryMode();
replicationTracker.addPeerRecoveryRetentionLease(nodeId, globalCheckpoint, listener);
}

/**
* Test-only method to advance the primary's peer-recovery retention lease so that operations up to the global checkpoint can be
* discarded. TODO Remove this when retention leases are advanced by other mechanisms.
*/
public void advancePrimaryPeerRecoveryRetentionLeaseToGlobalCheckpoint() {
assert assertPrimaryMode();
replicationTracker.renewPeerRecoveryRetentionLease(routingEntry(), getGlobalCheckpoint());
}

class ShardEventListener implements Engine.EventListener {
private final CopyOnWriteArrayList<Consumer<ShardFailure>> delegates = new CopyOnWriteArrayList<>();

@@ -32,6 +32,8 @@
import org.elasticsearch.ExceptionsHelper;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.StepListener;
import org.elasticsearch.action.support.replication.ReplicationResponse;
import org.elasticsearch.cluster.metadata.IndexMetaData;
import org.elasticsearch.cluster.routing.IndexShardRoutingTable;
import org.elasticsearch.cluster.routing.ShardRouting;
import org.elasticsearch.common.CheckedSupplier;
@@ -49,6 +51,7 @@
import org.elasticsearch.index.engine.Engine;
import org.elasticsearch.index.engine.RecoveryEngineException;
import org.elasticsearch.index.seqno.LocalCheckpointTracker;
import org.elasticsearch.index.seqno.RetentionLeaseAlreadyExistsException;
import org.elasticsearch.index.seqno.RetentionLeases;
import org.elasticsearch.index.seqno.SequenceNumbers;
import org.elasticsearch.index.shard.IndexShard;
@@ -188,10 +191,28 @@ public void recoverToTarget(ActionListener<RecoveryResponse> listener) {
}
assert startingSeqNo >= 0 : "startingSeqNo must be non negative. got: " + startingSeqNo;

final StepListener<ReplicationResponse> establishRetentionLeaseStep = new StepListener<>();
if (shard.indexSettings().isSoftDeleteEnabled()
&& shard.indexSettings().getIndexMetaData().getState() != IndexMetaData.State.CLOSE) {
runUnderPrimaryPermit(() -> {
try {
// blindly create the lease. TODO integrate this with the recovery process
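// passing startingSeqNo - 1 retains operations from startingSeqNo onwards, since addPeerRecoveryRetentionLease adds one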
shard.addPeerRecoveryRetentionLease(request.targetNode().getId(), startingSeqNo - 1, establishRetentionLeaseStep);
} catch (RetentionLeaseAlreadyExistsException e) {
logger.debug("peer-recovery retention lease already exists", e);
establishRetentionLeaseStep.onResponse(null);
}
}, shardId + " establishing retention lease for [" + request.targetAllocationId() + "]", shard, cancellableThreads, logger);
} else {
establishRetentionLeaseStep.onResponse(null);
}

final StepListener<TimeValue> prepareEngineStep = new StepListener<>();
// For a sequence based recovery, the target can keep its local translog
prepareTargetForTranslog(isSequenceNumberBasedRecovery == false,
shard.estimateNumberOfHistoryOperations("peer-recovery", startingSeqNo), prepareEngineStep);
establishRetentionLeaseStep.whenComplete(r -> {
// For a sequence based recovery, the target can keep its local translog
prepareTargetForTranslog(isSequenceNumberBasedRecovery == false,
shard.estimateNumberOfHistoryOperations("peer-recovery", startingSeqNo), prepareEngineStep);
}, onFailure);
final StepListener<SendSnapshotResult> sendSnapshotStep = new StepListener<>();
prepareEngineStep.whenComplete(prepareEngineTime -> {
/*
[Diff truncated: the remaining changed files are not shown.]
