Skip to content

Commit

Permalink
[raft] fix flaky tests (#7895)
Browse files Browse the repository at this point in the history
1. AddReplica can fail to promote to voter in some rare cases with
request
rejected. When one attempt of `SyncRequestAddReplica` went through in
raft
system, but raft returned a temp "system is busy" error; the next
attempt will
fail with "request rejected". Therefore, when we receive request
rejected error,
instead of returning error directly, we can check the membership to see
if the
replica has been promoted to voter.

2. getMembership in DownReplicate can fail with shard not found between
getting
the store with range lease and getting the membership, the replica was
removed.
In this case, the test should try again to find another store with range
lease
and see the number of replicas on that store.

3. In DownReplicate, sometimes when we check the pebble to see if the
keys are
deleted, the RemoveData hasn't been finished yet. This can happen
because
when we remove replica, we remove it from range descriptor first, then
from
raft; and remove the data in the end. To make the test less flaky, a for
loop
   is added.
  • Loading branch information
luluz66 authored Nov 14, 2024
1 parent a09f479 commit de4208a
Show file tree
Hide file tree
Showing 2 changed files with 73 additions and 26 deletions.
24 changes: 22 additions & 2 deletions enterprise/server/raft/store/store.go
Original file line number Diff line number Diff line change
Expand Up @@ -2048,7 +2048,7 @@ func (s *Store) addNonVoting(ctx context.Context, rangeID uint64, newReplicaID u
return s.nodeHost.SyncRequestAddNonVoting(ctx, rangeID, newReplicaID, node.GetNhid(), configChangeID)
})
if err != nil {
return status.InternalErrorf("nodeHost.SyncRequestAddNonVoting failed: %s", err)
return status.InternalErrorf("nodeHost.SyncRequestAddNonVoting failed (configChangeID=%d): %s", configChangeID, err)
}
return nil
}
Expand Down Expand Up @@ -2089,11 +2089,31 @@ func (s *Store) promoteToVoter(ctx context.Context, rd *rfpb.RangeDescriptor, ne
if err != nil {
return status.InternalErrorf("failed to get config change ID: %s", err)
}
s.log.Infof("promoter to voter, ccid=%d", configChangeID)
err = client.RunNodehostFn(ctx, func(ctx context.Context) error {
log.Infof("sync request add replica")
return s.nodeHost.SyncRequestAddReplica(ctx, rd.GetRangeId(), newReplicaID, node.GetNhid(), configChangeID)
})
if err != nil {
return status.InternalErrorf("nodeHost.SyncRequestAddReplica failed: %s", err)
if err == dragonboat.ErrRejected {
// SyncRequestAddReplica can be retried by RunNodehostFn. In some
// rare cases, when it returns a temp dragonboat error such as system
// busy, the config change can actually be applied in the background.
// However, the next attempt of SyncRequestAddReplica can fail
// because the config change ID is equal to the last applied change
// id.
membership, membershipErr := s.getMembership(ctx, rd.GetRangeId())
if membershipErr != nil {
return status.InternalErrorf("nodeHost.SyncRequestAddReplica failed (configChangeID=%d): %s, and getMembership failed: %s", configChangeID, err, membershipErr)
}
for replicaID := range membership.Nodes {
if replicaID == newReplicaID {
// the replica has been promoted to the voter
return nil
}
}
}
return status.InternalErrorf("nodeHost.SyncRequestAddReplica failed (configChangeID=%d): %s", configChangeID, err)
}
return nil
}
Expand Down
75 changes: 51 additions & 24 deletions enterprise/server/raft/store/store_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package store_test
import (
"bytes"
"context"
"errors"
"slices"
"testing"
"time"
Expand Down Expand Up @@ -45,7 +46,13 @@ func getMembership(t *testing.T, ts *testutil.TestingStore, ctx context.Context,
}
return nil
})
require.NoError(t, err)

if err != nil {
if errors.Is(err, dragonboat.ErrShardNotFound) {
return []*rfpb.ReplicaDescriptor{}
}
}
require.NoError(t, err, "failed to get membership for range %d on store %q", rangeID, ts.NHID())

replicas := make([]*rfpb.ReplicaDescriptor, 0, len(membership.Nodes))
for replicaID := range membership.Nodes {
Expand Down Expand Up @@ -1008,6 +1015,14 @@ func TestDownReplicate(t *testing.T) {
if len(replicas) > 3 {
continue
}

if len(replicas) == 0 {
// It's possible that between the function call GetStoreWithRangeLease
// and getMembership, the replica is removed from the store s and the
// range lease is deleted. In this case, we want to continue the
// for loop.
continue
}
rd = s.GetRange(2)
if len(rd.GetReplicas()) == 3 {
for _, r := range rd.GetReplicas() {
Expand All @@ -1027,31 +1042,43 @@ func TestDownReplicate(t *testing.T) {
}
require.NotNil(t, removed)
db = removed.DB()
db.Flush()
iter2, err := db.NewIter(&pebble.IterOptions{
LowerBound: rd.GetStart(),
UpperBound: rd.GetEnd(),
})
require.NoError(t, err)
defer iter2.Close()
keysSeen = 0
for iter2.First(); iter2.Valid(); iter2.Next() {
keysSeen++
}
require.Zero(t, keysSeen)

localStart, localEnd := keys.Range(replica.LocalKeyPrefix(2, removedReplicaID))
iter3, err := db.NewIter(&pebble.IterOptions{
LowerBound: localStart,
UpperBound: localEnd,
})
require.NoError(t, err)
defer iter3.Close()
keysSeen = 0
for iter3.First(); iter3.Valid(); iter3.Next() {
keysSeen++
for i := 0; ; i++ {
db.Flush()
iter2, err := db.NewIter(&pebble.IterOptions{
LowerBound: rd.GetStart(),
UpperBound: rd.GetEnd(),
})
require.NoError(t, err)
keysSeen = 0
for iter2.First(); iter2.Valid(); iter2.Next() {
keysSeen++
}
iter2.Close()
if keysSeen > 0 {
continue
}

localStart, localEnd := keys.Range(replica.LocalKeyPrefix(2, removedReplicaID))
iter3, err := db.NewIter(&pebble.IterOptions{
LowerBound: localStart,
UpperBound: localEnd,
})
require.NoError(t, err)
localKeysSeen := 0
for iter3.First(); iter3.Valid(); iter3.Next() {
localKeysSeen++
}
iter3.Close()
if localKeysSeen == 0 {
break
}
if i >= 5 {
require.Zero(t, keysSeen, 0, "range is expected to be empty but have %d keys", keysSeen)
require.Zero(t, localKeysSeen, 0, "local range is expected to be empty but have %d keys", localKeysSeen)
}
time.Sleep(100 * time.Millisecond)
}
require.Zero(t, keysSeen, "local range is expected to be empty but have %d keys", keysSeen)
}

func TestReplaceDeadReplica(t *testing.T) {
Expand Down

0 comments on commit de4208a

Please sign in to comment.