Skip to content

Commit

Permalink
ovsdb: raft: Support pre-vote mechanism to deal with disruptive server.
Browse files Browse the repository at this point in the history
When a server becomes unstable due to system overloading or intermittent
partitioning, it may miss some heartbeats and then starts election with
a new term, which would disrupt the otherwise healthy cluster formed by
the rest of the healthy nodes.  Such situation may exist for a long time
until the "flapping" server is shutdown or recovered completely, which
can severely impact the availability of the cluster. The pre-vote
mechanism introduced in the raft paper section 9.6 can prevent such
problems. This patch implements the pre-vote mechanism.

Note: during the upgrade, since the old version doesn't recognize the
new optional field in the vote rpc (and the ovsdb_parse_finish validates
that all fields in the jsonrpc are parsed), an error log may be noticed
on old nodes if an upgraded node happens to become candidate first and
vote for itself, and the vote request will be discarded. If this happens
before enough nodes complete the upgrade, the vote from the upgraded
node may not reach the quorum. This results in re-election, and any old
nodes should be able to vote and get elected as leader. So, in unlucky
cases there can be more leader elections happening during the upgrade.

Reviewed-by: Simon Horman <[email protected]>
Signed-off-by: Han Zhou <[email protected]>
Signed-off-by: Ilya Maximets <[email protected]>
  • Loading branch information
hzhou8 authored and igsilya committed Aug 31, 2023
1 parent bb61931 commit 607251a
Show file tree
Hide file tree
Showing 5 changed files with 136 additions and 26 deletions.
6 changes: 6 additions & 0 deletions NEWS
Original file line number Diff line number Diff line change
@@ -1,5 +1,11 @@
Post-v3.2.0
--------------------
- OVSDB:
* Support pre-vote mechanism in RAFT that protects the cluster against
disruptive servers (section 9.6 of the original RAFT paper). Upgrading
from older version is supported but it may trigger more leader elections
during the process, and error logs complaining unrecognized fields may
be observed on old nodes.


v3.2.0 - 17 Aug 2023
Expand Down
22 changes: 20 additions & 2 deletions ovsdb/raft-rpc.c
Original file line number Diff line number Diff line change
Expand Up @@ -283,6 +283,10 @@ raft_vote_request_to_jsonrpc(const struct raft_vote_request *rq,
json_object_put(args, "leadership_transfer",
json_boolean_create(true));
}
if (rq->is_prevote) {
json_object_put(args, "is_prevote",
json_boolean_create(true));
}
}

static void
Expand All @@ -294,6 +298,8 @@ raft_vote_request_from_jsonrpc(struct ovsdb_parser *p,
rq->last_log_term = raft_parse_required_uint64(p, "last_log_term");
rq->leadership_transfer
= raft_parse_optional_boolean(p, "leadership_transfer") == 1;
rq->is_prevote
= raft_parse_optional_boolean(p, "is_prevote") == 1;
}

static void
Expand All @@ -305,6 +311,9 @@ raft_format_vote_request(const struct raft_vote_request *rq, struct ds *s)
if (rq->leadership_transfer) {
ds_put_cstr(s, " leadership_transfer=true");
}
if (rq->is_prevote) {
ds_put_cstr(s, " is_prevote=true");
}
}

/* raft_vote_reply. */
Expand All @@ -326,6 +335,9 @@ raft_vote_reply_to_jsonrpc(const struct raft_vote_reply *rpy,
{
raft_put_uint64(args, "term", rpy->term);
json_object_put_format(args, "vote", UUID_FMT, UUID_ARGS(&rpy->vote));
if (rpy->is_prevote) {
json_object_put(args, "is_prevote", json_boolean_create(true));
}
}

static void
Expand All @@ -334,13 +346,17 @@ raft_vote_reply_from_jsonrpc(struct ovsdb_parser *p,
{
rpy->term = raft_parse_required_uint64(p, "term");
rpy->vote = raft_parse_required_uuid(p, "vote");
rpy->is_prevote = raft_parse_optional_boolean(p, "is_prevote") == 1;
}

static void
raft_format_vote_reply(const struct raft_vote_reply *rpy, struct ds *s)
{
ds_put_format(s, " term=%"PRIu64, rpy->term);
ds_put_format(s, " vote="SID_FMT, SID_ARGS(&rpy->vote));
if (rpy->is_prevote) {
ds_put_cstr(s, " is_prevote=true");
}
}

/* raft_add_server_request */
Expand Down Expand Up @@ -1007,8 +1023,10 @@ raft_rpc_get_vote(const union raft_rpc *rpc)
case RAFT_RPC_BECOME_LEADER:
return NULL;

case RAFT_RPC_VOTE_REPLY:
return &raft_vote_reply_cast(rpc)->vote;
case RAFT_RPC_VOTE_REPLY: {
const struct raft_vote_reply *rpy = raft_vote_reply_cast(rpc);
return rpy->is_prevote ? NULL : &rpy->vote;
}

default:
OVS_NOT_REACHED();
Expand Down
3 changes: 3 additions & 0 deletions ovsdb/raft-rpc.h
Original file line number Diff line number Diff line change
Expand Up @@ -125,12 +125,15 @@ struct raft_vote_request {
uint64_t last_log_index; /* Index of candidate's last log entry. */
uint64_t last_log_term; /* Term of candidate's last log entry. */
bool leadership_transfer; /* True to override minimum election timeout. */
bool is_prevote; /* True: pre-vote; False: real vote (default). */
};

struct raft_vote_reply {
struct raft_rpc_common common;
uint64_t term; /* Current term, for candidate to update itself. */
struct uuid vote; /* Server ID of vote. */
bool is_prevote; /* Copy of the is_prevote from the request,
* primarily for validation. */
};

struct raft_add_server_request {
Expand Down
88 changes: 64 additions & 24 deletions ovsdb/raft.c
Original file line number Diff line number Diff line change
Expand Up @@ -305,6 +305,11 @@ struct raft {

/* Candidates only. Reinitialized at start of election. */
int n_votes; /* Number of votes for me. */
bool prevote_passed; /* Indicates if it passed pre-vote phase.
* Pre-vote mechanism is introduced in raft
* paper section 9.6. We implement it as a
* sub-state of candidate to minimize the
* change and keep backward compatibility. */

/* Followers and candidates only. */
bool candidate_retrying; /* The earlier election timed-out and we are
Expand Down Expand Up @@ -372,7 +377,8 @@ static void raft_become_follower(struct raft *);
static void raft_reset_election_timer(struct raft *);
static void raft_reset_ping_timer(struct raft *);
static void raft_send_heartbeats(struct raft *);
static void raft_start_election(struct raft *, bool leadership_transfer);
static void raft_start_election(struct raft *, bool is_prevote,
bool leadership_transfer);
static bool raft_truncate(struct raft *, uint64_t new_end);
static void raft_get_servers_from_log(struct raft *, enum vlog_level);
static void raft_get_election_timer_from_log(struct raft *);
Expand Down Expand Up @@ -1069,7 +1075,8 @@ raft_open(struct ovsdb_log *log, struct raft **raftp)
/* If there's only one server, start an election right away so that the
* cluster bootstraps quickly. */
if (hmap_count(&raft->servers) == 1) {
raft_start_election(raft, false);
/* No pre-vote needed since we are the only one. */
raft_start_election(raft, false, false);
}
} else {
raft->join_timeout = time_msec() + 1000;
Expand Down Expand Up @@ -1360,7 +1367,7 @@ void
raft_take_leadership(struct raft *raft)
{
if (raft->role != RAFT_LEADER) {
raft_start_election(raft, true);
raft_start_election(raft, false, true);
}
}

Expand Down Expand Up @@ -1766,12 +1773,12 @@ raft_set_term(struct raft *raft, uint64_t term, const struct uuid *vote)
return true;
}

static void
static bool
raft_accept_vote(struct raft *raft, struct raft_server *s,
const struct uuid *vote)
{
if (uuid_equals(&s->vote, vote)) {
return;
return false;
}
if (!uuid_is_zero(&s->vote)) {
static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 1);
Expand All @@ -1785,13 +1792,18 @@ raft_accept_vote(struct raft *raft, struct raft_server *s,
s->vote = *vote;
if (uuid_equals(vote, &raft->sid)
&& ++raft->n_votes > hmap_count(&raft->servers) / 2) {
raft_become_leader(raft);
return true;
}
return false;
}

static void
raft_start_election(struct raft *raft, bool leadership_transfer)
raft_start_election(struct raft *raft, bool is_prevote,
bool leadership_transfer)
{
/* Leadership transfer doesn't use pre-vote. */
ovs_assert(!is_prevote || !leadership_transfer);

if (raft->leaving) {
return;
}
Expand All @@ -1801,34 +1813,41 @@ raft_start_election(struct raft *raft, bool leadership_transfer)
return;
}

if (!raft_set_term(raft, raft->term + 1, &raft->sid)) {
if (!is_prevote && !raft_set_term(raft, raft->term + 1, &raft->sid)) {
return;
}

ovs_assert(raft->role != RAFT_LEADER);

raft->leader_sid = UUID_ZERO;
raft->role = RAFT_CANDIDATE;
/* If there was no leader elected since last election, we know we are
* retrying now. */
raft->candidate_retrying = !raft->had_leader;
raft->had_leader = false;
raft->prevote_passed = !is_prevote;

if (is_prevote || leadership_transfer) {
/* If there was no leader elected since last election, we know we are
* retrying now. */
raft->candidate_retrying = !raft->had_leader;
raft->had_leader = false;

raft->election_start = time_msec();
raft->election_won = 0;
}

raft->n_votes = 0;

raft->election_start = time_msec();
raft->election_won = 0;
raft->leadership_transfer = leadership_transfer;

static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
if (!VLOG_DROP_INFO(&rl)) {
long long int now = time_msec();
char *comment = is_prevote ? "pre-vote" : "vote";
if (now >= raft->election_timeout) {
VLOG_INFO("term %"PRIu64": %lld ms timeout expired, "
"starting election",
raft->term, now - raft->election_base);
"starting election (%s)",
raft->term, now - raft->election_base, comment);
} else {
VLOG_INFO("term %"PRIu64": starting election", raft->term);
VLOG_INFO("term %"PRIu64": starting election (%s)",
raft->term, comment);
}
}
raft_reset_election_timer(raft);
Expand All @@ -1853,6 +1872,7 @@ raft_start_election(struct raft *raft, bool leadership_transfer)
? raft->entries[raft->log_end - raft->log_start - 1].term
: raft->snap.term),
.leadership_transfer = leadership_transfer,
.is_prevote = is_prevote,
},
};
if (failure_test != FT_DONT_SEND_VOTE_REQUEST) {
Expand All @@ -1861,7 +1881,13 @@ raft_start_election(struct raft *raft, bool leadership_transfer)
}

/* Vote for ourselves. */
raft_accept_vote(raft, me, &raft->sid);
if (raft_accept_vote(raft, me, &raft->sid)) {
/* We just started vote, so it shouldn't be accepted yet unless this is
* a one-node cluster. In such case we don't do pre-vote, and become
* leader immediately. */
ovs_assert(!is_prevote);
raft_become_leader(raft);
}
}

static void
Expand Down Expand Up @@ -2041,10 +2067,10 @@ raft_run(struct raft *raft)
raft_reset_election_timer(raft);
} else {
raft_become_follower(raft);
raft_start_election(raft, false);
raft_start_election(raft, true, false);
}
} else {
raft_start_election(raft, false);
raft_start_election(raft, true, false);
}

}
Expand Down Expand Up @@ -3673,6 +3699,10 @@ raft_handle_vote_request__(struct raft *raft,
return false;
}

if (rq->is_prevote) {
return true;
}

/* Record a vote for the peer. */
if (!raft_set_term(raft, raft->term, &rq->common.sid)) {
return false;
Expand All @@ -3685,7 +3715,7 @@ raft_handle_vote_request__(struct raft *raft,

static void
raft_send_vote_reply(struct raft *raft, const struct uuid *dst,
const struct uuid *vote)
const struct uuid *vote, bool is_prevote)
{
union raft_rpc rpy = {
.vote_reply = {
Expand All @@ -3695,6 +3725,7 @@ raft_send_vote_reply(struct raft *raft, const struct uuid *dst,
},
.term = raft->term,
.vote = *vote,
.is_prevote = is_prevote,
},
};
raft_send(raft, &rpy);
Expand All @@ -3705,7 +3736,9 @@ raft_handle_vote_request(struct raft *raft,
const struct raft_vote_request *rq)
{
if (raft_handle_vote_request__(raft, rq)) {
raft_send_vote_reply(raft, &rq->common.sid, &raft->vote);
raft_send_vote_reply(raft, &rq->common.sid,
rq->is_prevote ? &rq->common.sid : &raft->vote,
rq->is_prevote);
}
}

Expand All @@ -3723,7 +3756,14 @@ raft_handle_vote_reply(struct raft *raft,

struct raft_server *s = raft_find_peer(raft, &rpy->common.sid);
if (s) {
raft_accept_vote(raft, s, &rpy->vote);
if (raft_accept_vote(raft, s, &rpy->vote)) {
if (raft->prevote_passed) {
raft_become_leader(raft);
} else {
/* Start the real election. */
raft_start_election(raft, false, false);
}
}
}
}

Expand Down Expand Up @@ -4357,7 +4397,7 @@ raft_handle_become_leader(struct raft *raft,
VLOG_INFO("received leadership transfer from %s in term %"PRIu64,
raft_get_nickname(raft, &rq->common.sid, buf, sizeof buf),
rq->term);
raft_start_election(raft, true);
raft_start_election(raft, false, true);
}
}

Expand Down
43 changes: 43 additions & 0 deletions tests/ovsdb-cluster.at
Original file line number Diff line number Diff line change
Expand Up @@ -715,6 +715,49 @@ done

AT_CLEANUP


AT_SETUP([OVSDB cluster - disruptive server])
AT_KEYWORDS([ovsdb server negative unix cluster disruptive])

n=3
AT_CHECK([ovsdb-tool '-vPATTERN:console:%c|%p|%m' create-cluster \
s1.db $abs_srcdir/idltest.ovsschema unix:s1.raft], [0], [], [stderr])
cid=$(ovsdb-tool db-cid s1.db)
schema_name=$(ovsdb-tool schema-name $abs_srcdir/idltest.ovsschema)
for i in $(seq 2 $n); do
AT_CHECK([ovsdb-tool join-cluster s$i.db $schema_name unix:s$i.raft unix:s1.raft])
done

on_exit 'kill $(cat *.pid)'
for i in $(seq $n); do
AT_CHECK([ovsdb-server -v -vconsole:off -vsyslog:off --detach --no-chdir \
--log-file=s$i.log --pidfile=s$i.pid --unixctl=s$i \
--remote=punix:s$i.ovsdb s$i.db])
done
for i in $(seq $n); do
AT_CHECK([ovsdb_client_wait unix:s$i.ovsdb $schema_name connected])
done

# An unstable follower shouldn't disrupt the healthy cluster - shouldn't
# trigger term change.
AT_CHECK([ovs-appctl -t $(pwd)/s2 cluster/failure-test stop-raft-rpc], [0], [ignore])
OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/s2 cluster/status $schema_name | grep -q "Role: candidate"])
AT_CHECK([ovs-appctl -t $(pwd)/s2 cluster/failure-test clear], [0], [ignore])

# Should step back to follower.
OVS_WAIT_UNTIL([ovs-appctl -t $(pwd)/s2 cluster/status $schema_name | grep -q "Role: follower"])

# No term change.
for i in $(seq $n); do
AT_CHECK([ovs-appctl -t $(pwd)/s$i cluster/status $schema_name | grep -q "Term: 1"])
done

for i in $(seq $n); do
OVS_APP_EXIT_AND_WAIT_BY_TARGET([$(pwd)/s$i], [s$i.pid])
done

AT_CLEANUP


AT_BANNER([OVSDB - cluster tests])

Expand Down

0 comments on commit 607251a

Please sign in to comment.