Skip to content

Commit

Permalink
Fix: avoid creating log-id with unitialized matched.leader_id.
Browse files Browse the repository at this point in the history
When waiting for a newly added learner to become up to date,
it tries to compare last-log-id and the reported `matched` replication
state.
But the `matched` may have not yet receive any update and is
unitialized, in such case, it tries to create a temp LogId with
`leader_id(0, 0)`, which is illegal.

The fix is simple: do not use log-id. Just calculating replication lag by log index.

Add test to reproduce it: openraft/tests/membership/t99_issue_471_adding_learner_uses_uninit_leader_id.rs

- Fix: #471
  • Loading branch information
drmingdrmer committed Jul 26, 2022
1 parent db047b9 commit 59ddc98
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 27 deletions.
2 changes: 1 addition & 1 deletion openraft/src/core/raft_core.rs
Original file line number Diff line number Diff line change
Expand Up @@ -634,7 +634,7 @@ impl<C: RaftTypeConfig, N: RaftNetworkFactory<C>, S: RaftStorage<C>> RaftCore<C,
unreachable!("it has to be a leader!!!");
};

let distance = replication_lag(&matched, &last_log_id);
let distance = replication_lag(&matched.map(|x| x.index), &last_log_id.map(|x| x.index));

if distance <= self.config.replication_lag_threshold {
continue;
Expand Down
33 changes: 9 additions & 24 deletions openraft/src/core/replication_state.rs
Original file line number Diff line number Diff line change
@@ -1,36 +1,21 @@
use crate::raft_types::LogIdOptionExt;
use crate::LogId;
use crate::NodeId;
use crate::raft_types::LogIndexOptionExt;

/// Calculate the distance between the matched log id on a replication target and local last log id
pub(crate) fn replication_lag<NID: NodeId>(matched: &Option<LogId<NID>>, last_log_id: &Option<LogId<NID>>) -> u64 {
last_log_id.next_index().saturating_sub(matched.next_index())
/// Calculate the distance between the matched log index on a replication target and local last log index
pub(crate) fn replication_lag(matched_log_index: &Option<u64>, last_log_index: &Option<u64>) -> u64 {
last_log_index.next_index().saturating_sub(matched_log_index.next_index())
}

#[cfg(test)]
mod test {
use crate::core::replication_state::replication_lag;
use crate::LeaderId;
use crate::LogId;

#[test]
fn test_replication_lag() -> anyhow::Result<()> {
let log_id = |term, node_id, index| LogId::<u64>::new(LeaderId::new(term, node_id), index);

assert_eq!(0, replication_lag::<u64>(&None, &None));
assert_eq!(4, replication_lag::<u64>(&None, &Some(log_id(1, 2, 3))));
assert_eq!(
1,
replication_lag::<u64>(&Some(log_id(1, 2, 2)), &Some(log_id(1, 2, 3)))
);
assert_eq!(
0,
replication_lag::<u64>(&Some(log_id(1, 2, 3)), &Some(log_id(1, 2, 3)))
);
assert_eq!(
0,
replication_lag::<u64>(&Some(log_id(1, 2, 4)), &Some(log_id(1, 2, 3)))
);
assert_eq!(0, replication_lag(&None, &None));
assert_eq!(4, replication_lag(&None, &Some(3)));
assert_eq!(1, replication_lag(&Some(2), &Some(3)));
assert_eq!(0, replication_lag(&Some(3), &Some(3)));
assert_eq!(0, replication_lag(&Some(4), &Some(3)));
Ok(())
}
}
3 changes: 1 addition & 2 deletions openraft/src/raft.rs
Original file line number Diff line number Diff line change
Expand Up @@ -436,8 +436,7 @@ impl<C: RaftTypeConfig, N: RaftNetworkFactory<C>, S: RaftStorage<C>> Raft<C, N,

let matched = target_metrics.matched();

let last_log_id = LogId::new(matched.leader_id, metrics.last_log_index.unwrap_or_default());
let distance = replication_lag(&Some(matched), &Some(last_log_id));
let distance = replication_lag(&Some(matched.index), &metrics.last_log_index);

if distance <= self.inner.config.replication_lag_threshold {
// replication became up to date.
Expand Down
1 change: 1 addition & 0 deletions openraft/tests/membership/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,4 +19,5 @@ mod t30_commit_joint_config;
mod t30_step_down;
mod t40_removed_follower;
mod t45_remove_unreachable_follower;
mod t99_issue_471_adding_learner_uses_uninit_leader_id;
mod t99_new_leader_auto_commit_uniform_config;
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
use std::sync::Arc;

use anyhow::Result;
use maplit::btreeset;
use openraft::Config;

use crate::fixtures::init_default_ut_tracing;
use crate::fixtures::RaftRouter;

/// When adding learner and waiting for the learner to become up to date,
/// it should not try to use `matched.leader_id` which may be uninitialized, i.e., `(0,0)`.
/// https://github.com/datafuselabs/openraft/issues/471
///
/// - Brings up 1 leader.
/// - Add learner at once.
/// - It should not panic.
#[async_entry::test(worker_threads = 8, init = "init_default_ut_tracing()", tracing_span = "debug")]
async fn adding_learner_do_not_use_matched_leader_id() -> Result<()> {
let config = Arc::new(
Config {
// Replicate log one by one, to trigger a state report with matched=(0,0,0), which is the first log id.
max_payload_entries: 1,
..Default::default()
}
.validate()?,
);
let mut router = RaftRouter::new(config.clone());

router.new_nodes_from_single(btreeset! {0}, btreeset! {}).await?;

tracing::info!("--- feed 2 log to make replication busy");
{
router.client_request_many(0, "foo", 2).await?;
}

// Delay replication.
router.network_send_delay(100);

tracing::info!("--- add learner: node-1");
{
router.new_raft_node(1);
router.add_learner(0, 1).await?;
}

Ok(())
}

0 comments on commit 59ddc98

Please sign in to comment.