diff --git a/guide/src/replication.md b/guide/src/replication.md index 284d17446..062f8ab31 100644 --- a/guide/src/replication.md +++ b/guide/src/replication.md @@ -33,9 +33,9 @@ R5 2 4 4 If log 5 is committed by R1, and log 3 is not removed, R5 in future could become a new leader and overrides log 5 on R3. -### Caveat: deleting all entries after `prev_log_id` get committed log lost +### Caveat: deleting all entries after `prev_log_id` will get committed log lost -One of the mistake is to delete all entries after `prev_log_id` when a matching `prev_log_id` is found, e.g.: +One of the mistakes is to delete all entries after `prev_log_id` when a matching `prev_log_id` is found, e.g.: ``` fn handle_append_entries(req) { if store.has(req.prev_log_id) { @@ -95,23 +95,39 @@ Similar to append-entry: - (1) If the logs contained in the snapshot matches logs that are stored on a Follower/Learner, nothing is done. -- (2) If the logs conflicts with the local logs, local conflicting logs will be - deleted. And effective membership has to be reverted to some previous - non-conflicting one. +- (2) If the logs conflicts with the local logs, **ALL** non-committed logs will be + deleted, because we do not know which logs conflict. + And effective membership has to be reverted to some previous non-conflicting one. -### Necessity to delete conflicting logs +### Delete conflicting logs -**The `(2)` mentioned above is not necessary to do to achieve correctness. -It is done only for clarity**. - -If the `last_applied`(`snapshot_meta.last_log_id`) conflict with the local log at `last_applied.index`, -It does **NOT** need to delete the conflicting logs. +If `snapshot_meta.last_log_id` conflicts with the local log, Because the node that has conflicting logs won't become a leader: If this node can become a leader, according to raft spec, it has to contain all committed logs. But the log entry at `last_applied.index` is not committed, thus it can never become a leader. -But deleting conflicting logs make the state cleaner. :) -This way method such as `get_initial_state()` does not need to deal with -conditions such as that `last_log_id` can be smaller than `last_applied`. +But, it could become a leader when more logs are received. +At this time, the logs after `snapshot_meta.last_log_id` will all be cleaned. +The logs before or equal `snapshot_meta.last_log_id` will not be cleaned. + +Then there is chance this node becomes leader and uses these log for replication. + +#### Delete all non-committed + +It just truncates **ALL** non-committed logs here, +because `snapshot_meta.last_log_id` is committed, if the local log id conflicts +with `snapshot_meta.last_log_id`, there must be a quorum that contains `snapshot_meta.last_log_id`. +Thus, it is **safe to remove all logs** on this node. + +But removing committed logs leads to some trouble with membership management. +Thus, we just remove logs since `committed+1`. + +#### Not safe to clean conflicting logs after installing snapshot + +It's not safe to remove the conflicting logs that are less than `snap_last_log_id` after installing +snapshot. + +If the node crashes, dirty logs may remain there. These logs may be forwarded to other nodes if this nodes +becomes a leader. diff --git a/openraft/src/engine/engine_impl.rs b/openraft/src/engine/engine_impl.rs index 6811d96f2..52d6c3bfd 100644 --- a/openraft/src/engine/engine_impl.rs +++ b/openraft/src/engine/engine_impl.rs @@ -760,17 +760,31 @@ where } // Do install: - // 1. Truncate conflicting logs. + // 1. Truncate all logs if conflict // Unlike normal append-entries RPC, if conflicting logs are found, it is not **necessary** to delete them. // But cleaning them make the assumption of incremental-log-id always hold, which makes it easier to debug. // See: [Snapshot-replication](https://datafuselabs.github.io/openraft/replication.html#snapshot-replication) + // + // Truncate all: + // + // It just truncate **ALL** logs here, because `snap_last_log_id` is committed, if the local log id conflicts + // with `snap_last_log_id`, there must be a quorum that contains `snap_last_log_id`. + // Thus it is safe to remove all logs on this node. + // + // The logs before `snap_last_log_id` may conflicts with the leader too. + // It's not safe to remove the conflicting logs that are less than `snap_last_log_id` after installing + // snapshot. + // + // If the node crashes, dirty logs may remain there. These logs may be forwarded to other nodes if this nodes + // becomes a leader. + // // 2. Install snapshot. - // 3. purge maybe conflicting logs upto snapshot.last_log_id. let local = self.state.get_log_id(snap_last_log_id.index); if let Some(local) = local { if local != snap_last_log_id { - self.truncate_logs(snap_last_log_id.index); + // Delete non-committed logs. + self.truncate_logs(self.state.committed.next_index()); } } @@ -783,8 +797,8 @@ where // leader to synchronize its snapshot data. self.push_command(Command::InstallSnapshot { snapshot_meta: meta }); - // A local log that is <= snap_last_log_id may conflict with the leader. - // It has to purge all of them to prevent these log form being replicated, when this node becomes leader. + // A local log that is <= snap_last_log_id can not conflict with the leader. + // But there will be a hole in the logs. Thus it's better remove all logs. self.purge_log(snap_last_log_id) } diff --git a/openraft/src/engine/install_snapshot_test.rs b/openraft/src/engine/install_snapshot_test.rs index b6f150474..f74e08f54 100644 --- a/openraft/src/engine/install_snapshot_test.rs +++ b/openraft/src/engine/install_snapshot_test.rs @@ -196,8 +196,29 @@ fn test_install_snapshot_not_conflict() -> anyhow::Result<()> { #[test] fn test_install_snapshot_conflict() -> anyhow::Result<()> { - // Snapshot will be installed and there are no conflicting logs. - let mut eng = eng(); + // Snapshot will be installed, all non-committed log will be deleted. + // And there should be no conflicting logs left. + let mut eng = { + let mut eng = Engine:: { + snapshot_meta: SnapshotMeta { + last_log_id: Some(log_id(2, 2)), + last_membership: EffectiveMembership::new(Some(log_id(1, 1)), m12()), + snapshot_id: "1-2-3-4".to_string(), + }, + ..Default::default() + }; + + eng.state.committed = Some(log_id(2, 3)); + eng.state.log_ids = LogIdList::new(vec![ + // + log_id(2, 2), + log_id(3, 5), + log_id(4, 6), + log_id(4, 8), + ]); + + eng + }; eng.install_snapshot(SnapshotMeta { last_log_id: Some(log_id(5, 6)), @@ -231,7 +252,7 @@ fn test_install_snapshot_conflict() -> anyhow::Result<()> { assert_eq!( vec![ - Command::DeleteConflictLog { since: log_id(4, 6) }, + Command::DeleteConflictLog { since: log_id(2, 4) }, Command::UpdateMembership { membership: Arc::new(EffectiveMembership::new(Some(log_id(1, 1)), m1234())) }, diff --git a/openraft/tests/snapshot/t43_snapshot_delete_conflict_logs.rs b/openraft/tests/snapshot/t43_snapshot_delete_conflict_logs.rs index 9cd67ef8d..16bb6cba4 100644 --- a/openraft/tests/snapshot/t43_snapshot_delete_conflict_logs.rs +++ b/openraft/tests/snapshot/t43_snapshot_delete_conflict_logs.rs @@ -113,7 +113,7 @@ async fn snapshot_delete_conflicting_logs() -> Result<()> { payload: EntryPayload::Membership(Membership::new(vec![btreeset! {4,5}], None)), }, ], - leader_commit: Some(LogId::new(LeaderId::new(0, 0), 0)), + leader_commit: Some(LogId::new(LeaderId::new(1, 0), 2)), }; router.new_client(1, &()).await?.send_append_entries(req).await?; diff --git a/rust-toolchain b/rust-toolchain index 880ae7ccd..044ac90d9 100644 --- a/rust-toolchain +++ b/rust-toolchain @@ -1 +1 @@ -nightly-2022-08-11 +nightly-2022-09-19 diff --git a/sledstore/src/test.rs b/sledstore/src/test.rs index 7943a5dae..88e1e5d8e 100644 --- a/sledstore/src/test.rs +++ b/sledstore/src/test.rs @@ -46,7 +46,7 @@ impl StoreBuilder> for SledBuilder { let test_res = t(store).await; if db_dir.exists() { - std::fs::remove_dir_all(&db_dir).expect("Could not clean up test directory"); + std::fs::remove_dir_all(db_dir).expect("Could not clean up test directory"); } test_res };