From 8da5acf6b3140e244b96a4ad15ce271492ba4449 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lo=C3=AFc=20Hoguin?= Date: Thu, 23 Mar 2023 14:39:56 +0100 Subject: [PATCH] CMQ: Propagate requeue on promotion to master When a node goes down a slave gets promoted to master. When this happens the new master requeues all messages pending acks. If x-max-length is defined and the queue length after requeue goes over the limit, the new master will start dropping messages immediately. This causes issues for other slaves because they do not requeue their messages automatically; instead they wait for the new master to tell them what to do. This eventually triggers an assert because the queue lengths are unexpectedly out of sync when the first drop message is propagated to the cluster. This issue must have been present for a very long time, probably since e35260836fc68c2ada9ea2ee8428e0e256d67b0e. The fix is to make the new master propagate the requeues when it gets promoted. To reproduce, a cluster must be started, ha-mode: all set via policies, and perf-test started with the following arguments: perf-test -x 1 -y 1 -r 10000 -R 50 -c 500 -s 1000 -u v2 \ -qa x-queue-version=2,x-max-length=10000 -ad false -f persistent Wait a little bit for the queue to have 10000+ ready messages (not total, total will be more) and then kill the master node (usually the first pid that 'ps -aux | grep beam' gives you). The crashes will be logged in the slave node that was not promoted (node 2 in my case). --- deps/rabbit/src/rabbit_mirror_queue_master.erl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/deps/rabbit/src/rabbit_mirror_queue_master.erl b/deps/rabbit/src/rabbit_mirror_queue_master.erl index 98f5d0cef840..e6e0efc91d3e 100644 --- a/deps/rabbit/src/rabbit_mirror_queue_master.erl +++ b/deps/rabbit/src/rabbit_mirror_queue_master.erl @@ -513,7 +513,8 @@ zip_msgs_and_acks(Msgs, AckTags, Accumulator, master_state(). 
promote_backing_queue_state(QName, CPid, BQ, BQS, GM, AckTags, Seen, KS) -> - {_MsgIds, BQS1} = BQ:requeue(AckTags, BQS), + {MsgIds, BQS1} = BQ:requeue(AckTags, BQS), + ok = gm:broadcast(GM, {requeue, MsgIds}), Len = BQ:len(BQS1), Depth = BQ:depth(BQS1), true = Len == Depth, %% ASSERTION: everything must have been requeued