From a31bee54707900cdbb5b7f098c18ed780dcb1fdd Mon Sep 17 00:00:00 2001 From: mwtian <81660174+mwtian@users.noreply.github.com> Date: Tue, 10 Sep 2024 09:09:22 -0700 Subject: [PATCH] [Consensus] count missing ancestors and blocks per authority (#19293) ## Description Count missing ancestors and new missing blocks per authority. This may help identify poorly performing validators faster. ## Test plan CI PT --- ## Release notes Check each box that your changes affect. If none of the boxes relate to your changes, release notes aren't required. For each box you select, include information after the relevant heading that describes the impact of your changes that a user might notice and any actions they must take to implement updates. - [ ] Protocol: - [ ] Nodes (Validators and Full nodes): - [ ] Indexer: - [ ] JSON-RPC: - [ ] GraphQL: - [ ] CLI: - [ ] Rust SDK: - [ ] REST API: --- consensus/core/src/block_manager.rs | 17 ++++++++++++++++- consensus/core/src/metrics.rs | 18 ++++++++++++++++-- 2 files changed, 32 insertions(+), 3 deletions(-) diff --git a/consensus/core/src/block_manager.rs b/consensus/core/src/block_manager.rs index d381f3c9da121..ba4a6e740e063 100644 --- a/consensus/core/src/block_manager.rs +++ b/consensus/core/src/block_manager.rs @@ -229,11 +229,26 @@ impl BlockManager { .or_default() .insert(block_ref); + let ancestor_hostname = &self.context.committee.authority(ancestor.author).hostname; + self.context + .metrics + .node_metrics + .block_manager_missing_ancestors_by_authority + .with_label_values(&[ancestor_hostname]) + .inc(); + // Add the ancestor to the missing blocks set only if it doesn't already exist in the suspended blocks - meaning // that we already have its payload. 
if !self.suspended_blocks.contains_key(ancestor) { - self.missing_blocks.insert(*ancestor); ancestors_to_fetch.insert(*ancestor); + if self.missing_blocks.insert(*ancestor) { + self.context + .metrics + .node_metrics + .block_manager_missing_blocks_by_authority + .with_label_values(&[ancestor_hostname]) + .inc(); + } } } } diff --git a/consensus/core/src/metrics.rs b/consensus/core/src/metrics.rs index 9cef48260924c..eba760582cec3 100644 --- a/consensus/core/src/metrics.rs +++ b/consensus/core/src/metrics.rs @@ -152,6 +152,8 @@ pub(crate) struct NodeMetrics { pub(crate) block_manager_suspended_blocks: IntGauge, pub(crate) block_manager_missing_ancestors: IntGauge, pub(crate) block_manager_missing_blocks: IntGauge, + pub(crate) block_manager_missing_blocks_by_authority: IntCounterVec, + pub(crate) block_manager_missing_ancestors_by_authority: IntCounterVec, pub(crate) threshold_clock_round: IntGauge, pub(crate) subscriber_connection_attempts: IntCounterVec, pub(crate) subscriber_connections: IntGaugeVec, @@ -272,8 +274,8 @@ impl NodeMetrics { ).unwrap(), highest_accepted_authority_round: register_int_gauge_vec_with_registry!( "highest_accepted_authority_round", - "The highest round where a block has been accepted by author. Resets on restart.", - &["author"], + "The highest round where a block has been accepted per authority. 
Resets on restart.", + &["authority"], registry, ).unwrap(), highest_accepted_round: register_int_gauge_with_registry!( @@ -488,6 +490,18 @@ impl NodeMetrics { "The number of blocks missing content tracked in the block manager", registry, ).unwrap(), + block_manager_missing_blocks_by_authority: register_int_counter_vec_with_registry!( + "block_manager_missing_blocks_by_authority", + "The number of new missing blocks by block authority", + &["authority"], + registry, + ).unwrap(), + block_manager_missing_ancestors_by_authority: register_int_counter_vec_with_registry!( + "block_manager_missing_ancestors_by_authority", + "The number of missing ancestors by ancestor authority across received blocks", + &["authority"], + registry, + ).unwrap(), threshold_clock_round: register_int_gauge_with_registry!( "threshold_clock_round", "The current threshold clock round. We only advance to a new round when a quorum of parents have been synced.",