From 11591818eb06114faf8b446245b453109a55e248 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Adam=20Chuda=C5=9B?= <18039094+staffik@users.noreply.github.com>
Date: Mon, 24 Jun 2024 16:57:41 +0200
Subject: [PATCH] Validator key hot swap (#11536)

Issue: https://github.com/near/nearcore/issues/11264

This PR builds on:
* https://github.com/near/nearcore/pull/11372
* https://github.com/near/nearcore/pull/11400

and contains the actual changes and a test for validator key hot swap.

### Summary
- Extend `UpdateableConfig` with the validator key.
- Update the client's mutable validator key when we detect that it changed in the updateable config.
- Advertise the new validator key through `advertise_tier1_proxies()`.
- Add an integration test for the new behaviour:
  - We start with 2 validating nodes (`node0`, `node1`) and 1 non-validating node (`node2`). It is important that the non-validating node tracks all shards, because we do not know which shard it will track when we switch validator keys.
  - We copy the validator key from `node0` to `node2`.
  - We stop `node0`, then trigger a validator key reload for `node2`.
  - Now `node2` is a validator, but it figures as `node0`, because it copied the validator key from `node0`.
  - We wait for a couple of epochs and require that both remaining nodes progress the chain. Both nodes should be synchronised after a few epochs.

Test with:
```
cargo build -pneard --features test_features,rosetta_rpc &&
cargo build -pgenesis-populate -prestaked -pnear-test-contracts &&
python3 pytest/tests/sanity/validator_switch_key_quick.py
```

#### Extra changes:
- Use the `MutableValidatorSigner` alias instead of `MutableConfigValue<Option<Arc<ValidatorSigner>>>`.
- Return `ConfigUpdaterResult` from the config updater.
- Remove (de)serialization derives for `UpdateableConfigs`.

---------

Co-authored-by: Your Name
---
 chain/chain/src/chain.rs | 7 +-
 chain/chunks/src/shards_manager_actor.rs | 13 ++-
 chain/client/src/client.rs | 26 +++--
 chain/client/src/client_actor.rs | 18 +++-
 chain/client/src/config_updater.rs | 25 ++++-
 .../partial_witness/partial_witness_actor.rs | 6 +-
 chain/client/src/view_client_actor.rs | 6 +-
 chain/network/src/config.rs | 5 +-
 .../src/peer_manager/peer_manager_actor.rs | 8 ++
 chain/network/src/types.rs | 5 +
 core/chain-configs/src/lib.rs | 2 +-
 core/chain-configs/src/updateable_config.rs | 3 +
 core/dyn-configs/src/lib.rs | 8 +-
 nearcore/src/config.rs | 17 ++--
 nearcore/src/dyn_config.rs | 37 ++++++-
 nearcore/src/state_sync.rs | 7 +-
 nightly/pytest-sanity.txt | 2 +
 pytest/lib/cluster.py | 5 +
 .../sanity/validator_switch_key_quick.py | 98 +++++++++++++++++++
 19 files changed, 250 insertions(+), 48 deletions(-)
 create mode 100755 pytest/tests/sanity/validator_switch_key_quick.py
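For reviewers who want the core idea in one place: the whole mechanism rests on `MutableConfigValue` reporting whether an update actually changed the stored value. Below is a minimal sketch of such a cell; this is a simplified stand-in, not nearcore's implementation, and the `ValidatorSigner` stub here is hypothetical:

```rust
use std::sync::{Arc, Mutex};

// Hypothetical stub; the real ValidatorSigner lives in near_primitives.
#[derive(Clone, PartialEq)]
pub struct ValidatorSigner {
    pub account_id: String,
}

/// Sketch of a `MutableConfigValue`-style cell: a lock-guarded value that can
/// be swapped at runtime. `update` reports whether the value actually changed,
/// which is what lets callers decide if follow-up work (like re-advertising
/// TIER1 proxies) is needed.
pub struct MutableConfigValue<T: Clone + PartialEq> {
    value: Mutex<T>,
    // The real type uses this as a metrics label; unused in this sketch.
    #[allow(dead_code)]
    field_name: &'static str,
}

impl<T: Clone + PartialEq> MutableConfigValue<T> {
    pub fn new(value: T, field_name: &'static str) -> Self {
        Self { value: Mutex::new(value), field_name }
    }

    /// Clone the current value once and use that clone for the whole request.
    pub fn get(&self) -> T {
        self.value.lock().unwrap().clone()
    }

    /// Replace the value, reporting whether anything actually changed.
    pub fn update(&self, new_value: T) -> bool {
        let mut guard = self.value.lock().unwrap();
        let changed = *guard != new_value;
        if changed {
            *guard = new_value;
        }
        changed
    }
}

/// The alias this PR introduces, modulo the simplified types above.
pub type MutableValidatorSigner = MutableConfigValue<Option<Arc<ValidatorSigner>>>;
```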
diff --git a/chain/chain/src/chain.rs b/chain/chain/src/chain.rs
index 469d68bc3d6..43b0c2e2742 100644
--- a/chain/chain/src/chain.rs
+++ b/chain/chain/src/chain.rs
@@ -39,7 +39,9 @@
 use itertools::Itertools;
 use lru::LruCache;
 use near_async::futures::{AsyncComputationSpawner, AsyncComputationSpawnerExt};
 use near_async::time::{Clock, Duration, Instant};
-use near_chain_configs::{MutableConfigValue, ReshardingConfig, ReshardingHandle};
+use near_chain_configs::{
+    MutableConfigValue, MutableValidatorSigner, ReshardingConfig, ReshardingHandle,
+};
 #[cfg(feature = "new_epoch_sync")]
 use near_chain_primitives::error::epoch_sync::EpochSyncInfoError;
 use near_chain_primitives::error::{BlockKnownError, Error, LogTransientStorageError};
@@ -87,7 +89,6 @@ use near_primitives::unwrap_or_return;
 #[cfg(feature = "new_epoch_sync")]
 use near_primitives::utils::index_to_bytes;
 use near_primitives::utils::MaybeValidated;
-use near_primitives::validator_signer::ValidatorSigner;
 use near_primitives::version::{ProtocolFeature, ProtocolVersion, PROTOCOL_VERSION};
 use near_primitives::views::{
     BlockStatusView, DroppedReason, ExecutionOutcomeWithIdView, ExecutionStatusView,
@@ -404,7 +405,7 @@ impl Chain {
         chain_config: ChainConfig,
         snapshot_callbacks: Option<SnapshotCallbacks>,
         apply_chunks_spawner: Arc<dyn AsyncComputationSpawner>,
-        validator: MutableConfigValue<Option<Arc<ValidatorSigner>>>,
+        validator: MutableValidatorSigner,
     ) -> Result<Chain, Error> {
         // Get runtime initial state and create genesis block out of it.
         let state_roots = get_genesis_state_roots(runtime_adapter.store())?
diff --git a/chain/chunks/src/shards_manager_actor.rs b/chain/chunks/src/shards_manager_actor.rs
index d0953df1160..60344330335 100644
--- a/chain/chunks/src/shards_manager_actor.rs
+++ b/chain/chunks/src/shards_manager_actor.rs
@@ -98,7 +98,7 @@ use near_chain::byzantine_assert;
 use near_chain::chunks_store::ReadOnlyChunksStore;
 use near_chain::near_chain_primitives::error::Error::DBNotFoundErr;
 use near_chain::types::EpochManagerAdapter;
-use near_chain_configs::MutableConfigValue;
+use near_chain_configs::MutableValidatorSigner;
 pub use near_chunks_primitives::Error;
 use near_epoch_manager::shard_tracker::ShardTracker;
 use near_network::shards_manager::ShardsManagerRequestFromNetwork;
@@ -246,7 +246,7 @@ pub struct ShardsManagerActor {
     /// Contains validator info about this node. This field is mutable and optional. Use with caution!
     /// Lock the value of mutable validator signer for the duration of a request to ensure consistency.
     /// Please note that the locked value should not be stored anywhere or passed through the thread boundary.
-    validator_signer: MutableConfigValue<Option<Arc<ValidatorSigner>>>,
+    validator_signer: MutableValidatorSigner,
     store: ReadOnlyChunksStore,

     epoch_manager: Arc<dyn EpochManagerAdapter>,
@@ -296,7 +296,7 @@ pub fn start_shards_manager(
     shard_tracker: ShardTracker,
     network_adapter: Sender<PeerManagerMessageRequest>,
     client_adapter_for_shards_manager: Sender<ShardsManagerResponse>,
-    validator_signer: MutableConfigValue<Option<Arc<ValidatorSigner>>>,
+    validator_signer: MutableValidatorSigner,
     store: Store,
     chunk_request_retry_period: Duration,
 ) -> (actix::Addr<ActixWrapper<ShardsManagerActor>>, actix::ArbiterHandle) {
@@ -334,7 +334,7 @@ impl ShardsManagerActor {
     pub fn new(
         clock: time::Clock,
-        validator_signer: MutableConfigValue<Option<Arc<ValidatorSigner>>>,
+        validator_signer: MutableValidatorSigner,
         epoch_manager: Arc<dyn EpochManagerAdapter>,
         shard_tracker: ShardTracker,
         network_adapter: Sender<PeerManagerMessageRequest>,
@@ -2242,6 +2242,7 @@ mod test {
     use assert_matches::assert_matches;
     use near_async::messaging::IntoSender;
     use near_async::time::FakeClock;
+    use near_chain_configs::MutableConfigValue;
     use near_epoch_manager::shard_tracker::TrackedConfig;
     use near_epoch_manager::test_utils::setup_epoch_manager_with_block_and_chunk_producers;
     use near_network::test_utils::MockPeerManagerAdapter;
@@ -2257,9 +2258,7 @@ mod test {
     use crate::logic::persist_chunk;
     use crate::test_utils::*;

-    fn mutable_validator_signer(
-        account_id: &AccountId,
-    ) -> MutableConfigValue<Option<Arc<ValidatorSigner>>> {
+    fn mutable_validator_signer(account_id: &AccountId) -> MutableValidatorSigner {
         MutableConfigValue::new(
             Some(Arc::new(EmptyValidatorSigner::new(account_id.clone()))),
             "validator_signer",
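The doc comment repeated on each of these fields ("lock the value of mutable validator signer for the duration of a request") boils down to a simple usage pattern. A hedged sketch, reusing the stand-in types from the sketch above; the handler itself is hypothetical:

```rust
fn handle_request(signer_cell: &MutableValidatorSigner) {
    // Take one clone of the Arc and use it for the entire request, so the
    // request sees a single consistent key even if a hot swap lands mid-way.
    // Per the doc comments in the diff: do not stash the clone anywhere
    // long-lived and do not pass it across thread boundaries.
    let Some(signer) = signer_cell.get() else {
        return; // No validator key at the moment: act as a non-validator.
    };
    // ... validate, sign and respond using the same `signer` throughout ...
    let _account_id = &signer.account_id;
}
```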
diff --git a/chain/client/src/client.rs b/chain/client/src/client.rs
index 4d8bea7bd32..1c4d42c2636 100644
--- a/chain/client/src/client.rs
+++ b/chain/client/src/client.rs
@@ -40,7 +40,7 @@ use near_chain::{
     DoomslugThresholdMode, Provenance,
 };
 use near_chain_configs::{
-    ClientConfig, LogSummaryStyle, MutableConfigValue, UpdateableClientConfig,
+    ClientConfig, LogSummaryStyle, MutableValidatorSigner, UpdateableClientConfig,
 };
 use near_chunks::adapter::ShardsManagerRequestFromClient;
 use near_chunks::client::ShardedTransactionPool;
@@ -150,7 +150,7 @@ pub struct Client {
     /// Signer for block producer (if present). This field is mutable and optional. Use with caution!
     /// Lock the value of mutable validator signer for the duration of a request to ensure consistency.
     /// Please note that the locked value should not be stored anywhere or passed through the thread boundary.
-    pub validator_signer: MutableConfigValue<Option<Arc<ValidatorSigner>>>,
+    pub validator_signer: MutableValidatorSigner,
     /// Approvals for which we do not have the block yet
     pub pending_approvals:
         lru::LruCache<ApprovalInner, HashMap<AccountId, (Approval, ApprovalType)>>,
@@ -202,12 +202,24 @@ pub struct Client {
 }

 impl Client {
-    pub(crate) fn update_client_config(&self, update_client_config: UpdateableClientConfig) {
-        self.config.expected_shutdown.update(update_client_config.expected_shutdown);
-        self.config.resharding_config.update(update_client_config.resharding_config);
-        self.config
+    pub(crate) fn update_client_config(
+        &self,
+        update_client_config: UpdateableClientConfig,
+    ) -> bool {
+        let mut is_updated = false;
+        is_updated |= self.config.expected_shutdown.update(update_client_config.expected_shutdown);
+        is_updated |= self.config.resharding_config.update(update_client_config.resharding_config);
+        is_updated |= self
+            .config
             .produce_chunk_add_transactions_time_limit
             .update(update_client_config.produce_chunk_add_transactions_time_limit);
+        is_updated
+    }
+
+    /// Updates client's mutable validator signer.
+    /// It will update all validator signers that synchronize with it.
+    pub(crate) fn update_validator_signer(&self, signer: Arc<ValidatorSigner>) -> bool {
+        self.validator_signer.update(Some(signer))
+    }
 }
@@ -253,7 +265,7 @@ impl Client {
         runtime_adapter: Arc<dyn RuntimeAdapter>,
         network_adapter: PeerManagerAdapter,
         shards_manager_adapter: Sender<ShardsManagerRequestFromClient>,
-        validator_signer: MutableConfigValue<Option<Arc<ValidatorSigner>>>,
+        validator_signer: MutableValidatorSigner,
         enable_doomslug: bool,
         rng_seed: RngSeed,
         snapshot_callbacks: Option<SnapshotCallbacks>,
diff --git a/chain/client/src/client_actor.rs b/chain/client/src/client_actor.rs
index 49dad764fd2..9491c00d221 100644
--- a/chain/client/src/client_actor.rs
+++ b/chain/client/src/client_actor.rs
@@ -41,7 +41,7 @@ use near_chain::{
     byzantine_assert, near_chain_primitives, Block, BlockHeader, BlockProcessingArtifact,
     ChainGenesis, Provenance,
 };
-use near_chain_configs::{ClientConfig, LogSummaryStyle, MutableConfigValue, ReshardingHandle};
+use near_chain_configs::{ClientConfig, LogSummaryStyle, MutableValidatorSigner, ReshardingHandle};
 use near_chain_primitives::error::EpochErrorResultToChainError;
 use near_chunks::adapter::ShardsManagerRequestFromClient;
 use near_chunks::client::ShardsManagerResponse;
@@ -134,7 +134,7 @@ pub fn start_client(
     state_sync_adapter: Arc<RwLock<SyncAdapter>>,
     network_adapter: PeerManagerAdapter,
     shards_manager_adapter: Sender<ShardsManagerRequestFromClient>,
-    validator_signer: MutableConfigValue<Option<Arc<ValidatorSigner>>>,
+    validator_signer: MutableValidatorSigner,
     telemetry_sender: Sender<TelemetryEvent>,
     snapshot_callbacks: Option<SnapshotCallbacks>,
     sender: Option<broadcast::Sender<()>>,
@@ -1166,9 +1166,17 @@ impl ClientActorInner {
     fn check_triggers(&mut self, ctx: &mut dyn DelayedActionRunner<Self>) -> Duration {
         let _span = tracing::debug_span!(target: "client", "check_triggers").entered();
         if let Some(config_updater) = &mut self.config_updater {
-            config_updater.try_update(&|updateable_client_config| {
-                self.client.update_client_config(updateable_client_config)
-            });
+            let update_result = config_updater.try_update(
+                &|updateable_client_config| {
+                    self.client.update_client_config(updateable_client_config)
+                },
+                &|validator_signer| self.client.update_validator_signer(validator_signer),
+            );
+            if update_result.validator_signer_updated {
+                // Request PeerManager to advertise tier1 proxies.
+                // It is needed to advertise that our validator key changed.
+                self.network_adapter.send(PeerManagerMessageRequest::AdvertiseTier1Proxies);
+            }
         }

         // Check block height to trigger expected shutdown
diff --git a/chain/client/src/config_updater.rs b/chain/client/src/config_updater.rs
index bc33f76d370..441d1bd4970 100644
--- a/chain/client/src/config_updater.rs
+++ b/chain/client/src/config_updater.rs
@@ -1,5 +1,6 @@
 use near_chain_configs::UpdateableClientConfig;
 use near_dyn_configs::{UpdateableConfigLoaderError, UpdateableConfigs};
+use near_primitives::validator_signer::ValidatorSigner;
 use std::sync::Arc;
 use tokio::sync::broadcast::Receiver;
@@ -12,6 +13,14 @@ pub struct ConfigUpdater {
     updateable_configs_error: Option<Arc<UpdateableConfigLoaderError>>,
 }

+/// Return type of `ConfigUpdater::try_update()`.
+/// Represents which values have been updated.
+#[derive(Default)]
+pub struct ConfigUpdaterResult {
+    pub client_config_updated: bool,
+    pub validator_signer_updated: bool,
+}
+
 impl ConfigUpdater {
     pub fn new(
         rx_config_update: Receiver<Result<UpdateableConfigs, Arc<UpdateableConfigLoaderError>>>,
@@ -21,14 +30,25 @@ impl ConfigUpdater {

     /// Check if any of the configs were updated.
     /// If they did, the receiver (rx_config_update) will contain a clone of the new configs.
-    pub fn try_update(&mut self, update_client_config_fn: &dyn Fn(UpdateableClientConfig)) {
+    pub fn try_update(
+        &mut self,
+        update_client_config_fn: &dyn Fn(UpdateableClientConfig) -> bool,
+        update_validator_signer_fn: &dyn Fn(Arc<ValidatorSigner>) -> bool,
+    ) -> ConfigUpdaterResult {
+        let mut update_result = ConfigUpdaterResult::default();
         while let Ok(maybe_updateable_configs) = self.rx_config_update.try_recv() {
             match maybe_updateable_configs {
                 Ok(updateable_configs) => {
                     if let Some(client_config) = updateable_configs.client_config {
-                        update_client_config_fn(client_config);
+                        update_result.client_config_updated |=
+                            update_client_config_fn(client_config);
                         tracing::info!(target: "config", "Updated ClientConfig");
                     }
+                    if let Some(validator_signer) = updateable_configs.validator_signer {
+                        update_result.validator_signer_updated |=
+                            update_validator_signer_fn(validator_signer);
+                        tracing::info!(target: "config", "Updated validator key");
+                    }
                     self.updateable_configs_error = None;
                 }
                 Err(err) => {
@@ -36,6 +56,7 @@ impl ConfigUpdater {
                 }
             }
         }
+        update_result
     }

     /// Prints an error if it's present.
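Condensed, the new `try_update` contract looks like the sketch below. It strips the error-handling branch of the real method and uses simplified stand-in types, so treat it as illustration rather than a copy:

```rust
use std::sync::Arc;
use tokio::sync::broadcast;

// Simplified stand-ins so the sketch is self-contained.
#[derive(Clone)]
pub struct UpdateableClientConfig;
#[derive(Clone)]
pub struct ValidatorSigner;
#[derive(Clone)]
pub struct UpdateableConfigs {
    pub client_config: Option<UpdateableClientConfig>,
    pub validator_signer: Option<Arc<ValidatorSigner>>,
}

#[derive(Default)]
pub struct ConfigUpdaterResult {
    pub client_config_updated: bool,
    pub validator_signer_updated: bool,
}

/// Drain all pending updates and OR together the per-callback
/// "did anything actually change?" answers.
pub fn try_update(
    rx: &mut broadcast::Receiver<UpdateableConfigs>,
    update_client_config_fn: &dyn Fn(UpdateableClientConfig) -> bool,
    update_validator_signer_fn: &dyn Fn(Arc<ValidatorSigner>) -> bool,
) -> ConfigUpdaterResult {
    let mut update_result = ConfigUpdaterResult::default();
    while let Ok(configs) = rx.try_recv() {
        if let Some(client_config) = configs.client_config {
            update_result.client_config_updated |= update_client_config_fn(client_config);
        }
        if let Some(validator_signer) = configs.validator_signer {
            update_result.validator_signer_updated |= update_validator_signer_fn(validator_signer);
        }
    }
    update_result
}
```

Note that a missing key file surfaces as `validator_signer: None`, which deliberately results in no signer update rather than demoting the node.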
diff --git a/chain/client/src/stateless_validation/partial_witness/partial_witness_actor.rs b/chain/client/src/stateless_validation/partial_witness/partial_witness_actor.rs
index e784640268f..b5298bb7cf3 100644
--- a/chain/client/src/stateless_validation/partial_witness/partial_witness_actor.rs
+++ b/chain/client/src/stateless_validation/partial_witness/partial_witness_actor.rs
@@ -5,7 +5,7 @@ use near_async::messaging::{Actor, CanSend, Handler, Sender};
 use near_async::time::Clock;
 use near_async::{MultiSend, MultiSendMessage, MultiSenderFrom};
 use near_chain::Error;
-use near_chain_configs::MutableConfigValue;
+use near_chain_configs::MutableValidatorSigner;
 use near_epoch_manager::EpochManagerAdapter;
 use near_network::state_witness::{
     ChunkStateWitnessAckMessage, PartialEncodedStateWitnessForwardMessage,
@@ -36,7 +36,7 @@ pub struct PartialWitnessActor {
     /// Validator signer to sign the state witness. This field is mutable and optional. Use with caution!
     /// Lock the value of mutable validator signer for the duration of a request to ensure consistency.
     /// Please note that the locked value should not be stored anywhere or passed through the thread boundary.
-    my_signer: MutableConfigValue<Option<Arc<ValidatorSigner>>>,
+    my_signer: MutableValidatorSigner,
     /// Epoch manager to get the set of chunk validators
     epoch_manager: Arc<dyn EpochManagerAdapter>,
     /// Tracks the parts of the state witness sent from chunk producers to chunk validators.
@@ -107,7 +107,7 @@ impl PartialWitnessActor {
         clock: Clock,
         network_adapter: PeerManagerAdapter,
         client_sender: ClientSenderForPartialWitness,
-        my_signer: MutableConfigValue<Option<Arc<ValidatorSigner>>>,
+        my_signer: MutableValidatorSigner,
         epoch_manager: Arc<dyn EpochManagerAdapter>,
         store: Store,
     ) -> Self {
diff --git a/chain/client/src/view_client_actor.rs b/chain/client/src/view_client_actor.rs
index 4793b550a42..fe79e1a541a 100644
--- a/chain/client/src/view_client_actor.rs
+++ b/chain/client/src/view_client_actor.rs
@@ -13,7 +13,7 @@ use near_chain::types::{RuntimeAdapter, Tip};
 use near_chain::{
     get_epoch_block_producers_view, Chain, ChainGenesis, ChainStoreAccess, DoomslugThresholdMode,
 };
-use near_chain_configs::{ClientConfig, MutableConfigValue, ProtocolConfigView};
+use near_chain_configs::{ClientConfig, MutableValidatorSigner, ProtocolConfigView};
 use near_chain_primitives::error::EpochErrorResultToChainError;
 use near_client_primitives::types::{
     Error, GetBlock, GetBlockError, GetBlockProof, GetBlockProofError, GetBlockProofResponse,
@@ -99,7 +99,7 @@ pub struct ViewClientActorInner {
     /// Validator account (if present). This field is mutable and optional. Use with caution!
     /// Lock the value of mutable validator signer for the duration of a request to ensure consistency.
     /// Please note that the locked value should not be stored anywhere or passed through the thread boundary.
-    validator: MutableConfigValue<Option<Arc<ValidatorSigner>>>,
+    validator: MutableValidatorSigner,
     chain: Chain,
     epoch_manager: Arc<dyn EpochManagerAdapter>,
     shard_tracker: ShardTracker,
@@ -128,7 +128,7 @@ impl ViewClientActorInner {
     pub fn spawn_actix_actor(
         clock: Clock,
-        validator: MutableConfigValue<Option<Arc<ValidatorSigner>>>,
+        validator: MutableValidatorSigner,
         chain_genesis: ChainGenesis,
         epoch_manager: Arc<dyn EpochManagerAdapter>,
         shard_tracker: ShardTracker,
diff --git a/chain/network/src/config.rs b/chain/network/src/config.rs
index a34e9cfdb94..45237cc09fc 100644
--- a/chain/network/src/config.rs
+++ b/chain/network/src/config.rs
@@ -10,6 +10,7 @@ use crate::types::ROUTED_MESSAGE_TTL;
 use anyhow::Context;
 use near_async::time;
 use near_chain_configs::MutableConfigValue;
+use near_chain_configs::MutableValidatorSigner;
 use near_crypto::{KeyType, SecretKey};
 use near_primitives::network::PeerId;
 use near_primitives::test_utils::create_test_signer;
@@ -60,7 +61,7 @@ pub struct ValidatorConfig {
     /// Contains signer key for this node. This field is mutable and optional. Use with caution!
     /// Lock the value of mutable validator signer for the duration of a request to ensure consistency.
     /// Please note that the locked value should not be stored anywhere or passed through the thread boundary.
-    pub signer: MutableConfigValue<Option<Arc<ValidatorSigner>>>,
+    pub signer: MutableValidatorSigner,
     pub proxies: ValidatorProxies,
 }
@@ -241,7 +242,7 @@ impl NetworkConfig {
     pub fn new(
         cfg: crate::config_json::Config,
         node_key: SecretKey,
-        validator_signer: MutableConfigValue<Option<Arc<ValidatorSigner>>>,
+        validator_signer: MutableValidatorSigner,
         archive: bool,
     ) -> anyhow::Result<Self> {
         if cfg.public_addrs.len() > MAX_PEER_ADDRS {
diff --git a/chain/network/src/peer_manager/peer_manager_actor.rs b/chain/network/src/peer_manager/peer_manager_actor.rs
index d9fb1f8965e..fed881aa6ef 100644
--- a/chain/network/src/peer_manager/peer_manager_actor.rs
+++ b/chain/network/src/peer_manager/peer_manager_actor.rs
@@ -1026,6 +1026,14 @@ impl PeerManagerActor {
                     self.handle_msg_network_requests(msg, ctx),
                 )
             }
+            PeerManagerMessageRequest::AdvertiseTier1Proxies => {
+                let state = self.state.clone();
+                let clock = self.clock.clone();
+                ctx.spawn(wrap_future(async move {
+                    state.tier1_advertise_proxies(&clock).await;
+                }));
+                PeerManagerMessageResponse::AdvertiseTier1Proxies
+            }
             PeerManagerMessageRequest::OutboundTcpConnect(stream) => {
                 let peer_addr = stream.peer_addr;
                 if let Err(err) =
diff --git a/chain/network/src/types.rs b/chain/network/src/types.rs
index befa5cf5248..add96f797b3 100644
--- a/chain/network/src/types.rs
+++ b/chain/network/src/types.rs
@@ -162,6 +162,10 @@ pub struct SetChainInfo(pub ChainInfo);
 #[rtype(result = "PeerManagerMessageResponse")]
 pub enum PeerManagerMessageRequest {
     NetworkRequests(NetworkRequests),
+    /// Request PeerManager to call `tier1_advertise_proxies()`. Used internally.
+    /// The effect is that the accounts data known by this node gets broadcast to other tier1 nodes.
+    /// That includes info about the validator signer of this node.
+    AdvertiseTier1Proxies,
     /// Request PeerManager to connect to the given peer.
     /// Used in tests and internally by PeerManager.
     /// TODO: replace it with AsyncContext::spawn/run_later for internal use.
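The new message round-trip can be summarized with a stripped-down sketch. In the real code the future is spawned on the actix context and `tier1_advertise_proxies` signs and broadcasts fresh `AccountData`; both are stubbed here, and the single-variant enums are a simplification:

```rust
pub enum PeerManagerMessageRequest {
    AdvertiseTier1Proxies,
}

pub enum PeerManagerMessageResponse {
    AdvertiseTier1Proxies,
}

pub struct PeerManagerActor;

impl PeerManagerActor {
    /// Stub: in nearcore this broadcasts the node's AccountData (including
    /// the current validator key) to TIER1 peers.
    async fn tier1_advertise_proxies(&self) {}

    pub async fn handle(&self, msg: PeerManagerMessageRequest) -> PeerManagerMessageResponse {
        match msg {
            PeerManagerMessageRequest::AdvertiseTier1Proxies => {
                self.tier1_advertise_proxies().await;
                PeerManagerMessageResponse::AdvertiseTier1Proxies
            }
        }
    }
}
```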
@@ -193,6 +197,7 @@ impl PeerManagerMessageRequest {
 #[derive(actix::MessageResponse, Debug)]
 pub enum PeerManagerMessageResponse {
     NetworkResponses(NetworkResponses),
+    AdvertiseTier1Proxies,
     /// TEST-ONLY
     OutboundTcpConnect,
     FetchRoutingTable(RoutingTableInfo),
diff --git a/core/chain-configs/src/lib.rs b/core/chain-configs/src/lib.rs
index cdd8d4be355..4407a722517 100644
--- a/core/chain-configs/src/lib.rs
+++ b/core/chain-configs/src/lib.rs
@@ -30,7 +30,7 @@ pub use genesis_config::{
 };
 use near_primitives::types::{Balance, BlockHeightDelta, Gas, NumBlocks, NumSeats};
 use num_rational::Rational32;
-pub use updateable_config::{MutableConfigValue, UpdateableClientConfig};
+pub use updateable_config::{MutableConfigValue, MutableValidatorSigner, UpdateableClientConfig};

 pub const GENESIS_CONFIG_FILENAME: &str = "genesis.json";
diff --git a/core/chain-configs/src/updateable_config.rs b/core/chain-configs/src/updateable_config.rs
index ed0cb4cd8fa..283e7ddc64f 100644
--- a/core/chain-configs/src/updateable_config.rs
+++ b/core/chain-configs/src/updateable_config.rs
@@ -1,6 +1,7 @@
 #[cfg(feature = "metrics")]
 use near_async::time::Clock;
 use near_primitives::types::BlockHeight;
+use near_primitives::validator_signer::ValidatorSigner;
 use serde::{Deserialize, Serialize, Serializer};
 use std::fmt::Debug;
 use std::sync::{Arc, Mutex};
@@ -108,3 +109,5 @@ pub struct UpdateableClientConfig {
     #[serde(with = "near_async::time::serde_opt_duration_as_std")]
     pub produce_chunk_add_transactions_time_limit: Option<Duration>,
 }
+
+pub type MutableValidatorSigner = MutableConfigValue<Option<Arc<ValidatorSigner>>>;
diff --git a/core/dyn-configs/src/lib.rs b/core/dyn-configs/src/lib.rs
index 17330e21676..f9d19ba13b7 100644
--- a/core/dyn-configs/src/lib.rs
+++ b/core/dyn-configs/src/lib.rs
@@ -3,20 +3,22 @@
 use near_async::time::Clock;
 use near_chain_configs::UpdateableClientConfig;
 use near_o11y::log_config::LogConfig;
-use serde::{Deserialize, Serialize};
+use near_primitives::validator_signer::ValidatorSigner;
 use std::path::PathBuf;
 use std::sync::Arc;
 use tokio::sync::broadcast::Sender;

 mod metrics;

-#[derive(Serialize, Deserialize, Clone, Default)]
+#[derive(Clone, Default)]
 /// Contains the latest state of configs which can be updated at runtime.
 pub struct UpdateableConfigs {
     /// Contents of the file LOG_CONFIG_FILENAME.
     pub log_config: Option<LogConfig>,
     /// Contents of the `config.json` corresponding to the mutable fields of `ClientConfig`.
     pub client_config: Option<UpdateableClientConfig>,
+    /// Validator key hot loaded from file.
+    pub validator_signer: Option<Arc<ValidatorSigner>>,
 }

 /// Pushes the updates to listeners.
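`UpdateableConfigs` reaches its listeners over a tokio broadcast channel. A minimal sketch of that fan-out with a simplified payload (the real channel carries `Result<UpdateableConfigs, Arc<UpdateableConfigLoaderError>>`):

```rust
use tokio::sync::broadcast;

#[derive(Clone)]
pub struct UpdateableConfigs; // simplified payload

pub fn push_update(tx: &broadcast::Sender<UpdateableConfigs>, new: UpdateableConfigs) {
    // send() only errors when there are no receivers. A lagging receiver just
    // skips to newer values, which is fine when only the latest config matters.
    let _ = tx.send(new);
}

fn main() {
    let (tx, mut rx) = broadcast::channel(16);
    push_update(&tx, UpdateableConfigs);
    assert!(rx.try_recv().is_ok());
}
```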
@@ -35,6 +37,8 @@ pub enum UpdateableConfigLoaderError {
     OpenAndRead { file: PathBuf, err: std::io::Error },
     #[error("Can't open or read the config file {file:?}: {err:?}")]
     ConfigFileError { file: PathBuf, err: anyhow::Error },
+    #[error("Can't open or read the validator key file {file:?}: {err:?}")]
+    ValidatorKeyFileError { file: PathBuf, err: anyhow::Error },
     #[error("One or multiple dynamic config files reload errors {0:?}")]
     Errors(Vec<UpdateableConfigLoaderError>),
     #[error("No home dir set")]
diff --git a/nearcore/src/config.rs b/nearcore/src/config.rs
index e86f0fbfa4b..35dd59744d6 100644
--- a/nearcore/src/config.rs
+++ b/nearcore/src/config.rs
@@ -20,12 +20,13 @@ use near_chain_configs::{
     default_tx_routing_height_horizon, default_view_client_threads,
     default_view_client_throttle_period, get_initial_supply, ChunkDistributionNetworkConfig,
     ClientConfig, GCConfig, Genesis, GenesisConfig, GenesisValidationMode, LogSummaryStyle,
-    MutableConfigValue, ReshardingConfig, StateSyncConfig, BLOCK_PRODUCER_KICKOUT_THRESHOLD,
-    CHUNK_PRODUCER_KICKOUT_THRESHOLD, CHUNK_VALIDATOR_ONLY_KICKOUT_THRESHOLD,
-    EXPECTED_EPOCH_LENGTH, FISHERMEN_THRESHOLD, GAS_PRICE_ADJUSTMENT_RATE, GENESIS_CONFIG_FILENAME,
-    INITIAL_GAS_LIMIT, MAX_INFLATION_RATE, MIN_BLOCK_PRODUCTION_DELAY, MIN_GAS_PRICE, NEAR_BASE,
-    NUM_BLOCKS_PER_YEAR, NUM_BLOCK_PRODUCER_SEATS, PROTOCOL_REWARD_RATE,
-    PROTOCOL_UPGRADE_STAKE_THRESHOLD, TRANSACTION_VALIDITY_PERIOD,
+    MutableConfigValue, MutableValidatorSigner, ReshardingConfig, StateSyncConfig,
+    BLOCK_PRODUCER_KICKOUT_THRESHOLD, CHUNK_PRODUCER_KICKOUT_THRESHOLD,
+    CHUNK_VALIDATOR_ONLY_KICKOUT_THRESHOLD, EXPECTED_EPOCH_LENGTH, FISHERMEN_THRESHOLD,
+    GAS_PRICE_ADJUSTMENT_RATE, GENESIS_CONFIG_FILENAME, INITIAL_GAS_LIMIT, MAX_INFLATION_RATE,
+    MIN_BLOCK_PRODUCTION_DELAY, MIN_GAS_PRICE, NEAR_BASE, NUM_BLOCKS_PER_YEAR,
+    NUM_BLOCK_PRODUCER_SEATS, PROTOCOL_REWARD_RATE, PROTOCOL_UPGRADE_STAKE_THRESHOLD,
+    TRANSACTION_VALIDITY_PERIOD,
 };
 use near_config_utils::{ValidationError, ValidationErrors};
 use near_crypto::{InMemorySigner, KeyFile, KeyType, PublicKey};
@@ -507,7 +508,7 @@ pub struct NearConfig {
     /// Contains validator key for this node. This field is mutable and optional. Use with caution!
     /// Lock the value of mutable validator signer for the duration of a request to ensure consistency.
     /// Please note that the locked value should not be stored anywhere or passed through the thread boundary.
-    pub validator_signer: MutableConfigValue<Option<Arc<ValidatorSigner>>>,
+    pub validator_signer: MutableValidatorSigner,
 }

 impl NearConfig {
@@ -515,7 +516,7 @@ impl NearConfig {
         config: Config,
         genesis: Genesis,
         network_key_pair: KeyFile,
-        validator_signer: MutableConfigValue<Option<Arc<ValidatorSigner>>>,
+        validator_signer: MutableValidatorSigner,
     ) -> anyhow::Result<NearConfig> {
         Ok(NearConfig {
             config: config.clone(),
diff --git a/nearcore/src/dyn_config.rs b/nearcore/src/dyn_config.rs
index 0daf5ed1a94..9a480d0c17a 100644
--- a/nearcore/src/dyn_config.rs
+++ b/nearcore/src/dyn_config.rs
@@ -2,8 +2,10 @@ use crate::config::Config;
 use near_chain_configs::UpdateableClientConfig;
 use near_dyn_configs::{UpdateableConfigLoaderError, UpdateableConfigs};
 use near_o11y::log_config::LogConfig;
+use near_primitives::validator_signer::ValidatorSigner;
 use serde::Deserialize;
 use std::path::{Path, PathBuf};
+use std::sync::Arc;

 pub const LOG_CONFIG_FILENAME: &str = "log_config.json";
@@ -31,9 +33,22 @@ pub fn read_updateable_configs(
     };
     let updateable_client_config = config.as_ref().map(get_updateable_client_config);

+    let validator_signer = if let Some(config) = config {
+        read_validator_key(home_dir, &config).unwrap_or_else(|err| {
+            errs.push(err);
+            None
+        })
+    } else {
+        None
+    };
+
     if errs.is_empty() {
         crate::metrics::CONFIG_CORRECT.set(1);
-        Ok(UpdateableConfigs { log_config, client_config: updateable_client_config })
+        Ok(UpdateableConfigs {
+            log_config,
+            client_config: updateable_client_config,
+            validator_signer,
+        })
     } else {
         tracing::warn!(target: "neard", "Dynamically updateable configs are not valid. Please fix this ASAP otherwise the node will be unable to restart: {:?}", &errs);
         crate::metrics::CONFIG_CORRECT.set(0);
@@ -86,3 +101,23 @@ where
         },
     }
 }
+
+fn read_validator_key(
+    home_dir: &Path,
+    config: &Config,
+) -> Result<Option<Arc<ValidatorSigner>>, UpdateableConfigLoaderError> {
+    let validator_file: PathBuf = home_dir.join(&config.validator_key_file);
+    match crate::config::load_validator_key(&validator_file) {
+        Ok(Some(validator_signer)) => {
+            tracing::info!(target: "neard", "Hot loading validator key {}.", validator_file.display());
+            Ok(Some(validator_signer))
+        }
+        Ok(None) => {
+            tracing::info!(target: "neard", "No validator key {}.", validator_file.display());
+            Ok(None)
+        }
+        Err(err) => {
+            Err(UpdateableConfigLoaderError::ValidatorKeyFileError { file: validator_file, err })
+        }
+    }
+}
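The hot-load path for the key file, condensed into a self-contained sketch. The stubbed `load_validator_key` and the `String` error payload are assumptions for illustration; the real helper lives in `nearcore::config` and returns `anyhow` errors:

```rust
use std::path::{Path, PathBuf};
use std::sync::Arc;

pub struct ValidatorSigner; // hypothetical stub

#[derive(Debug)]
pub enum UpdateableConfigLoaderError {
    ValidatorKeyFileError { file: PathBuf, err: String },
}

/// Stub for nearcore's key-file loader: Ok(None) when the file is absent.
/// (Per the ConfigUpdater logic above, None simply means "no signer update".)
fn load_validator_key(path: &Path) -> Result<Option<Arc<ValidatorSigner>>, String> {
    if !path.exists() {
        return Ok(None);
    }
    // The real code parses the JSON key file into a signer here.
    Ok(Some(Arc::new(ValidatorSigner)))
}

pub fn read_validator_key(
    home_dir: &Path,
    validator_key_file: &str,
) -> Result<Option<Arc<ValidatorSigner>>, UpdateableConfigLoaderError> {
    let file = home_dir.join(validator_key_file);
    load_validator_key(&file)
        .map_err(|err| UpdateableConfigLoaderError::ValidatorKeyFileError { file, err })
}
```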
diff --git a/nearcore/src/state_sync.rs b/nearcore/src/state_sync.rs
index d3eab6ea9fc..5a6da19c019 100644
--- a/nearcore/src/state_sync.rs
+++ b/nearcore/src/state_sync.rs
@@ -7,7 +7,7 @@ use futures::FutureExt;
 use near_async::time::{Clock, Duration, Instant};
 use near_chain::types::RuntimeAdapter;
 use near_chain::{Chain, ChainGenesis, ChainStoreAccess, DoomslugThresholdMode, Error};
-use near_chain_configs::{ClientConfig, ExternalStorageLocation, MutableConfigValue};
+use near_chain_configs::{ClientConfig, ExternalStorageLocation, MutableValidatorSigner};
 use near_client::sync::external::{
     create_bucket_readwrite, external_storage_location, StateFileType,
 };
@@ -22,7 +22,6 @@ use near_primitives::hash::CryptoHash;
 use near_primitives::state_part::PartId;
 use near_primitives::state_sync::{StatePartKey, StateSyncDumpProgress};
 use near_primitives::types::{AccountId, EpochHeight, EpochId, ShardId, StateRoot};
-use near_primitives::validator_signer::ValidatorSigner;
 use near_store::DBCol;
 use rand::{thread_rng, Rng};
 use std::collections::HashSet;
@@ -39,7 +38,7 @@ pub struct StateSyncDumper {
     /// Contains validator key for this node. This field is mutable and optional. Use with caution!
     /// Lock the value of mutable validator signer for the duration of a request to ensure consistency.
     /// Please note that the locked value should not be stored anywhere or passed through the thread boundary.
-    pub validator: MutableConfigValue<Option<Arc<ValidatorSigner>>>,
+    pub validator: MutableValidatorSigner,
     pub dump_future_runner: Box<dyn Fn(BoxFuture<'static, ()>) -> Box<dyn FnOnce()>>,
     pub handle: Option<StateSyncDumpHandle>,
 }
@@ -338,7 +337,7 @@ async fn state_sync_dump(
     restart_dump_for_shards: Vec<ShardId>,
     external: ExternalConnection,
     iteration_delay: Duration,
-    validator: MutableConfigValue<Option<Arc<ValidatorSigner>>>,
+    validator: MutableValidatorSigner,
     keep_running: Arc<AtomicBool>,
 ) {
     tracing::info!(target: "state_sync_dump", shard_id, "Running StateSyncDump loop");
diff --git a/nightly/pytest-sanity.txt b/nightly/pytest-sanity.txt
index af343e4bc7c..5668f049449 100644
--- a/nightly/pytest-sanity.txt
+++ b/nightly/pytest-sanity.txt
@@ -112,6 +112,8 @@ pytest --timeout=240 sanity/switch_node_key.py
 pytest --timeout=240 sanity/switch_node_key.py --features nightly
 pytest --timeout=240 sanity/validator_switch_key.py
 pytest --timeout=240 sanity/validator_switch_key.py --features nightly
+pytest --timeout=120 sanity/validator_switch_key_quick.py
+pytest --timeout=120 sanity/validator_switch_key_quick.py --features nightly
 pytest sanity/proxy_simple.py
 pytest sanity/proxy_simple.py --features nightly
 pytest sanity/proxy_restart.py
diff --git a/pytest/lib/cluster.py b/pytest/lib/cluster.py
index 76c2df1d324..f2a53abcc62 100644
--- a/pytest/lib/cluster.py
+++ b/pytest/lib/cluster.py
@@ -536,6 +536,11 @@ def kill(self, *, gentle=False):
             self._process.wait(5)
             self._process = None

+    def reload_updateable_config(self):
+        """Sends SIGHUP signal to the process in order to trigger updateable config reload."""
+        logger.info(f"Reloading updateable config for node {self.ordinal}.")
+        self._process.send_signal(signal.SIGHUP)
+
     def reset_data(self):
         shutil.rmtree(os.path.join(self.node_dir, "data"))
diff --git a/pytest/tests/sanity/validator_switch_key_quick.py b/pytest/tests/sanity/validator_switch_key_quick.py
new file mode 100755
index 00000000000..92588769fc6
--- /dev/null
+++ b/pytest/tests/sanity/validator_switch_key_quick.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+# Starts two validating nodes and one non-validating node.
+# Copies the validator key from one of the validators to the
+# non-validating node, stops that validator, and triggers a
+# validator key reload on the non-validating node, making sure
+# that the network doesn't stall after the non-validating node
+# takes over as a validator.
+
+import unittest
+import sys, time
+import pathlib
+
+sys.path.append(str(pathlib.Path(__file__).resolve().parents[2] / 'lib'))
+
+from configured_logger import logger
+from cluster import start_cluster
+from utils import wait_for_blocks
+
+EPOCH_LENGTH = 20
+TIMEOUT = 100
+
+
+class ValidatorSwitchKeyQuickTest(unittest.TestCase):
+
+    def test_validator_switch_key_quick(self):
+        # It is important for the non-validating node to already track shards
+        # that it will be assigned to when becoming a validator.
+        config_map = {
+            2: {
+                "tracked_shards": [0],
+                "store.load_mem_tries_for_tracked_shards": True,
+            }
+        }
+
+        # Key will be moved from old_validator to new_validator,
+        # while the other_validator remains untouched.
+        [
+            old_validator,
+            other_validator,
+            new_validator,
+        ] = start_cluster(2, 1, 3, None,
+                          [["epoch_length", EPOCH_LENGTH],
+                           ["block_producer_kickout_threshold", 10],
+                           ["chunk_producer_kickout_threshold", 10]],
+                          config_map)
+        wait_for_blocks(other_validator, count=2)
+
+        new_validator.reset_validator_key(old_validator.validator_key)
+        old_validator.kill()
+        new_validator.reload_updateable_config()
+        new_validator.stop_checking_store()
+        wait_for_blocks(other_validator, count=2)
+
+        block = other_validator.get_latest_block()
+        max_height = block.height + 4 * EPOCH_LENGTH
+        target_height = max_height - EPOCH_LENGTH // 2
+        start_time = time.time()
+
+        while True:
+            self.assertLess(time.time() - start_time, TIMEOUT,
+                            'Validators got stuck')
+
+            info = other_validator.json_rpc('validators', 'latest')
+            next_validators = info['result']['next_validators']
+            account_ids = [v['account_id'] for v in next_validators]
+            # We copied over the 'test0' validator key, along with the validator account ID.
+            # Therefore, despite nodes[0] being stopped, 'test0' still figures as an active validator.
+            self.assertEqual(sorted(account_ids), ['test0', 'test1'])
+
+            last_block_per_node = [
+                new_validator.get_latest_block(),
+                other_validator.get_latest_block()
+            ]
+            height_per_node = list(
+                map(lambda block: block.height, last_block_per_node))
+            logger.info(height_per_node)
+
+            self.assertLess(max(height_per_node), max_height,
+                            'Nodes are not synced')
+
+            synchronized = True
+            for i, node in enumerate([new_validator, other_validator]):
+                try:
+                    node.get_block(last_block_per_node[1 - i].hash)
+                except Exception:
+                    synchronized = False
+                    break
+
+            # Both validators should be synchronized.
+            logger.info(f'Synchronized {synchronized}')
+            if synchronized and height_per_node[0] > target_height:
+                # If nodes are synchronized and the current height is close to `max_height`, we can finish.
+                return
+
+            wait_for_blocks(other_validator, count=1)
+
+
+if __name__ == '__main__':
+    unittest.main()
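Putting it all together, the runtime sequence this test exercises is: SIGHUP, re-read the updateable configs, swap the signer, then re-advertise TIER1 proxies. A self-contained recap with stub types, all hypothetical:

```rust
use std::sync::Arc;

struct ValidatorSigner; // stub

enum PeerManagerMessageRequest {
    AdvertiseTier1Proxies,
}

struct Client {}
impl Client {
    // In nearcore this swaps the MutableValidatorSigner cell and reports
    // whether the key actually changed; here we just pretend it did.
    fn update_validator_signer(&self, _signer: Arc<ValidatorSigner>) -> bool {
        true
    }
}

struct PeerManagerAdapter;
impl PeerManagerAdapter {
    fn send(&self, _msg: PeerManagerMessageRequest) {
        // In nearcore this is an actix message delivered to PeerManagerActor.
    }
}

fn main() {
    let client = Client {};
    let network_adapter = PeerManagerAdapter;

    // Pretend the dyn-config loader just delivered a hot-swapped key.
    let validator_signer_updated =
        client.update_validator_signer(Arc::new(ValidatorSigner));

    if validator_signer_updated {
        // Tell TIER1 peers about the new key right away instead of waiting
        // for the next periodic AccountData broadcast.
        network_adapter.send(PeerManagerMessageRequest::AdvertiseTier1Proxies);
    }
}
```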