availability-recovery: bump chunk fetch threshold to 1MB for Polkadot and 4MB for Kusama + testnets (#4399)

This change minimizes the CPU time spent in reed-solomon: for PoVs below the threshold (4MB, which currently covers all PoVs) we fetch the full PoV from the backing group and only re-encode it into chunks to verify it, instead of reconstructing it from chunks.
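As a rough sketch of the decision this threshold drives (the constants and the comparison mirror the diff below; the standalone function is illustrative only, not the real API):

```rust
// Sketch only: the constants and the comparison mirror the diff below,
// but this free function is illustrative, not the actual subsystem API.
const CONSERVATIVE_FETCH_CHUNKS_THRESHOLD: usize = 1 * 1024 * 1024; // Polkadot
const FETCH_CHUNKS_THRESHOLD: usize = 4 * 1024 * 1024; // Kusama + testnets

/// Below the threshold we fetch the full PoV from the backing group
/// (no reed-solomon reconstruction); at or above it we recover from chunks.
fn prefer_fetching_from_backers(pov_size_estimate: usize, threshold: usize) -> bool {
    pov_size_estimate < threshold
}
```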
 
Based on subsystem benchmark results, we concluded that it is safe to bump this number higher. In the worst case, the network pressure on a backing group of 5 is around 25% of the network bandwidth in the reference hardware specs.

Assuming 6s block times (max_candidate_depth 3) and needed_approvals 30, the bandwidth used by a backing group would hover around `30 * 4 * 3 = 360MB` per relay chain block. With a backing group of 5, that is 72MB per block per validator, i.e. 12 MB/s.
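Spelling the arithmetic out (everything here comes from the numbers quoted above, except the ~50 MB/s reference-hardware bandwidth, which is an assumption used for the ~25% comparison):

```rust
// Worked version of the estimate above. Everything except the reference
// bandwidth comes from the numbers quoted in this commit message; the
// ~50 MB/s reference-hardware bandwidth is an assumption for the 25% claim.
fn main() {
    let needed_approvals: u64 = 30;
    let pov_mb: u64 = 4; // the new FETCH_CHUNKS_THRESHOLD, in MB
    let max_candidate_depth: u64 = 3;
    let backing_group_size: u64 = 5;
    let block_time_s: u64 = 6;

    let per_block_mb = needed_approvals * pov_mb * max_candidate_depth; // 360 MB
    let per_validator_mb = per_block_mb / backing_group_size; // 72 MB per block
    let mb_per_s = per_validator_mb / block_time_s; // 12 MB/s

    let assumed_ref_bandwidth_mb_s: u64 = 50;
    println!("{per_block_mb} MB/block, {per_validator_mb} MB/validator/block, {mb_per_s} MB/s");
    println!("~{}% of an assumed {assumed_ref_bandwidth_mb_s} MB/s",
        100 * mb_per_s / assumed_ref_bandwidth_mb_s); // ~24%
}
```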

Reality check on Kusama PoV sizes (chart): https://github.com/paritytech/polkadot-sdk/assets/54316454/bfed32d4-8623-48b0-9ec0-8b95dd2a9d8c

---------

Signed-off-by: Andrei Sandu <[email protected]>
sandreim authored May 24, 2024
1 parent 1c7a1a5 commit f469fbf
Showing 4 changed files with 33 additions and 9 deletions.
22 changes: 15 additions & 7 deletions polkadot/node/network/availability-recovery/src/lib.rs
@@ -77,8 +77,10 @@ const LRU_SIZE: u32 = 16;
 
 const COST_INVALID_REQUEST: Rep = Rep::CostMajor("Peer sent unparsable request");
 
-/// PoV size limit in bytes for which prefer fetching from backers.
-const SMALL_POV_LIMIT: usize = 128 * 1024;
+/// PoV size limit in bytes below which we prefer fetching from backers (conservative; Polkadot for now).
+pub(crate) const CONSERVATIVE_FETCH_CHUNKS_THRESHOLD: usize = 1 * 1024 * 1024;
+/// PoV size limit in bytes below which we prefer fetching from backers (Kusama and all testnets).
+pub const FETCH_CHUNKS_THRESHOLD: usize = 4 * 1024 * 1024;
 
 #[derive(Clone, PartialEq)]
 /// The strategy we use to recover the PoV.
@@ -448,7 +450,7 @@ async fn handle_recover<Context>(
 if let Some(backing_validators) = session_info.validator_groups.get(backing_group) {
 let mut small_pov_size = true;
 
-if let RecoveryStrategyKind::BackersFirstIfSizeLower(small_pov_limit) =
+if let RecoveryStrategyKind::BackersFirstIfSizeLower(fetch_chunks_threshold) =
 recovery_strategy_kind
 {
 // Get our own chunk size to get an estimate of the PoV size.
@@ -457,13 +459,13 @@ async fn handle_recover<Context>(
 if let Ok(Some(chunk_size)) = chunk_size {
 let pov_size_estimate =
 chunk_size.saturating_mul(session_info.validators.len()) / 3;
-small_pov_size = pov_size_estimate < small_pov_limit;
+small_pov_size = pov_size_estimate < fetch_chunks_threshold;
 
 gum::trace!(
 target: LOG_TARGET,
 ?candidate_hash,
 pov_size_estimate,
-small_pov_limit,
+fetch_chunks_threshold,
 enabled = small_pov_size,
 "Prefer fetch from backing group",
 );
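For context on `pov_size_estimate` in the hunk above: availability erasure coding splits the PoV into `n_validators` chunks such that roughly any third of them suffice to reconstruct it, so the chunks together total about 3x the PoV size; one chunk is therefore about `3 * pov_size / n_validators`, and the estimate inverts that. A minimal sketch, assuming the ~3x expansion factor:

```rust
// Minimal sketch of the estimate in the hunk above, assuming the ~3x
// erasure-coding expansion: one chunk is about 3 * pov_size / n_validators,
// so pov_size is about chunk_size * n_validators / 3.
fn estimate_pov_size(chunk_size: usize, n_validators: usize) -> usize {
    chunk_size.saturating_mul(n_validators) / 3
}
```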
@@ -547,11 +549,14 @@ impl AvailabilityRecoverySubsystem {
 /// which never requests the `AvailabilityStoreSubsystem` subsystem and only checks the POV hash
 /// instead of reencoding the available data.
 pub fn for_collator(
+fetch_chunks_threshold: Option<usize>,
 req_receiver: IncomingRequestReceiver<request_v1::AvailableDataFetchingRequest>,
 metrics: Metrics,
 ) -> Self {
 Self {
-recovery_strategy_kind: RecoveryStrategyKind::BackersFirstIfSizeLower(SMALL_POV_LIMIT),
+recovery_strategy_kind: RecoveryStrategyKind::BackersFirstIfSizeLower(
+fetch_chunks_threshold.unwrap_or(CONSERVATIVE_FETCH_CHUNKS_THRESHOLD),
+),
 bypass_availability_store: true,
 post_recovery_check: PostRecoveryCheck::PovHash,
 req_receiver,
@@ -591,11 +596,14 @@ impl AvailabilityRecoverySubsystem {
 /// Create a new instance of `AvailabilityRecoverySubsystem` which requests chunks if PoV is
 /// above a threshold.
 pub fn with_chunks_if_pov_large(
+fetch_chunks_threshold: Option<usize>,
 req_receiver: IncomingRequestReceiver<request_v1::AvailableDataFetchingRequest>,
 metrics: Metrics,
 ) -> Self {
 Self {
-recovery_strategy_kind: RecoveryStrategyKind::BackersFirstIfSizeLower(SMALL_POV_LIMIT),
+recovery_strategy_kind: RecoveryStrategyKind::BackersFirstIfSizeLower(
+fetch_chunks_threshold.unwrap_or(CONSERVATIVE_FETCH_CHUNKS_THRESHOLD),
+),
 bypass_availability_store: false,
 post_recovery_check: PostRecoveryCheck::Reencode,
 req_receiver,
6 changes: 4 additions & 2 deletions polkadot/node/network/availability-recovery/src/tests.rs
@@ -906,6 +906,7 @@ fn recovers_from_only_chunks_if_pov_large() {
 let test_state = TestState::default();
 let req_protocol_names = ReqProtocolNames::new(&GENESIS_HASH, None);
 let subsystem = AvailabilityRecoverySubsystem::with_chunks_if_pov_large(
+Some(FETCH_CHUNKS_THRESHOLD),
 request_receiver(&req_protocol_names),
 Metrics::new_dummy(),
 );
@@ -942,7 +943,7 @@ fn recovers_from_only_chunks_if_pov_large() {
 AllMessages::AvailabilityStore(
 AvailabilityStoreMessage::QueryChunkSize(_, tx)
 ) => {
-let _ = tx.send(Some(1000000));
+let _ = tx.send(Some(crate::FETCH_CHUNKS_THRESHOLD + 1));
 }
 );

@@ -987,7 +988,7 @@ fn recovers_from_only_chunks_if_pov_large() {
 AllMessages::AvailabilityStore(
 AvailabilityStoreMessage::QueryChunkSize(_, tx)
 ) => {
-let _ = tx.send(Some(1000000));
+let _ = tx.send(Some(crate::FETCH_CHUNKS_THRESHOLD + 1));
 }
 );

@@ -1015,6 +1016,7 @@ fn fast_path_backing_group_recovers_if_pov_small() {
 let test_state = TestState::default();
 let req_protocol_names = ReqProtocolNames::new(&GENESIS_HASH, None);
 let subsystem = AvailabilityRecoverySubsystem::with_chunks_if_pov_large(
+Some(FETCH_CHUNKS_THRESHOLD),
 request_receiver(&req_protocol_names),
 Metrics::new_dummy(),
 );
7 changes: 7 additions & 0 deletions polkadot/node/service/src/lib.rs
@@ -750,6 +750,7 @@ pub fn new_full<
 prepare_workers_hard_max_num,
 }: NewFullParams<OverseerGenerator>,
 ) -> Result<NewFull, Error> {
+use polkadot_availability_recovery::FETCH_CHUNKS_THRESHOLD;
 use polkadot_node_network_protocol::request_response::IncomingRequest;
 use sc_network_sync::WarpSyncParams;
 
@@ -988,6 +989,11 @@ pub fn new_full<
 stagnant_check_interval: Default::default(),
 stagnant_check_mode: chain_selection_subsystem::StagnantCheckMode::PruneOnly,
 };
+
+// Kusama + testnets get a higher threshold; we are conservative on Polkadot for now.
+let fetch_chunks_threshold =
+if config.chain_spec.is_polkadot() { None } else { Some(FETCH_CHUNKS_THRESHOLD) };
+
 Some(ExtendedOverseerGenArgs {
 keystore,
 parachains_db,
@@ -1001,6 +1007,7 @@ pub fn new_full<
 dispute_req_receiver,
 dispute_coordinator_config,
 chain_selection_config,
+fetch_chunks_threshold,
 })
 };

7 changes: 7 additions & 0 deletions polkadot/node/service/src/overseer.rs
@@ -133,6 +133,10 @@ pub struct ExtendedOverseerGenArgs {
 pub dispute_coordinator_config: DisputeCoordinatorConfig,
 /// Configuration for the chain selection subsystem.
 pub chain_selection_config: ChainSelectionConfig,
+/// Optional availability-recovery fetch-chunks threshold. If the PoV size is lower
+/// than this value, we always try to recover availability from backers.
+/// This parameter lives here so that different chains can use different values.
+pub fetch_chunks_threshold: Option<usize>,
 }
 
 /// Obtain a prepared validator `Overseer` that is initialized with all default values.
@@ -166,6 +170,7 @@ pub fn validator_overseer_builder<Spawner, RuntimeClient>(
 dispute_req_receiver,
 dispute_coordinator_config,
 chain_selection_config,
+fetch_chunks_threshold,
 }: ExtendedOverseerGenArgs,
 ) -> Result<
 InitializedOverseerBuilder<
@@ -240,6 +245,7 @@ where
 Metrics::register(registry)?,
 ))
 .availability_recovery(AvailabilityRecoverySubsystem::with_chunks_if_pov_large(
+fetch_chunks_threshold,
 available_data_req_receiver,
 Metrics::register(registry)?,
 ))
@@ -421,6 +427,7 @@ where
 ))
 .availability_distribution(DummySubsystem)
 .availability_recovery(AvailabilityRecoverySubsystem::for_collator(
+None,
 available_data_req_receiver,
 Metrics::register(registry)?,
 ))
