From 3f41297f424c9a210da039261b4405a3fffdd8bd Mon Sep 17 00:00:00 2001 From: Julian Eager Date: Wed, 22 Nov 2023 11:10:46 +0800 Subject: [PATCH 1/3] InvalidCandidate::HardTimeout -> PossiblyInvalidError::HardTimeout --- polkadot/node/core/candidate-validation/src/lib.rs | 11 ++++++----- .../node/core/candidate-validation/src/tests.rs | 12 ++++++------ polkadot/node/core/pvf/src/error.rs | 4 ++-- polkadot/node/core/pvf/src/execute/queue.rs | 8 ++++---- polkadot/node/core/pvf/tests/it/main.rs | 13 +++++++------ polkadot/node/core/pvf/tests/it/process.rs | 2 +- 6 files changed, 26 insertions(+), 24 deletions(-) diff --git a/polkadot/node/core/candidate-validation/src/lib.rs b/polkadot/node/core/candidate-validation/src/lib.rs index 9f7b17f61299..6db8e65e227b 100644 --- a/polkadot/node/core/candidate-validation/src/lib.rs +++ b/polkadot/node/core/candidate-validation/src/lib.rs @@ -651,21 +651,22 @@ async fn validate_candidate_exhaustive( ); Err(ValidationFailed(e.to_string())) }, - Err(ValidationError::Invalid(WasmInvalidCandidate::HardTimeout)) => - Ok(ValidationResult::Invalid(InvalidCandidate::Timeout)), Err(ValidationError::Invalid(WasmInvalidCandidate::WorkerReportedInvalid(e))) => Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(e))), + Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)) => Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError( "ambiguous worker death".to_string(), ))), - Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))) => - Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))), - Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousJobDeath(err))) => Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(format!( "ambiguous job death: {err}" )))), + Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::HardTimeout)) => + Ok(ValidationResult::Invalid(InvalidCandidate::Timeout)), + Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))) => + Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))), + Err(ValidationError::Preparation(e)) => { gum::warn!( target: LOG_TARGET, diff --git a/polkadot/node/core/candidate-validation/src/tests.rs b/polkadot/node/core/candidate-validation/src/tests.rs index 110785804652..14e354a467da 100644 --- a/polkadot/node/core/candidate-validation/src/tests.rs +++ b/polkadot/node/core/candidate-validation/src/tests.rs @@ -480,8 +480,8 @@ fn candidate_validation_bad_return_is_invalid() { let candidate_receipt = CandidateReceipt { descriptor, commitments_hash: Hash::zero() }; let v = executor::block_on(validate_candidate_exhaustive( - MockValidateCandidateBackend::with_hardcoded_result(Err(ValidationError::Invalid( - WasmInvalidCandidate::HardTimeout, + MockValidateCandidateBackend::with_hardcoded_result(Err(ValidationError::PossiblyInvalid( + PossiblyInvalidError::HardTimeout, ))), validation_data, validation_code, @@ -758,8 +758,8 @@ fn candidate_validation_timeout_is_internal_error() { let candidate_receipt = CandidateReceipt { descriptor, commitments_hash: Hash::zero() }; let v = executor::block_on(validate_candidate_exhaustive( - MockValidateCandidateBackend::with_hardcoded_result(Err(ValidationError::Invalid( - WasmInvalidCandidate::HardTimeout, + MockValidateCandidateBackend::with_hardcoded_result(Err(ValidationError::PossiblyInvalid( + PossiblyInvalidError::HardTimeout, ))), validation_data, validation_code, @@ -852,8 +852,8 @@ fn candidate_validation_code_mismatch_is_invalid() { let (_ctx, _ctx_handle) = test_helpers::make_subsystem_context::(pool.clone()); let v = executor::block_on(validate_candidate_exhaustive( - MockValidateCandidateBackend::with_hardcoded_result(Err(ValidationError::Invalid( - WasmInvalidCandidate::HardTimeout, + MockValidateCandidateBackend::with_hardcoded_result(Err(ValidationError::PossiblyInvalid( + PossiblyInvalidError::HardTimeout, ))), validation_data, validation_code, diff --git a/polkadot/node/core/pvf/src/error.rs b/polkadot/node/core/pvf/src/error.rs index 442443f326e9..f5ea5df8207c 100644 --- a/polkadot/node/core/pvf/src/error.rs +++ b/polkadot/node/core/pvf/src/error.rs @@ -44,8 +44,6 @@ pub enum InvalidCandidate { /// The candidate is reported to be invalid by the execution worker. The string contains the /// error message. WorkerReportedInvalid(String), - /// PVF execution (compilation is not included) took more time than was allotted. - HardTimeout, } /// Possibly transient issue that may resolve after retries. @@ -70,6 +68,8 @@ pub enum PossiblyInvalidError { /// /// We cannot treat this as an internal error because malicious code may have caused this. AmbiguousJobDeath(String), + /// PVF execution (compilation is not included) took more time than was allotted. + HardTimeout, /// An unexpected error occurred in the job process and we can't be sure whether the candidate /// is really invalid or some internal glitch occurred. Whenever we are unsure, we can never /// treat an error as internal as we would abstain from voting. This is bad because if the diff --git a/polkadot/node/core/pvf/src/execute/queue.rs b/polkadot/node/core/pvf/src/execute/queue.rs index a0c24fd44323..91dfa064d106 100644 --- a/polkadot/node/core/pvf/src/execute/queue.rs +++ b/polkadot/node/core/pvf/src/execute/queue.rs @@ -346,10 +346,6 @@ fn handle_job_finish( None, ), Outcome::InternalError { err } => (None, Err(ValidationError::Internal(err)), None), - // Either the worker or the job timed out. Kill the worker in either case. Treated as - // definitely-invalid, because if we timed out, there's no time left for a retry. - Outcome::HardTimeout => - (None, Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)), None), // "Maybe invalid" errors (will retry). Outcome::WorkerIntfErr => ( None, @@ -361,6 +357,10 @@ fn handle_job_finish( Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousJobDeath(err))), None, ), + // Either the worker or the job timed out. Kill the worker in either case. Treated as + // definitely-invalid, because if we timed out, there's no time left for a retry. + Outcome::HardTimeout => + (None, Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::HardTimeout)), None), Outcome::JobError { err } => (None, Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))), None), }; diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index 4c81ac502dd4..839ffaad97ff 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -19,8 +19,9 @@ use assert_matches::assert_matches; use parity_scale_codec::Encode as _; use polkadot_node_core_pvf::{ - start, testing::build_workers_and_get_paths, Config, InvalidCandidate, Metrics, PrepareError, - PrepareJobKind, PvfPrepData, ValidationError, ValidationHost, JOB_TIMEOUT_WALL_CLOCK_FACTOR, + start, testing::build_workers_and_get_paths, Config, InvalidCandidate, Metrics, + PossiblyInvalidError, PrepareError, PrepareJobKind, PvfPrepData, ValidationError, + ValidationHost, JOB_TIMEOUT_WALL_CLOCK_FACTOR, }; use polkadot_parachain_primitives::primitives::{BlockData, ValidationParams, ValidationResult}; use polkadot_primitives::{ExecutorParam, ExecutorParams}; @@ -162,7 +163,7 @@ async fn execute_job_terminates_on_timeout() { .await; match result { - Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)) => {}, + Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::HardTimeout)) => {}, r => panic!("{:?}", r), } @@ -202,8 +203,8 @@ async fn ensure_parallel_execution() { assert_matches!( (res1, res2), ( - Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)), - Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)) + Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::HardTimeout)), + Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::HardTimeout)) ) ); @@ -344,7 +345,7 @@ async fn deleting_prepared_artifact_does_not_dispute() { .await; match result { - Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)) => {}, + Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::HardTimeout)) => {}, r => panic!("{:?}", r), } } diff --git a/polkadot/node/core/pvf/tests/it/process.rs b/polkadot/node/core/pvf/tests/it/process.rs index b742acb15d02..ab50c19470d9 100644 --- a/polkadot/node/core/pvf/tests/it/process.rs +++ b/polkadot/node/core/pvf/tests/it/process.rs @@ -153,7 +153,7 @@ rusty_fork_test! { assert_matches!( result, - Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)) + Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::HardTimeout)) ); }) } From 7f7d8a351cbe9ae1bdda9091f342ec80e45d8092 Mon Sep 17 00:00:00 2001 From: Julian Eager Date: Wed, 22 Nov 2023 12:33:21 +0800 Subject: [PATCH 2/3] retry on job timeout --- polkadot/node/core/candidate-validation/src/lib.rs | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/polkadot/node/core/candidate-validation/src/lib.rs b/polkadot/node/core/candidate-validation/src/lib.rs index 6db8e65e227b..d371637a5dc1 100644 --- a/polkadot/node/core/candidate-validation/src/lib.rs +++ b/polkadot/node/core/candidate-validation/src/lib.rs @@ -767,21 +767,20 @@ trait ValidationBackend { } else { break; }, - - Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(_))) => + Err(ValidationError::PossiblyInvalid( + PossiblyInvalidError::JobError(_) | PossiblyInvalidError::HardTimeout, + )) => if num_job_error_retries_left > 0 { num_job_error_retries_left -= 1; } else { break; }, - Err(ValidationError::Internal(_)) => if num_internal_retries_left > 0 { num_internal_retries_left -= 1; } else { break; }, - Ok(_) | Err(ValidationError::Invalid(_) | ValidationError::Preparation(_)) => break, } From ed52f944dde225364899071a34789466e0d0bb02 Mon Sep 17 00:00:00 2001 From: Julian Eager Date: Wed, 22 Nov 2023 14:15:23 +0800 Subject: [PATCH 3/3] rm unused imports --- polkadot/node/core/pvf/tests/it/main.rs | 6 +++--- polkadot/node/core/pvf/tests/it/process.rs | 4 +--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index 839ffaad97ff..22e828d135f0 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -19,9 +19,9 @@ use assert_matches::assert_matches; use parity_scale_codec::Encode as _; use polkadot_node_core_pvf::{ - start, testing::build_workers_and_get_paths, Config, InvalidCandidate, Metrics, - PossiblyInvalidError, PrepareError, PrepareJobKind, PvfPrepData, ValidationError, - ValidationHost, JOB_TIMEOUT_WALL_CLOCK_FACTOR, + start, testing::build_workers_and_get_paths, Config, Metrics, PossiblyInvalidError, + PrepareError, PrepareJobKind, PvfPrepData, ValidationError, ValidationHost, + JOB_TIMEOUT_WALL_CLOCK_FACTOR, }; use polkadot_parachain_primitives::primitives::{BlockData, ValidationParams, ValidationResult}; use polkadot_primitives::{ExecutorParam, ExecutorParams}; diff --git a/polkadot/node/core/pvf/tests/it/process.rs b/polkadot/node/core/pvf/tests/it/process.rs index ab50c19470d9..08279297a47a 100644 --- a/polkadot/node/core/pvf/tests/it/process.rs +++ b/polkadot/node/core/pvf/tests/it/process.rs @@ -19,9 +19,7 @@ use super::TestHost; use assert_matches::assert_matches; -use polkadot_node_core_pvf::{ - InvalidCandidate, PossiblyInvalidError, PrepareError, ValidationError, -}; +use polkadot_node_core_pvf::{PossiblyInvalidError, PrepareError, ValidationError}; use polkadot_parachain_primitives::primitives::{BlockData, ValidationParams}; use procfs::process; use rusty_fork::rusty_fork_test;