Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Retry timeout #2438

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 9 additions & 9 deletions polkadot/node/core/candidate-validation/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -651,21 +651,22 @@ async fn validate_candidate_exhaustive(
);
Err(ValidationFailed(e.to_string()))
},
Err(ValidationError::Invalid(WasmInvalidCandidate::HardTimeout)) =>
Ok(ValidationResult::Invalid(InvalidCandidate::Timeout)),
Err(ValidationError::Invalid(WasmInvalidCandidate::WorkerReportedInvalid(e))) =>
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(e))),

Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousWorkerDeath)) =>
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(
"ambiguous worker death".to_string(),
))),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))) =>
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))),

Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousJobDeath(err))) =>
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(format!(
"ambiguous job death: {err}"
)))),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::HardTimeout)) =>
Ok(ValidationResult::Invalid(InvalidCandidate::Timeout)),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))) =>
Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(err))),

Err(ValidationError::Preparation(e)) => {
gum::warn!(
target: LOG_TARGET,
Expand Down Expand Up @@ -766,21 +767,20 @@ trait ValidationBackend {
} else {
break;
},

Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(_))) =>
Err(ValidationError::PossiblyInvalid(
PossiblyInvalidError::JobError(_) | PossiblyInvalidError::HardTimeout,
)) =>
if num_job_error_retries_left > 0 {
num_job_error_retries_left -= 1;
} else {
break;
},

Err(ValidationError::Internal(_)) =>
if num_internal_retries_left > 0 {
num_internal_retries_left -= 1;
} else {
break;
},

Ok(_) | Err(ValidationError::Invalid(_) | ValidationError::Preparation(_)) => break,
}

Expand Down
12 changes: 6 additions & 6 deletions polkadot/node/core/candidate-validation/src/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -480,8 +480,8 @@ fn candidate_validation_bad_return_is_invalid() {
let candidate_receipt = CandidateReceipt { descriptor, commitments_hash: Hash::zero() };

let v = executor::block_on(validate_candidate_exhaustive(
MockValidateCandidateBackend::with_hardcoded_result(Err(ValidationError::Invalid(
WasmInvalidCandidate::HardTimeout,
MockValidateCandidateBackend::with_hardcoded_result(Err(ValidationError::PossiblyInvalid(
PossiblyInvalidError::HardTimeout,
))),
validation_data,
validation_code,
Expand Down Expand Up @@ -758,8 +758,8 @@ fn candidate_validation_timeout_is_internal_error() {
let candidate_receipt = CandidateReceipt { descriptor, commitments_hash: Hash::zero() };

let v = executor::block_on(validate_candidate_exhaustive(
MockValidateCandidateBackend::with_hardcoded_result(Err(ValidationError::Invalid(
WasmInvalidCandidate::HardTimeout,
MockValidateCandidateBackend::with_hardcoded_result(Err(ValidationError::PossiblyInvalid(
PossiblyInvalidError::HardTimeout,
))),
validation_data,
validation_code,
Expand Down Expand Up @@ -852,8 +852,8 @@ fn candidate_validation_code_mismatch_is_invalid() {
let (_ctx, _ctx_handle) = test_helpers::make_subsystem_context::<AllMessages, _>(pool.clone());

let v = executor::block_on(validate_candidate_exhaustive(
MockValidateCandidateBackend::with_hardcoded_result(Err(ValidationError::Invalid(
WasmInvalidCandidate::HardTimeout,
MockValidateCandidateBackend::with_hardcoded_result(Err(ValidationError::PossiblyInvalid(
PossiblyInvalidError::HardTimeout,
))),
validation_data,
validation_code,
Expand Down
4 changes: 2 additions & 2 deletions polkadot/node/core/pvf/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -44,8 +44,6 @@ pub enum InvalidCandidate {
/// The candidate is reported to be invalid by the execution worker. The string contains the
/// error message.
WorkerReportedInvalid(String),
/// PVF execution (compilation is not included) took more time than was allotted.
HardTimeout,
}

/// Possibly transient issue that may resolve after retries.
Expand All @@ -70,6 +68,8 @@ pub enum PossiblyInvalidError {
///
/// We cannot treat this as an internal error because malicious code may have caused this.
AmbiguousJobDeath(String),
/// PVF execution (compilation is not included) took more time than was allotted.
HardTimeout,
/// An unexpected error occurred in the job process and we can't be sure whether the candidate
/// is really invalid or some internal glitch occurred. Whenever we are unsure, we can never
/// treat an error as internal as we would abstain from voting. This is bad because if the
Expand Down
8 changes: 4 additions & 4 deletions polkadot/node/core/pvf/src/execute/queue.rs
Original file line number Diff line number Diff line change
Expand Up @@ -346,10 +346,6 @@ fn handle_job_finish(
None,
),
Outcome::InternalError { err } => (None, Err(ValidationError::Internal(err)), None),
// Either the worker or the job timed out. Kill the worker in either case. Treated as
// definitely-invalid, because if we timed out, there's no time left for a retry.
Outcome::HardTimeout =>
(None, Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)), None),
// "Maybe invalid" errors (will retry).
Outcome::WorkerIntfErr => (
None,
Expand All @@ -361,6 +357,10 @@ fn handle_job_finish(
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::AmbiguousJobDeath(err))),
None,
),
// Either the worker or the job timed out. Kill the worker in either case. Treated as
// definitely-invalid, because if we timed out, there's no time left for a retry.
Comment on lines +360 to +361
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could be reworded as it's potentially confusing. It is only definitely-invalid in backing where we don't retry the possibly-invalid candidates. In backing there's no time for a retry (and it's stricter).

Outcome::HardTimeout =>
(None, Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::HardTimeout)), None),
Outcome::JobError { err } =>
(None, Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::JobError(err))), None),
};
Expand Down
13 changes: 7 additions & 6 deletions polkadot/node/core/pvf/tests/it/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,9 @@
use assert_matches::assert_matches;
use parity_scale_codec::Encode as _;
use polkadot_node_core_pvf::{
start, testing::build_workers_and_get_paths, Config, InvalidCandidate, Metrics, PrepareError,
PrepareJobKind, PvfPrepData, ValidationError, ValidationHost, JOB_TIMEOUT_WALL_CLOCK_FACTOR,
start, testing::build_workers_and_get_paths, Config, Metrics, PossiblyInvalidError,
PrepareError, PrepareJobKind, PvfPrepData, ValidationError, ValidationHost,
JOB_TIMEOUT_WALL_CLOCK_FACTOR,
};
use polkadot_parachain_primitives::primitives::{BlockData, ValidationParams, ValidationResult};
use polkadot_primitives::{ExecutorParam, ExecutorParams};
Expand Down Expand Up @@ -162,7 +163,7 @@ async fn execute_job_terminates_on_timeout() {
.await;

match result {
Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)) => {},
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::HardTimeout)) => {},
r => panic!("{:?}", r),
}

Expand Down Expand Up @@ -202,8 +203,8 @@ async fn ensure_parallel_execution() {
assert_matches!(
(res1, res2),
(
Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)),
Err(ValidationError::Invalid(InvalidCandidate::HardTimeout))
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::HardTimeout)),
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::HardTimeout))
)
);

Expand Down Expand Up @@ -344,7 +345,7 @@ async fn deleting_prepared_artifact_does_not_dispute() {
.await;

match result {
Err(ValidationError::Invalid(InvalidCandidate::HardTimeout)) => {},
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::HardTimeout)) => {},
r => panic!("{:?}", r),
}
}
Expand Down
6 changes: 2 additions & 4 deletions polkadot/node/core/pvf/tests/it/process.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,7 @@

use super::TestHost;
use assert_matches::assert_matches;
use polkadot_node_core_pvf::{
InvalidCandidate, PossiblyInvalidError, PrepareError, ValidationError,
};
use polkadot_node_core_pvf::{PossiblyInvalidError, PrepareError, ValidationError};
use polkadot_parachain_primitives::primitives::{BlockData, ValidationParams};
use procfs::process;
use rusty_fork::rusty_fork_test;
Expand Down Expand Up @@ -153,7 +151,7 @@ rusty_fork_test! {

assert_matches!(
result,
Err(ValidationError::Invalid(InvalidCandidate::HardTimeout))
Err(ValidationError::PossiblyInvalid(PossiblyInvalidError::HardTimeout))
);
})
}
Expand Down
Loading