From 9faea380dce6db9aabba29cb01328df229764863 Mon Sep 17 00:00:00 2001 From: Marcin S Date: Tue, 31 Oct 2023 11:08:08 +0100 Subject: [PATCH] PVF worker: Add seccomp restrictions (restrict networking) (#2009) --- Cargo.lock | 63 ++- .../node/core/candidate-validation/src/lib.rs | 3 +- polkadot/node/core/pvf/Cargo.toml | 5 + .../benches/host_prepare_rococo_runtime.rs | 18 +- polkadot/node/core/pvf/common/Cargo.toml | 3 +- polkadot/node/core/pvf/common/src/lib.rs | 5 +- .../node/core/pvf/common/src/worker/mod.rs | 57 +- .../core/pvf/common/src/worker/security.rs | 512 ------------------ .../common/src/worker/security/landlock.rs | 325 +++++++++++ .../pvf/common/src/worker/security/mod.rs | 189 +++++++ .../pvf/common/src/worker/security/seccomp.rs | 201 +++++++ .../node/core/pvf/execute-worker/Cargo.toml | 1 - .../node/core/pvf/execute-worker/src/lib.rs | 4 +- .../node/core/pvf/prepare-worker/Cargo.toml | 1 - .../node/core/pvf/prepare-worker/src/lib.rs | 9 +- .../pvf/prepare-worker/src/memory_stats.rs | 2 +- .../node/core/pvf/src/execute/worker_intf.rs | 35 +- polkadot/node/core/pvf/src/host.rs | 126 +---- polkadot/node/core/pvf/src/lib.rs | 1 + .../node/core/pvf/src/prepare/worker_intf.rs | 37 +- polkadot/node/core/pvf/src/security.rs | 312 +++++++++++ polkadot/node/core/pvf/src/worker_intf.rs | 5 +- polkadot/node/core/pvf/tests/it/adder.rs | 17 +- polkadot/node/core/pvf/tests/it/main.rs | 128 ++++- .../src/node/utility/pvf-host-and-workers.md | 13 + .../list-syscalls/execute-worker-syscalls | 9 - .../list-syscalls/prepare-worker-syscalls | 9 - 27 files changed, 1376 insertions(+), 714 deletions(-) delete mode 100644 polkadot/node/core/pvf/common/src/worker/security.rs create mode 100644 polkadot/node/core/pvf/common/src/worker/security/landlock.rs create mode 100644 polkadot/node/core/pvf/common/src/worker/security/mod.rs create mode 100644 polkadot/node/core/pvf/common/src/worker/security/seccomp.rs create mode 100644 polkadot/node/core/pvf/src/security.rs diff --git a/Cargo.lock b/Cargo.lock index c95cc70b0da4..c368a957764e 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5642,7 +5642,7 @@ version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2eeb4ed9e12f43b7fa0baae3f9cdda28352770132ef2e09a23760c29cae8bd47" dependencies = [ - "rustix 0.38.8", + "rustix 0.38.21", "windows-sys 0.48.0", ] @@ -6596,7 +6596,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" dependencies = [ "hermit-abi 0.3.2", - "rustix 0.38.8", + "rustix 0.38.21", "windows-sys 0.48.0", ] @@ -7049,9 +7049,9 @@ checksum = "884e2677b40cc8c339eaefcb701c32ef1fd2493d71118dc0ca4b6a736c93bd67" [[package]] name = "libc" -version = "0.2.147" +version = "0.2.149" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" +checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" [[package]] name = "libflate" @@ -7635,9 +7635,9 @@ checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" [[package]] name = "linux-raw-sys" -version = "0.4.5" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57bcfdad1b858c2db7c38303a6d2ad4dfaf5eb53dfeb0910128b2c26d6158503" +checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" [[package]] name = "lioness" @@ -12279,8 +12279,10 @@ dependencies = [ "polkadot-node-primitives", "polkadot-parachain-primitives", "polkadot-primitives", + "procfs", "rand 0.8.5", "rococo-runtime", + "rusty-fork", "slotmap", "sp-core", "sp-maybe-compressed-blob", @@ -12331,12 +12333,13 @@ dependencies = [ "sc-executor", "sc-executor-common", "sc-executor-wasmtime", + "seccompiler", "sp-core", "sp-externalities", "sp-io", "sp-tracing", "tempfile", - "tokio", + "thiserror", "tracing-gum", ] @@ -12354,7 +12357,6 @@ dependencies = [ "sp-core", "sp-maybe-compressed-blob", "sp-tracing", - "tokio", "tracing-gum", ] @@ -12377,7 +12379,6 @@ dependencies = [ "sp-maybe-compressed-blob", "sp-tracing", "tikv-jemalloc-ctl", - "tokio", "tracing-gum", ] @@ -13525,6 +13526,32 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "procfs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" +dependencies = [ + "bitflags 2.4.0", + "chrono", + "flate2", + "hex", + "lazy_static", + "procfs-core", + "rustix 0.38.21", +] + +[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.4.0", + "chrono", + "hex", +] + [[package]] name = "prometheus" version = "0.13.3" @@ -14452,14 +14479,14 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.8" +version = "0.38.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19ed4fa021d81c8392ce04db050a3da9a60299050b7ae1cf482d862b54a7218f" +checksum = "2b426b0506e5d50a7d8dafcf2e81471400deb602392c7dd110815afb4eaf02a3" dependencies = [ "bitflags 2.4.0", "errno", "libc", - "linux-raw-sys 0.4.5", + "linux-raw-sys 0.4.10", "windows-sys 0.48.0", ] @@ -14556,6 +14583,7 @@ dependencies = [ "fnv", "quick-error", "tempfile", + "wait-timeout", ] [[package]] @@ -16182,6 +16210,15 @@ dependencies = [ "zeroize", ] +[[package]] +name = "seccompiler" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "345a3e4dddf721a478089d4697b83c6c0a8f5bf16086f6c13397e4534eb6e2e5" +dependencies = [ + "libc", +] + [[package]] name = "secp256k1" version = "0.24.3" @@ -18410,7 +18447,7 @@ dependencies = [ "cfg-if", "fastrand 2.0.0", "redox_syscall 0.3.5", - "rustix 0.38.8", + "rustix 0.38.21", "windows-sys 0.48.0", ] diff --git a/polkadot/node/core/candidate-validation/src/lib.rs b/polkadot/node/core/candidate-validation/src/lib.rs index 21a7121d47bd..93db7d11cee8 100644 --- a/polkadot/node/core/candidate-validation/src/lib.rs +++ b/polkadot/node/core/candidate-validation/src/lib.rs @@ -149,7 +149,8 @@ async fn run( exec_worker_path, ), pvf_metrics, - ); + ) + .await; ctx.spawn_blocking("pvf-validation-host", task.boxed())?; loop { diff --git a/polkadot/node/core/pvf/Cargo.toml b/polkadot/node/core/pvf/Cargo.toml index bfd70c6fbd41..430f7cd5e8ef 100644 --- a/polkadot/node/core/pvf/Cargo.toml +++ b/polkadot/node/core/pvf/Cargo.toml @@ -39,6 +39,7 @@ polkadot-node-core-pvf-execute-worker = { path = "execute-worker", optional = tr assert_matches = "1.4.0" criterion = { version = "0.4.0", default-features = false, features = ["cargo_bench_support", "async_tokio"] } hex-literal = "0.4.1" + polkadot-node-core-pvf-common = { path = "common", features = ["test-utils"] } # For benches and integration tests, depend on ourselves with the test-utils # feature. @@ -48,6 +49,10 @@ rococo-runtime = { path = "../../../runtime/rococo" } adder = { package = "test-parachain-adder", path = "../../../parachain/test-parachains/adder" } halt = { package = "test-parachain-halt", path = "../../../parachain/test-parachains/halt" } +[target.'cfg(target_os = "linux")'.dev-dependencies] +procfs = "0.16.0" +rusty-fork = "0.3.0" + [[bench]] name = "host_prepare_rococo_runtime" harness = false diff --git a/polkadot/node/core/pvf/benches/host_prepare_rococo_runtime.rs b/polkadot/node/core/pvf/benches/host_prepare_rococo_runtime.rs index 3069fa2b194b..acd80526262c 100644 --- a/polkadot/node/core/pvf/benches/host_prepare_rococo_runtime.rs +++ b/polkadot/node/core/pvf/benches/host_prepare_rococo_runtime.rs @@ -17,18 +17,15 @@ //! Benchmarks for preparation through the host. We use a real PVF to get realistic results. use criterion::{criterion_group, criterion_main, BatchSize, Criterion, SamplingMode}; -use parity_scale_codec::Encode; use polkadot_node_core_pvf::{ start, testing, Config, Metrics, PrepareError, PrepareJobKind, PrepareStats, PvfPrepData, - ValidationError, ValidationHost, + ValidationHost, }; -use polkadot_parachain_primitives::primitives::{BlockData, ValidationParams, ValidationResult}; use polkadot_primitives::ExecutorParams; use rococo_runtime::WASM_BINARY; use std::time::Duration; use tokio::{runtime::Handle, sync::Mutex}; -const TEST_EXECUTION_TIMEOUT: Duration = Duration::from_secs(3); const TEST_PREPARATION_TIMEOUT: Duration = Duration::from_secs(30); struct TestHost { @@ -36,7 +33,7 @@ struct TestHost { } impl TestHost { - fn new_with_config(handle: &Handle, f: F) -> Self + async fn new_with_config(handle: &Handle, f: F) -> Self where F: FnOnce(&mut Config), { @@ -50,7 +47,7 @@ impl TestHost { execute_worker_path, ); f(&mut config); - let (host, task) = start(config, Metrics::default()); + let (host, task) = start(config, Metrics::default()).await; let _ = handle.spawn(task); Self { host: Mutex::new(host) } } @@ -107,15 +104,18 @@ fn host_prepare_rococo_runtime(c: &mut Criterion) { group.measurement_time(Duration::from_secs(240)); group.bench_function("host: prepare Rococo runtime", |b| { b.to_async(&rt).iter_batched( - || { + || async { ( TestHost::new_with_config(rt.handle(), |cfg| { cfg.prepare_workers_hard_max_num = 1; - }), + }) + .await, pvf.clone().code(), ) }, - |(host, pvf_code)| async move { + |result| async move { + let (host, pvf_code) = result.await; + // `PvfPrepData` is designed to be cheap to clone, so cloning shouldn't affect the // benchmark accuracy. let _stats = host.precheck_pvf(&pvf_code, Default::default()).await.unwrap(); diff --git a/polkadot/node/core/pvf/common/Cargo.toml b/polkadot/node/core/pvf/common/Cargo.toml index 5fe2c6b6845c..7dc8d307026e 100644 --- a/polkadot/node/core/pvf/common/Cargo.toml +++ b/polkadot/node/core/pvf/common/Cargo.toml @@ -12,7 +12,6 @@ cpu-time = "1.0.0" futures = "0.3.21" gum = { package = "tracing-gum", path = "../../../gum" } libc = "0.2.139" -tokio = { version = "1.24.2", features = ["fs", "process", "io-util"] } parity-scale-codec = { version = "3.6.1", default-features = false, features = ["derive"] } @@ -30,6 +29,8 @@ sp-tracing = { path = "../../../../../substrate/primitives/tracing" } [target.'cfg(target_os = "linux")'.dependencies] landlock = "0.3.0" +seccompiler = "0.4.0" +thiserror = "1.0.31" [dev-dependencies] assert_matches = "1.4.0" diff --git a/polkadot/node/core/pvf/common/src/lib.rs b/polkadot/node/core/pvf/common/src/lib.rs index 53c287ea9709..e2211b97d87b 100644 --- a/polkadot/node/core/pvf/common/src/lib.rs +++ b/polkadot/node/core/pvf/common/src/lib.rs @@ -32,10 +32,9 @@ pub use sp_tracing; const LOG_TARGET: &str = "parachain::pvf-common"; use std::{ - io::{Read, Write}, + io::{self, Read, Write}, mem, }; -use tokio::io; #[cfg(feature = "test-utils")] pub mod tests { @@ -50,6 +49,8 @@ pub mod tests { pub struct SecurityStatus { /// Whether the landlock features we use are fully available on this system. pub can_enable_landlock: bool, + /// Whether the seccomp features we use are fully available on this system. + pub can_enable_seccomp: bool, // Whether we are able to unshare the user namespace and change the filesystem root. pub can_unshare_user_namespace_and_change_root: bool, } diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs b/polkadot/node/core/pvf/common/src/worker/mod.rs index e7b996ccdc3d..274a2fc80397 100644 --- a/polkadot/node/core/pvf/common/src/worker/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/mod.rs @@ -23,13 +23,12 @@ use cpu_time::ProcessTime; use futures::never::Never; use std::{ any::Any, - fmt, + fmt, io, os::unix::net::UnixStream, path::PathBuf, sync::mpsc::{Receiver, RecvTimeoutError}, time::Duration, }; -use tokio::{io, runtime::Runtime}; /// Use this macro to declare a `fn main() {}` that will create an executable that can be used for /// spawning the desired worker. @@ -85,6 +84,13 @@ macro_rules! decl_worker_main { let status = -1; std::process::exit(status) }, + "--check-can-enable-seccomp" => { + #[cfg(all(target_os = "linux", target_arch = "x86_64"))] + let status = if security::seccomp::check_is_fully_enabled() { 0 } else { -1 }; + #[cfg(not(all(target_os = "linux", target_arch = "x86_64")))] + let status = -1; + std::process::exit(status) + }, "--check-can-unshare-user-namespace-and-change-root" => { #[cfg(target_os = "linux")] let status = if let Err(err) = security::unshare_user_namespace_and_change_root( @@ -129,6 +135,7 @@ macro_rules! decl_worker_main { let mut worker_dir_path = None; let mut node_version = None; let mut can_enable_landlock = false; + let mut can_enable_seccomp = false; let mut can_unshare_user_namespace_and_change_root = false; let mut i = 2; @@ -147,6 +154,7 @@ macro_rules! decl_worker_main { i += 1 }, "--can-enable-landlock" => can_enable_landlock = true, + "--can-enable-seccomp" => can_enable_seccomp = true, "--can-unshare-user-namespace-and-change-root" => can_unshare_user_namespace_and_change_root = true, arg => panic!("Unexpected argument found: {}", arg), @@ -161,6 +169,7 @@ macro_rules! decl_worker_main { let worker_dir_path = std::path::Path::new(worker_dir_path).to_owned(); let security_status = $crate::SecurityStatus { can_enable_landlock, + can_enable_seccomp, can_unshare_user_namespace_and_change_root, }; @@ -198,7 +207,7 @@ impl fmt::Display for WorkerKind { // The worker version must be passed in so that we accurately get the version of the worker, and not // the version that this crate was compiled with. -pub fn worker_event_loop( +pub fn worker_event_loop( worker_kind: WorkerKind, socket_path: PathBuf, #[cfg_attr(not(target_os = "linux"), allow(unused_mut))] mut worker_dir_path: PathBuf, @@ -207,8 +216,7 @@ pub fn worker_event_loop( #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] security_status: &SecurityStatus, mut event_loop: F, ) where - F: FnMut(UnixStream, PathBuf) -> Fut, - Fut: futures::Future>, + F: FnMut(UnixStream, PathBuf) -> io::Result, { let worker_pid = std::process::id(); gum::debug!( @@ -262,7 +270,7 @@ pub fn worker_event_loop( } // Connect to the socket. - let stream = || -> std::io::Result { + let stream = || -> io::Result { let stream = UnixStream::connect(&socket_path)?; let _ = std::fs::remove_file(&socket_path); Ok(stream) @@ -317,6 +325,24 @@ pub fn worker_event_loop( let landlock_status = security::landlock::enable_for_worker(worker_kind, worker_pid, &worker_dir_path); if !matches!(landlock_status, Ok(landlock::RulesetStatus::FullyEnforced)) { + // We previously were able to enable, so this should never happen. + gum::error!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + "could not fully enable landlock: {:?}. This should not happen, please report an issue", + landlock_status + ); + } + } + + // TODO: We can enable the seccomp networking blacklist on aarch64 as well, but we need a CI + // job to catch regressions. See . + #[cfg(all(target_os = "linux", target_arch = "x86_64"))] + if security_status.can_enable_seccomp { + let seccomp_status = + security::seccomp::enable_for_worker(worker_kind, worker_pid, &worker_dir_path); + if !matches!(seccomp_status, Ok(())) { // We previously were able to enable, so this should never happen. // // TODO: Make this a real error in secure-mode. See: @@ -325,8 +351,8 @@ pub fn worker_event_loop( target: LOG_TARGET, %worker_kind, %worker_pid, - "could not fully enable landlock: {:?}. This should not happen, please report to the Polkadot devs", - landlock_status + "could not fully enable seccomp: {:?}. This should not happen, please report an issue", + seccomp_status ); } } @@ -346,18 +372,11 @@ pub fn worker_event_loop( } // Run the main worker loop. - let rt = Runtime::new().expect("Creates tokio runtime. If this panics the worker will die and the host will detect that and deal with it."); - let err = rt - .block_on(event_loop(stream, worker_dir_path)) + let err = event_loop(stream, worker_dir_path) // It's never `Ok` because it's `Ok(Never)`. .unwrap_err(); worker_shutdown_message(worker_kind, worker_pid, &err.to_string()); - - // We don't want tokio to wait for the tasks to finish. We want to bring down the worker as fast - // as possible and not wait for stalled validation to finish. This isn't strictly necessary now, - // but may be in the future. - rt.shutdown_background(); } /// Provide a consistent message on worker shutdown. @@ -438,7 +457,7 @@ fn kill_parent_node_in_emergency() { /// The motivation for this module is to coordinate worker threads without using async Rust. pub mod thread { use std::{ - panic, + io, panic, sync::{Arc, Condvar, Mutex}, thread, time::Duration, @@ -479,7 +498,7 @@ pub mod thread { f: F, cond: Cond, outcome: WaitOutcome, - ) -> std::io::Result> + ) -> io::Result> where F: FnOnce() -> R, F: Send + 'static + panic::UnwindSafe, @@ -497,7 +516,7 @@ pub mod thread { cond: Cond, outcome: WaitOutcome, stack_size: usize, - ) -> std::io::Result> + ) -> io::Result> where F: FnOnce() -> R, F: Send + 'static + panic::UnwindSafe, diff --git a/polkadot/node/core/pvf/common/src/worker/security.rs b/polkadot/node/core/pvf/common/src/worker/security.rs deleted file mode 100644 index 1b7614177448..000000000000 --- a/polkadot/node/core/pvf/common/src/worker/security.rs +++ /dev/null @@ -1,512 +0,0 @@ -// Copyright (C) Parity Technologies (UK) Ltd. -// This file is part of Polkadot. - -// Polkadot is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. - -// Polkadot is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. - -// You should have received a copy of the GNU General Public License -// along with Polkadot. If not, see . - -//! Functionality for securing workers. -//! -//! This is needed because workers are used to compile and execute untrusted code (PVFs). -//! -//! We currently employ the following security measures: -//! -//! - Restrict filesystem -//! - Use Landlock to remove all unnecessary FS access rights. -//! - Unshare the user and mount namespaces. -//! - Change the root directory to a worker-specific temporary directory. -//! - Remove env vars - -use crate::{worker::WorkerKind, LOG_TARGET}; - -/// Unshare the user namespace and change root to be the artifact directory. -/// -/// NOTE: This should not be called in a multi-threaded context. `unshare(2)`: -/// "CLONE_NEWUSER requires that the calling process is not threaded." -#[cfg(target_os = "linux")] -pub fn unshare_user_namespace_and_change_root( - worker_kind: WorkerKind, - worker_pid: u32, - worker_dir_path: &std::path::Path, -) -> Result<(), String> { - use std::{env, ffi::CString, os::unix::ffi::OsStrExt, path::Path, ptr}; - - // The following was copied from the `cstr_core` crate. - // - // TODO: Remove this once this is stable: https://github.com/rust-lang/rust/issues/105723 - #[inline] - #[doc(hidden)] - const fn cstr_is_valid(bytes: &[u8]) -> bool { - if bytes.is_empty() || bytes[bytes.len() - 1] != 0 { - return false - } - - let mut index = 0; - while index < bytes.len() - 1 { - if bytes[index] == 0 { - return false - } - index += 1; - } - true - } - - macro_rules! cstr { - ($e:expr) => {{ - const STR: &[u8] = concat!($e, "\0").as_bytes(); - const STR_VALID: bool = cstr_is_valid(STR); - let _ = [(); 0 - (!(STR_VALID) as usize)]; - #[allow(unused_unsafe)] - unsafe { - core::ffi::CStr::from_bytes_with_nul_unchecked(STR) - } - }} - } - - gum::debug!( - target: LOG_TARGET, - %worker_kind, - %worker_pid, - ?worker_dir_path, - "unsharing the user namespace and calling pivot_root", - ); - - let worker_dir_path_c = CString::new(worker_dir_path.as_os_str().as_bytes()) - .expect("on unix; the path will never contain 0 bytes; qed"); - - // Wrapper around all the work to prevent repetitive error handling. - // - // # Errors - // - // It's the caller's responsibility to call `Error::last_os_error`. Note that that alone does - // not give the context of which call failed, so we return a &str error. - || -> Result<(), &'static str> { - // SAFETY: We pass null-terminated C strings and use the APIs as documented. In fact, steps - // (2) and (3) are adapted from the example in pivot_root(2), with the additional - // change described in the `pivot_root(".", ".")` section. - unsafe { - // 1. `unshare` the user and the mount namespaces. - if libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) < 0 { - return Err("unshare user and mount namespaces") - } - - // 2. Setup mounts. - // - // Ensure that new root and its parent mount don't have shared propagation (which would - // cause pivot_root() to return an error), and prevent propagation of mount events to - // the initial mount namespace. - if libc::mount( - ptr::null(), - cstr!("/").as_ptr(), - ptr::null(), - libc::MS_REC | libc::MS_PRIVATE, - ptr::null(), - ) < 0 - { - return Err("mount MS_PRIVATE") - } - // Ensure that the new root is a mount point. - let additional_flags = - if let WorkerKind::Execute | WorkerKind::CheckPivotRoot = worker_kind { - libc::MS_RDONLY - } else { - 0 - }; - if libc::mount( - worker_dir_path_c.as_ptr(), - worker_dir_path_c.as_ptr(), - ptr::null(), // ignored when MS_BIND is used - libc::MS_BIND | - libc::MS_REC | libc::MS_NOEXEC | - libc::MS_NODEV | libc::MS_NOSUID | - libc::MS_NOATIME | additional_flags, - ptr::null(), // ignored when MS_BIND is used - ) < 0 - { - return Err("mount MS_BIND") - } - - // 3. `pivot_root` to the artifact directory. - if libc::chdir(worker_dir_path_c.as_ptr()) < 0 { - return Err("chdir to worker dir path") - } - if libc::syscall(libc::SYS_pivot_root, cstr!(".").as_ptr(), cstr!(".").as_ptr()) < 0 { - return Err("pivot_root") - } - if libc::umount2(cstr!(".").as_ptr(), libc::MNT_DETACH) < 0 { - return Err("umount the old root mount point") - } - } - - Ok(()) - }() - .map_err(|err_ctx| { - let err = std::io::Error::last_os_error(); - format!("{}: {}", err_ctx, err) - })?; - - // Do some assertions. - if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") { - return Err("expected current dir after pivot_root to be `/`".into()) - } - env::set_current_dir("..").map_err(|err| err.to_string())?; - if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") { - return Err("expected not to be able to break out of new root by doing `..`".into()) - } - - Ok(()) -} - -/// Require env vars to have been removed when spawning the process, to prevent malicious code from -/// accessing them. -pub fn check_env_vars_were_cleared(worker_kind: WorkerKind, worker_pid: u32) -> bool { - let mut ok = true; - - for (key, value) in std::env::vars_os() { - // TODO: *theoretically* the value (or mere presence) of `RUST_LOG` can be a source of - // randomness for malicious code. In the future we can remove it also and log in the host; - // see . - if key == "RUST_LOG" { - continue - } - // An exception for MacOS. This is not a secure platform anyway, so we let it slide. - #[cfg(target_os = "macos")] - if key == "__CF_USER_TEXT_ENCODING" { - continue - } - - gum::error!( - target: LOG_TARGET, - %worker_kind, - %worker_pid, - ?key, - ?value, - "env var was present that should have been removed", - ); - - ok = false; - } - - ok -} - -/// The [landlock] docs say it best: -/// -/// > "Landlock is a security feature available since Linux 5.13. The goal is to enable to restrict -/// ambient rights (e.g., global filesystem access) for a set of processes by creating safe security -/// sandboxes as new security layers in addition to the existing system-wide access-controls. This -/// kind of sandbox is expected to help mitigate the security impact of bugs, unexpected or -/// malicious behaviors in applications. Landlock empowers any process, including unprivileged ones, -/// to securely restrict themselves." -/// -/// [landlock]: https://docs.rs/landlock/latest/landlock/index.html -#[cfg(target_os = "linux")] -pub mod landlock { - pub use landlock::RulesetStatus; - - use crate::{worker::WorkerKind, LOG_TARGET}; - use landlock::*; - use std::{ - fmt, - path::{Path, PathBuf}, - }; - - /// Landlock ABI version. We use ABI V1 because: - /// - /// 1. It is supported by our reference kernel version. - /// 2. Later versions do not (yet) provide additional security that would benefit us. - /// - /// # Versions (as of October 2023) - /// - /// - Polkadot reference kernel version: 5.16+ - /// - /// - ABI V1: kernel 5.13 - Introduces landlock, including full restrictions on file reads. - /// - /// - ABI V2: kernel 5.19 - Adds ability to prevent file renaming. Does not help us. During - /// execution an attacker can only affect the name of a symlinked artifact and not the - /// original one. - /// - /// - ABI V3: kernel 6.2 - Adds ability to prevent file truncation. During execution, can - /// prevent attackers from affecting a symlinked artifact. We don't strictly need this as we - /// plan to check for file integrity anyway; see - /// . - /// - /// # Determinism - /// - /// You may wonder whether we could always use the latest ABI instead of only the ABI supported - /// by the reference kernel version. It seems plausible, since landlock provides a best-effort - /// approach to enabling sandboxing. For example, if the reference version only supported V1 and - /// we were on V2, then landlock would use V2 if it was supported on the current machine, and - /// just fall back to V1 if not. - /// - /// The issue with this is indeterminacy. If half of validators were on V2 and half were on V1, - /// they may have different semantics on some PVFs. So a malicious PVF now has a new attack - /// vector: they can exploit this indeterminism between landlock ABIs! - /// - /// On the other hand we do want validators to be as secure as possible and protect their keys - /// from attackers. And, the risk with indeterminacy is low and there are other indeterminacy - /// vectors anyway. So we will only upgrade to a new ABI if either the reference kernel version - /// supports it or if it introduces some new feature that is beneficial to security. - pub const LANDLOCK_ABI: ABI = ABI::V1; - - #[derive(Debug)] - pub enum TryRestrictError { - InvalidExceptionPath(PathBuf), - RulesetError(RulesetError), - } - - impl From for TryRestrictError { - fn from(err: RulesetError) -> Self { - Self::RulesetError(err) - } - } - - impl fmt::Display for TryRestrictError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::InvalidExceptionPath(path) => write!(f, "invalid exception path: {:?}", path), - Self::RulesetError(err) => write!(f, "ruleset error: {}", err.to_string()), - } - } - } - - impl std::error::Error for TryRestrictError {} - - /// Try to enable landlock for the given kind of worker. - pub fn enable_for_worker( - worker_kind: WorkerKind, - worker_pid: u32, - worker_dir_path: &Path, - ) -> Result> { - let exceptions: Vec<(PathBuf, BitFlags)> = match worker_kind { - WorkerKind::Prepare => { - vec![(worker_dir_path.to_owned(), AccessFs::WriteFile.into())] - }, - WorkerKind::Execute => { - vec![(worker_dir_path.to_owned(), AccessFs::ReadFile.into())] - }, - WorkerKind::CheckPivotRoot => - panic!("this should only be passed for checking pivot_root; qed"), - }; - - gum::debug!( - target: LOG_TARGET, - %worker_kind, - %worker_pid, - ?worker_dir_path, - "enabling landlock with exceptions: {:?}", - exceptions, - ); - - Ok(try_restrict(exceptions)?) - } - - // TODO: - /// Runs a check for landlock and returns a single bool indicating whether the given landlock - /// ABI is fully enabled on the current Linux environment. - pub fn check_is_fully_enabled() -> bool { - let status_from_thread: Result> = - match std::thread::spawn(|| try_restrict(std::iter::empty::<(PathBuf, AccessFs)>())) - .join() - { - Ok(Ok(status)) => Ok(status), - Ok(Err(ruleset_err)) => Err(ruleset_err.into()), - Err(_err) => Err("a panic occurred in try_restrict".into()), - }; - - matches!(status_from_thread, Ok(RulesetStatus::FullyEnforced)) - } - - /// Tries to restrict the current thread (should only be called in a process' main thread) with - /// the following landlock access controls: - /// - /// 1. all global filesystem access restricted, with optional exceptions - /// 2. ... more sandbox types (e.g. networking) may be supported in the future. - /// - /// If landlock is not supported in the current environment this is simply a noop. - /// - /// # Returns - /// - /// The status of the restriction (whether it was fully, partially, or not-at-all enforced). - fn try_restrict(fs_exceptions: I) -> Result - where - I: IntoIterator, - P: AsRef, - A: Into>, - { - let mut ruleset = - Ruleset::default().handle_access(AccessFs::from_all(LANDLOCK_ABI))?.create()?; - for (fs_path, access_bits) in fs_exceptions { - let paths = &[fs_path.as_ref().to_owned()]; - let mut rules = path_beneath_rules(paths, access_bits).peekable(); - if rules.peek().is_none() { - // `path_beneath_rules` silently ignores missing paths, so check for it manually. - return Err(TryRestrictError::InvalidExceptionPath(fs_path.as_ref().to_owned())) - } - ruleset = ruleset.add_rules(rules)?; - } - let status = ruleset.restrict_self()?; - Ok(status.ruleset) - } - - #[cfg(test)] - mod tests { - use super::*; - use std::{fs, io::ErrorKind, thread}; - - #[test] - fn restricted_thread_cannot_read_file() { - // TODO: This would be nice: . - if !check_is_fully_enabled() { - return - } - - // Restricted thread cannot read from FS. - let handle = - thread::spawn(|| { - // Create, write, and read two tmp files. This should succeed before any - // landlock restrictions are applied. - const TEXT: &str = "foo"; - let tmpfile1 = tempfile::NamedTempFile::new().unwrap(); - let path1 = tmpfile1.path(); - let tmpfile2 = tempfile::NamedTempFile::new().unwrap(); - let path2 = tmpfile2.path(); - - fs::write(path1, TEXT).unwrap(); - let s = fs::read_to_string(path1).unwrap(); - assert_eq!(s, TEXT); - fs::write(path2, TEXT).unwrap(); - let s = fs::read_to_string(path2).unwrap(); - assert_eq!(s, TEXT); - - // Apply Landlock with a read exception for only one of the files. - let status = try_restrict(vec![(path1, AccessFs::ReadFile)]); - if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { - panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); - } - - // Try to read from both files, only tmpfile1 should succeed. - let result = fs::read_to_string(path1); - assert!(matches!( - result, - Ok(s) if s == TEXT - )); - let result = fs::read_to_string(path2); - assert!(matches!( - result, - Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) - )); - - // Apply Landlock for all files. - let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>()); - if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { - panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); - } - - // Try to read from tmpfile1 after landlock, it should fail. - let result = fs::read_to_string(path1); - assert!(matches!( - result, - Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) - )); - }); - - assert!(handle.join().is_ok()); - } - - #[test] - fn restricted_thread_cannot_write_file() { - // TODO: This would be nice: . - if !check_is_fully_enabled() { - return - } - - // Restricted thread cannot write to FS. - let handle = - thread::spawn(|| { - // Create and write two tmp files. This should succeed before any landlock - // restrictions are applied. - const TEXT: &str = "foo"; - let tmpfile1 = tempfile::NamedTempFile::new().unwrap(); - let path1 = tmpfile1.path(); - let tmpfile2 = tempfile::NamedTempFile::new().unwrap(); - let path2 = tmpfile2.path(); - - fs::write(path1, TEXT).unwrap(); - fs::write(path2, TEXT).unwrap(); - - // Apply Landlock with a write exception for only one of the files. - let status = try_restrict(vec![(path1, AccessFs::WriteFile)]); - if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { - panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); - } - - // Try to write to both files, only tmpfile1 should succeed. - let result = fs::write(path1, TEXT); - assert!(matches!(result, Ok(_))); - let result = fs::write(path2, TEXT); - assert!(matches!( - result, - Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) - )); - - // Apply Landlock for all files. - let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>()); - if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { - panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); - } - - // Try to write to tmpfile1 after landlock, it should fail. - let result = fs::write(path1, TEXT); - assert!(matches!( - result, - Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) - )); - }); - - assert!(handle.join().is_ok()); - } - - // Test that checks whether landlock under our ABI version is able to truncate files. - #[test] - fn restricted_thread_can_truncate_file() { - // TODO: This would be nice: . - if !check_is_fully_enabled() { - return - } - - // Restricted thread can truncate file. - let handle = - thread::spawn(|| { - // Create and write a file. This should succeed before any landlock - // restrictions are applied. - const TEXT: &str = "foo"; - let tmpfile = tempfile::NamedTempFile::new().unwrap(); - let path = tmpfile.path(); - - fs::write(path, TEXT).unwrap(); - - // Apply Landlock with all exceptions under the current ABI. - let status = try_restrict(vec![(path, AccessFs::from_all(LANDLOCK_ABI))]); - if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { - panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); - } - - // Try to truncate the file. - let result = tmpfile.as_file().set_len(0); - assert!(result.is_ok()); - }); - - assert!(handle.join().is_ok()); - } - } -} diff --git a/polkadot/node/core/pvf/common/src/worker/security/landlock.rs b/polkadot/node/core/pvf/common/src/worker/security/landlock.rs new file mode 100644 index 000000000000..51500c733b8c --- /dev/null +++ b/polkadot/node/core/pvf/common/src/worker/security/landlock.rs @@ -0,0 +1,325 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! The [landlock] docs say it best: +//! +//! > "Landlock is a security feature available since Linux 5.13. The goal is to enable to restrict +//! ambient rights (e.g., global filesystem access) for a set of processes by creating safe security +//! sandboxes as new security layers in addition to the existing system-wide access-controls. This +//! kind of sandbox is expected to help mitigate the security impact of bugs, unexpected or +//! malicious behaviors in applications. Landlock empowers any process, including unprivileged ones, +//! to securely restrict themselves." +//! +//! [landlock]: https://docs.rs/landlock/latest/landlock/index.html + +pub use landlock::RulesetStatus; + +use crate::{ + worker::{stringify_panic_payload, WorkerKind}, + LOG_TARGET, +}; +use landlock::*; +use std::path::{Path, PathBuf}; + +/// Landlock ABI version. We use ABI V1 because: +/// +/// 1. It is supported by our reference kernel version. +/// 2. Later versions do not (yet) provide additional security that would benefit us. +/// +/// # Versions (as of October 2023) +/// +/// - Polkadot reference kernel version: 5.16+ +/// +/// - ABI V1: kernel 5.13 - Introduces landlock, including full restrictions on file reads. +/// +/// - ABI V2: kernel 5.19 - Adds ability to prevent file renaming. Does not help us. During +/// execution an attacker can only affect the name of a symlinked artifact and not the original +/// one. +/// +/// - ABI V3: kernel 6.2 - Adds ability to prevent file truncation. During execution, can +/// prevent attackers from affecting a symlinked artifact. We don't strictly need this as we +/// plan to check for file integrity anyway; see +/// . +/// +/// # Determinism +/// +/// You may wonder whether we could always use the latest ABI instead of only the ABI supported +/// by the reference kernel version. It seems plausible, since landlock provides a best-effort +/// approach to enabling sandboxing. For example, if the reference version only supported V1 and +/// we were on V2, then landlock would use V2 if it was supported on the current machine, and +/// just fall back to V1 if not. +/// +/// The issue with this is indeterminacy. If half of validators were on V2 and half were on V1, +/// they may have different semantics on some PVFs. So a malicious PVF now has a new attack +/// vector: they can exploit this indeterminism between landlock ABIs! +/// +/// On the other hand we do want validators to be as secure as possible and protect their keys +/// from attackers. And, the risk with indeterminacy is low and there are other indeterminacy +/// vectors anyway. So we will only upgrade to a new ABI if either the reference kernel version +/// supports it or if it introduces some new feature that is beneficial to security. +pub const LANDLOCK_ABI: ABI = ABI::V1; + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("Invalid exception path: {0:?}")] + InvalidExceptionPath(PathBuf), + #[error(transparent)] + RulesetError(#[from] RulesetError), + #[error("A panic occurred in try_restrict: {0}")] + Panic(String), +} + +pub type Result = std::result::Result; + +/// Try to enable landlock for the given kind of worker. +pub fn enable_for_worker( + worker_kind: WorkerKind, + worker_pid: u32, + worker_dir_path: &Path, +) -> Result { + let exceptions: Vec<(PathBuf, BitFlags)> = match worker_kind { + WorkerKind::Prepare => { + vec![(worker_dir_path.to_owned(), AccessFs::WriteFile.into())] + }, + WorkerKind::Execute => { + vec![(worker_dir_path.to_owned(), AccessFs::ReadFile.into())] + }, + WorkerKind::CheckPivotRoot => + panic!("this should only be passed for checking pivot_root; qed"), + }; + + gum::trace!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + ?worker_dir_path, + "enabling landlock with exceptions: {:?}", + exceptions, + ); + + try_restrict(exceptions) +} + +// TODO: +/// Runs a check for landlock and returns a single bool indicating whether the given landlock +/// ABI is fully enabled on the current Linux environment. +pub fn check_is_fully_enabled() -> bool { + let status_from_thread: Result = + match std::thread::spawn(|| try_restrict(std::iter::empty::<(PathBuf, AccessFs)>())).join() + { + Ok(Ok(status)) => Ok(status), + Ok(Err(ruleset_err)) => Err(ruleset_err.into()), + Err(err) => Err(Error::Panic(stringify_panic_payload(err))), + }; + + matches!(status_from_thread, Ok(RulesetStatus::FullyEnforced)) +} + +/// Tries to restrict the current thread (should only be called in a process' main thread) with +/// the following landlock access controls: +/// +/// 1. all global filesystem access restricted, with optional exceptions +/// 2. ... more sandbox types (e.g. networking) may be supported in the future. +/// +/// If landlock is not supported in the current environment this is simply a noop. +/// +/// # Returns +/// +/// The status of the restriction (whether it was fully, partially, or not-at-all enforced). +fn try_restrict(fs_exceptions: I) -> Result +where + I: IntoIterator, + P: AsRef, + A: Into>, +{ + let mut ruleset = + Ruleset::default().handle_access(AccessFs::from_all(LANDLOCK_ABI))?.create()?; + for (fs_path, access_bits) in fs_exceptions { + let paths = &[fs_path.as_ref().to_owned()]; + let mut rules = path_beneath_rules(paths, access_bits).peekable(); + if rules.peek().is_none() { + // `path_beneath_rules` silently ignores missing paths, so check for it manually. + return Err(Error::InvalidExceptionPath(fs_path.as_ref().to_owned())) + } + ruleset = ruleset.add_rules(rules)?; + } + let status = ruleset.restrict_self()?; + Ok(status.ruleset) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::{fs, io::ErrorKind, thread}; + + #[test] + fn restricted_thread_cannot_read_file() { + // TODO: This would be nice: . + if !check_is_fully_enabled() { + return + } + + // Restricted thread cannot read from FS. + let handle = thread::spawn(|| { + // Create, write, and read two tmp files. This should succeed before any + // landlock restrictions are applied. + const TEXT: &str = "foo"; + let tmpfile1 = tempfile::NamedTempFile::new().unwrap(); + let path1 = tmpfile1.path(); + let tmpfile2 = tempfile::NamedTempFile::new().unwrap(); + let path2 = tmpfile2.path(); + + fs::write(path1, TEXT).unwrap(); + let s = fs::read_to_string(path1).unwrap(); + assert_eq!(s, TEXT); + fs::write(path2, TEXT).unwrap(); + let s = fs::read_to_string(path2).unwrap(); + assert_eq!(s, TEXT); + + // Apply Landlock with a read exception for only one of the files. + let status = try_restrict(vec![(path1, AccessFs::ReadFile)]); + if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { + panic!( + "Ruleset should be enforced since we checked if landlock is enabled: {:?}", + status + ); + } + + // Try to read from both files, only tmpfile1 should succeed. + let result = fs::read_to_string(path1); + assert!(matches!( + result, + Ok(s) if s == TEXT + )); + let result = fs::read_to_string(path2); + assert!(matches!( + result, + Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) + )); + + // Apply Landlock for all files. + let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>()); + if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { + panic!( + "Ruleset should be enforced since we checked if landlock is enabled: {:?}", + status + ); + } + + // Try to read from tmpfile1 after landlock, it should fail. + let result = fs::read_to_string(path1); + assert!(matches!( + result, + Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) + )); + }); + + assert!(handle.join().is_ok()); + } + + #[test] + fn restricted_thread_cannot_write_file() { + // TODO: This would be nice: . + if !check_is_fully_enabled() { + return + } + + // Restricted thread cannot write to FS. + let handle = thread::spawn(|| { + // Create and write two tmp files. This should succeed before any landlock + // restrictions are applied. + const TEXT: &str = "foo"; + let tmpfile1 = tempfile::NamedTempFile::new().unwrap(); + let path1 = tmpfile1.path(); + let tmpfile2 = tempfile::NamedTempFile::new().unwrap(); + let path2 = tmpfile2.path(); + + fs::write(path1, TEXT).unwrap(); + fs::write(path2, TEXT).unwrap(); + + // Apply Landlock with a write exception for only one of the files. + let status = try_restrict(vec![(path1, AccessFs::WriteFile)]); + if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { + panic!( + "Ruleset should be enforced since we checked if landlock is enabled: {:?}", + status + ); + } + + // Try to write to both files, only tmpfile1 should succeed. + let result = fs::write(path1, TEXT); + assert!(matches!(result, Ok(_))); + let result = fs::write(path2, TEXT); + assert!(matches!( + result, + Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) + )); + + // Apply Landlock for all files. + let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>()); + if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { + panic!( + "Ruleset should be enforced since we checked if landlock is enabled: {:?}", + status + ); + } + + // Try to write to tmpfile1 after landlock, it should fail. + let result = fs::write(path1, TEXT); + assert!(matches!( + result, + Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) + )); + }); + + assert!(handle.join().is_ok()); + } + + // Test that checks whether landlock under our ABI version is able to truncate files. + #[test] + fn restricted_thread_can_truncate_file() { + // TODO: This would be nice: . + if !check_is_fully_enabled() { + return + } + + // Restricted thread can truncate file. + let handle = thread::spawn(|| { + // Create and write a file. This should succeed before any landlock + // restrictions are applied. + const TEXT: &str = "foo"; + let tmpfile = tempfile::NamedTempFile::new().unwrap(); + let path = tmpfile.path(); + + fs::write(path, TEXT).unwrap(); + + // Apply Landlock with all exceptions under the current ABI. + let status = try_restrict(vec![(path, AccessFs::from_all(LANDLOCK_ABI))]); + if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { + panic!( + "Ruleset should be enforced since we checked if landlock is enabled: {:?}", + status + ); + } + + // Try to truncate the file. + let result = tmpfile.as_file().set_len(0); + assert!(result.is_ok()); + }); + + assert!(handle.join().is_ok()); + } +} diff --git a/polkadot/node/core/pvf/common/src/worker/security/mod.rs b/polkadot/node/core/pvf/common/src/worker/security/mod.rs new file mode 100644 index 000000000000..9a38ed172773 --- /dev/null +++ b/polkadot/node/core/pvf/common/src/worker/security/mod.rs @@ -0,0 +1,189 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! Functionality for securing workers. +//! +//! This is needed because workers are used to compile and execute untrusted code (PVFs). +//! +//! We currently employ the following security measures: +//! +//! - Restrict filesystem +//! - Use Landlock to remove all unnecessary FS access rights. +//! - Unshare the user and mount namespaces. +//! - Change the root directory to a worker-specific temporary directory. +//! - Restrict networking by blocking socket creation and io_uring. +//! - Remove env vars + +use crate::{worker::WorkerKind, LOG_TARGET}; + +#[cfg(target_os = "linux")] +pub mod landlock; + +#[cfg(all(target_os = "linux", target_arch = "x86_64"))] +pub mod seccomp; + +/// Unshare the user namespace and change root to be the artifact directory. +/// +/// NOTE: This should not be called in a multi-threaded context. `unshare(2)`: +/// "CLONE_NEWUSER requires that the calling process is not threaded." +#[cfg(target_os = "linux")] +pub fn unshare_user_namespace_and_change_root( + worker_kind: WorkerKind, + worker_pid: u32, + worker_dir_path: &std::path::Path, +) -> Result<(), String> { + use std::{env, ffi::CString, os::unix::ffi::OsStrExt, path::Path, ptr}; + + // TODO: Remove this once this is stable: https://github.com/rust-lang/rust/issues/105723 + macro_rules! cstr_ptr { + ($e:expr) => { + concat!($e, "\0").as_ptr().cast::() + }; + } + + gum::trace!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + ?worker_dir_path, + "unsharing the user namespace and calling pivot_root", + ); + + let worker_dir_path_c = CString::new(worker_dir_path.as_os_str().as_bytes()) + .expect("on unix; the path will never contain 0 bytes; qed"); + + // Wrapper around all the work to prevent repetitive error handling. + // + // # Errors + // + // It's the caller's responsibility to call `Error::last_os_error`. Note that that alone does + // not give the context of which call failed, so we return a &str error. + || -> Result<(), &'static str> { + // SAFETY: We pass null-terminated C strings and use the APIs as documented. In fact, steps + // (2) and (3) are adapted from the example in pivot_root(2), with the additional + // change described in the `pivot_root(".", ".")` section. + unsafe { + // 1. `unshare` the user and the mount namespaces. + if libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) < 0 { + return Err("unshare user and mount namespaces") + } + + // 2. Setup mounts. + // + // Ensure that new root and its parent mount don't have shared propagation (which would + // cause pivot_root() to return an error), and prevent propagation of mount events to + // the initial mount namespace. + if libc::mount( + ptr::null(), + cstr_ptr!("/"), + ptr::null(), + libc::MS_REC | libc::MS_PRIVATE, + ptr::null(), + ) < 0 + { + return Err("mount MS_PRIVATE") + } + // Ensure that the new root is a mount point. + let additional_flags = + if let WorkerKind::Execute | WorkerKind::CheckPivotRoot = worker_kind { + libc::MS_RDONLY + } else { + 0 + }; + if libc::mount( + worker_dir_path_c.as_ptr(), + worker_dir_path_c.as_ptr(), + ptr::null(), // ignored when MS_BIND is used + libc::MS_BIND | + libc::MS_REC | libc::MS_NOEXEC | + libc::MS_NODEV | libc::MS_NOSUID | + libc::MS_NOATIME | additional_flags, + ptr::null(), // ignored when MS_BIND is used + ) < 0 + { + return Err("mount MS_BIND") + } + + // 3. `pivot_root` to the artifact directory. + if libc::chdir(worker_dir_path_c.as_ptr()) < 0 { + return Err("chdir to worker dir path") + } + if libc::syscall(libc::SYS_pivot_root, cstr_ptr!("."), cstr_ptr!(".")) < 0 { + return Err("pivot_root") + } + if libc::umount2(cstr_ptr!("."), libc::MNT_DETACH) < 0 { + return Err("umount the old root mount point") + } + } + + Ok(()) + }() + .map_err(|err_ctx| { + let err = std::io::Error::last_os_error(); + format!("{}: {}", err_ctx, err) + })?; + + // Do some assertions. + if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") { + return Err("expected current dir after pivot_root to be `/`".into()) + } + env::set_current_dir("..").map_err(|err| err.to_string())?; + if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") { + return Err("expected not to be able to break out of new root by doing `..`".into()) + } + + Ok(()) +} + +/// Require env vars to have been removed when spawning the process, to prevent malicious code from +/// accessing them. +pub fn check_env_vars_were_cleared(worker_kind: WorkerKind, worker_pid: u32) -> bool { + gum::trace!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + "clearing env vars in worker", + ); + + let mut ok = true; + + for (key, value) in std::env::vars_os() { + // TODO: *theoretically* the value (or mere presence) of `RUST_LOG` can be a source of + // randomness for malicious code. In the future we can remove it also and log in the host; + // see . + if key == "RUST_LOG" { + continue + } + // An exception for MacOS. This is not a secure platform anyway, so we let it slide. + #[cfg(target_os = "macos")] + if key == "__CF_USER_TEXT_ENCODING" { + continue + } + + gum::error!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + ?key, + ?value, + "env var was present that should have been removed", + ); + + ok = false; + } + + ok +} diff --git a/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs b/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs new file mode 100644 index 000000000000..5539ad284400 --- /dev/null +++ b/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs @@ -0,0 +1,201 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! Functionality for sandboxing workers by restricting their capabilities by blocking certain +//! syscalls with seccomp. +//! +//! For security we block the following: +//! +//! - creation of new sockets - these are unneeded in PVF jobs, and we can safely block them without +//! affecting consensus. +//! +//! - `io_uring` - allows for networking and needs to be blocked. See below for a discussion on the +//! safety of doing this. +//! +//! # Safety of blocking io_uring +//! +//! `io_uring` is just a way of issuing system calls in an async manner, and there is nothing +//! stopping wasmtime from legitimately using it. Fortunately, at the moment it does not. Generally, +//! not many applications use `io_uring` in production yet, because of the numerous kernel CVEs +//! discovered. It's still under a lot of development. Android outright banned `io_uring` for these +//! reasons. +//! +//! Considering `io_uring`'s status discussed above, and that it very likely would get detected +//! either by our [static analysis](https://github.com/paritytech/polkadot-sdk/pull/1663) or by +//! testing, we think it is safe to block it. +//! +//! ## Consensus analysis +//! +//! If execution hits an edge case code path unique to a given machine, it's already taken a +//! non-deterministic branch anyway. After all, we just care that the majority of validators reach +//! the same result and preserve consensus. So worst-case scenario, there's a dispute, and we can +//! always admit fault and refund the wrong validator. On the other hand, if all validators take the +//! code path that results in a seccomp violation, then they would all vote against the current +//! candidate, which is also fine. The violation would get logged (in big scary letters) and +//! hopefully some validator reports it to us. +//! +//! Actually, a worst-worse-case scenario is that 50% of validators vote against, so that there is +//! no consensus. But so many things would have to go wrong for that to happen: +//! +//! 1. An update to `wasmtime` is introduced that uses io_uring (unlikely as io_uring is mainly for +//! IO-heavy applications) +//! +//! 2. The new syscall is not detected by our static analysis +//! +//! 3. It is never triggered in any of our tests +//! +//! 4. It then gets triggered on some super edge case in production on 50% of validators causing a +//! stall (bad but very unlikely) +//! +//! 5. Or, it triggers on only a few validators causing a dispute (more likely but not as bad) +//! +//! Considering how many things would have to go wrong here, we believe it's safe to block +//! `io_uring`. +//! +//! # Action on syscall violations +//! +//! On syscall violations we currently only log, to make sure this works correctly before enforcing. +//! +//! In the future, when a forbidden syscall is attempted we immediately kill the process in order to +//! prevent the attacker from doing anything else. In execution, this will result in voting against +//! the candidate. + +use crate::{ + worker::{stringify_panic_payload, WorkerKind}, + LOG_TARGET, +}; +use seccompiler::*; +use std::{collections::BTreeMap, path::Path}; + +/// The action to take on caught syscalls. +#[cfg(not(test))] +const CAUGHT_ACTION: SeccompAction = SeccompAction::Log; +/// Don't kill the process when testing. +#[cfg(test)] +const CAUGHT_ACTION: SeccompAction = SeccompAction::Errno(libc::EACCES as u32); + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error(transparent)] + Seccomp(#[from] seccompiler::Error), + #[error(transparent)] + Backend(#[from] seccompiler::BackendError), + #[error("A panic occurred in try_restrict: {0}")] + Panic(String), +} + +pub type Result = std::result::Result; + +/// Try to enable seccomp for the given kind of worker. +pub fn enable_for_worker( + worker_kind: WorkerKind, + worker_pid: u32, + worker_dir_path: &Path, +) -> Result<()> { + gum::trace!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + ?worker_dir_path, + "enabling seccomp", + ); + + try_restrict() +} + +/// Runs a check for seccomp and returns a single bool indicating whether seccomp with our rules is +/// fully enabled on the current Linux environment. +pub fn check_is_fully_enabled() -> bool { + let status_from_thread: Result<()> = match std::thread::spawn(|| try_restrict()).join() { + Ok(Ok(())) => Ok(()), + Ok(Err(err)) => Err(err.into()), + Err(err) => Err(Error::Panic(stringify_panic_payload(err))), + }; + + matches!(status_from_thread, Ok(())) +} + +/// Applies a `seccomp` filter to disable networking for the PVF threads. +pub fn try_restrict() -> Result<()> { + // Build a `seccomp` filter which by default allows all syscalls except those blocked in the + // blacklist. + let mut blacklisted_rules = BTreeMap::default(); + + // Restrict the creation of sockets. + blacklisted_rules.insert(libc::SYS_socketpair, vec![]); + blacklisted_rules.insert(libc::SYS_socket, vec![]); + + // Prevent connecting to sockets for extra safety. + blacklisted_rules.insert(libc::SYS_connect, vec![]); + + // Restrict io_uring. + blacklisted_rules.insert(libc::SYS_io_uring_setup, vec![]); + blacklisted_rules.insert(libc::SYS_io_uring_enter, vec![]); + blacklisted_rules.insert(libc::SYS_io_uring_register, vec![]); + + let filter = SeccompFilter::new( + blacklisted_rules, + // Mismatch action: what to do if not in rule list. + SeccompAction::Allow, + // Match action: what to do if in rule list. + CAUGHT_ACTION, + TargetArch::x86_64, + )?; + + let bpf_prog: BpfProgram = filter.try_into()?; + + // Applies filter (runs seccomp) to the calling thread. + seccompiler::apply_filter(&bpf_prog)?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::{io::ErrorKind, net::TcpListener, thread}; + + #[test] + fn sandboxed_thread_cannot_use_sockets() { + // TODO: This would be nice: . + if !check_is_fully_enabled() { + return + } + + let handle = thread::spawn(|| { + // Open a socket, this should succeed before seccomp is applied. + TcpListener::bind("127.0.0.1:0").unwrap(); + + let status = try_restrict(); + if !matches!(status, Ok(())) { + panic!("Ruleset should be enforced since we checked if seccomp is enabled"); + } + + // Try to open a socket after seccomp. + assert!(matches!( + TcpListener::bind("127.0.0.1:0"), + Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) + )); + + // Other syscalls should still work. + unsafe { + assert!(libc::getppid() > 0); + } + }); + + assert!(handle.join().is_ok()); + } +} diff --git a/polkadot/node/core/pvf/execute-worker/Cargo.toml b/polkadot/node/core/pvf/execute-worker/Cargo.toml index 23678d95696e..203bbd0e7859 100644 --- a/polkadot/node/core/pvf/execute-worker/Cargo.toml +++ b/polkadot/node/core/pvf/execute-worker/Cargo.toml @@ -11,7 +11,6 @@ cpu-time = "1.0.0" futures = "0.3.21" gum = { package = "tracing-gum", path = "../../../gum" } rayon = "1.5.1" -tokio = { version = "1.24.2", features = ["fs", "process"] } parity-scale-codec = { version = "3.6.1", default-features = false, features = ["derive"] } diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index af73eb16e685..8872f9bc8dd3 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -39,12 +39,12 @@ use polkadot_node_core_pvf_common::{ use polkadot_parachain_primitives::primitives::ValidationResult; use polkadot_primitives::{executor_params::DEFAULT_NATIVE_STACK_MAX, ExecutorParams}; use std::{ + io, os::unix::net::UnixStream, path::PathBuf, sync::{mpsc::channel, Arc}, time::Duration, }; -use tokio::io; // Wasmtime powers the Substrate Executor. It compiles the wasm bytecode into native code. // That native code does not create any stacks and just reuses the stack of the thread that @@ -138,7 +138,7 @@ pub fn worker_entrypoint( node_version, worker_version, &security_status, - |mut stream, worker_dir_path| async move { + |mut stream, worker_dir_path| { let worker_pid = std::process::id(); let artifact_path = worker_dir::execute_artifact(&worker_dir_path); diff --git a/polkadot/node/core/pvf/prepare-worker/Cargo.toml b/polkadot/node/core/pvf/prepare-worker/Cargo.toml index 886209b78c32..eb53ebdc941b 100644 --- a/polkadot/node/core/pvf/prepare-worker/Cargo.toml +++ b/polkadot/node/core/pvf/prepare-worker/Cargo.toml @@ -13,7 +13,6 @@ gum = { package = "tracing-gum", path = "../../../gum" } libc = "0.2.139" rayon = "1.5.1" tikv-jemalloc-ctl = { version = "0.5.0", optional = true } -tokio = { version = "1.24.2", features = ["fs", "process"] } parity-scale-codec = { version = "3.6.1", default-features = false, features = ["derive"] } diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index fa5d3656a35e..926d9cabe182 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -45,12 +45,12 @@ use polkadot_node_core_pvf_common::{ }; use polkadot_primitives::ExecutorParams; use std::{ + fs, io, os::unix::net::UnixStream, path::PathBuf, sync::{mpsc::channel, Arc}, time::Duration, }; -use tokio::io; /// Contains the bytes for a successfully compiled artifact. pub struct CompiledArtifact(Vec); @@ -131,7 +131,7 @@ pub fn worker_entrypoint( node_version, worker_version, &security_status, - |mut stream, worker_dir_path| async move { + |mut stream, worker_dir_path| { let worker_pid = std::process::id(); let temp_artifact_dest = worker_dir::prepare_tmp_artifact(&worker_dir_path); @@ -229,8 +229,7 @@ pub fn worker_entrypoint( // Stop the memory stats worker and get its observed memory stats. #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] - let memory_tracker_stats = get_memory_tracker_loop_stats(memory_tracker_thread, worker_pid) - .await; + let memory_tracker_stats = get_memory_tracker_loop_stats(memory_tracker_thread, worker_pid); let memory_stats = MemoryStats { #[cfg(any( target_os = "linux", @@ -255,7 +254,7 @@ pub fn worker_entrypoint( "worker: writing artifact to {}", temp_artifact_dest.display(), ); - tokio::fs::write(&temp_artifact_dest, &artifact).await?; + fs::write(&temp_artifact_dest, &artifact)?; Ok(PrepareStats { cpu_time_elapsed, memory_stats }) }, diff --git a/polkadot/node/core/pvf/prepare-worker/src/memory_stats.rs b/polkadot/node/core/pvf/prepare-worker/src/memory_stats.rs index c70ff56fc84d..5f577b0901c2 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/memory_stats.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/memory_stats.rs @@ -122,7 +122,7 @@ pub mod memory_tracker { } /// Helper function to get the stats from the memory tracker. Helps isolate this error handling. - pub async fn get_memory_tracker_loop_stats( + pub fn get_memory_tracker_loop_stats( thread: JoinHandle>, worker_pid: u32, ) -> Option { diff --git a/polkadot/node/core/pvf/src/execute/worker_intf.rs b/polkadot/node/core/pvf/src/execute/worker_intf.rs index 783c7c7abbc8..61264f7d517d 100644 --- a/polkadot/node/core/pvf/src/execute/worker_intf.rs +++ b/polkadot/node/core/pvf/src/execute/worker_intf.rs @@ -18,6 +18,7 @@ use crate::{ artifacts::ArtifactPathId, + security, worker_intf::{ clear_worker_dir_path, framed_recv, framed_send, spawn_with_program_path, IdleWorker, SpawnErr, WorkerDir, WorkerHandle, JOB_TIMEOUT_WALL_CLOCK_FACTOR, @@ -106,7 +107,7 @@ pub enum Outcome { /// returns the outcome. /// /// NOTE: Not returning the idle worker token in `Outcome` will trigger the child process being -/// killed. +/// killed, if it's still alive. pub async fn start_work( worker: IdleWorker, artifact: ArtifactPathId, @@ -124,7 +125,10 @@ pub async fn start_work( artifact.path.display(), ); + let artifact_path = artifact.path.clone(); with_worker_dir_setup(worker_dir, pid, &artifact.path, |worker_dir| async move { + let audit_log_file = security::AuditLogFile::try_open_and_seek_to_end().await; + if let Err(error) = send_request(&mut stream, &validation_params, execution_timeout).await { gum::warn!( target: LOG_TARGET, @@ -153,9 +157,38 @@ pub async fn start_work( ?error, "failed to recv an execute response", ); + // The worker died. Check if it was due to a seccomp violation. + // + // NOTE: Log, but don't change the outcome. Not all validators may have + // auditing enabled, so we don't want attackers to abuse a non-deterministic + // outcome. + for syscall in security::check_seccomp_violations_for_worker(audit_log_file, pid).await { + gum::error!( + target: LOG_TARGET, + worker_pid = %pid, + %syscall, + validation_code_hash = ?artifact.id.code_hash, + ?artifact_path, + "A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!" + ); + } + return Outcome::IoErr }, Ok(response) => { + // Check if any syscall violations occurred during the job. For now this is + // only informative, as we are not enforcing the seccomp policy yet. + for syscall in security::check_seccomp_violations_for_worker(audit_log_file, pid).await { + gum::error!( + target: LOG_TARGET, + worker_pid = %pid, + %syscall, + validation_code_hash = ?artifact.id.code_hash, + ?artifact_path, + "A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!" + ); + } + if let Response::Ok{duration, ..} = response { if duration > execution_timeout { // The job didn't complete within the timeout. diff --git a/polkadot/node/core/pvf/src/host.rs b/polkadot/node/core/pvf/src/host.rs index 6c9606bb2f3c..dd0bd8581985 100644 --- a/polkadot/node/core/pvf/src/host.rs +++ b/polkadot/node/core/pvf/src/host.rs @@ -24,12 +24,12 @@ use crate::{ artifacts::{ArtifactId, ArtifactPathId, ArtifactState, Artifacts}, execute::{self, PendingExecutionRequest}, metrics::Metrics, - prepare, Priority, ValidationError, LOG_TARGET, + prepare, security, Priority, ValidationError, LOG_TARGET, }; use always_assert::never; use futures::{ channel::{mpsc, oneshot}, - Future, FutureExt, SinkExt, StreamExt, + join, Future, FutureExt, SinkExt, StreamExt, }; use polkadot_node_core_pvf_common::{ error::{PrepareError, PrepareResult}, @@ -153,6 +153,7 @@ pub struct Config { pub cache_path: PathBuf, /// The version of the node. `None` can be passed to skip the version check (only for tests). pub node_version: Option, + /// The path to the program that can be used to spawn the prepare workers. pub prepare_worker_program_path: PathBuf, /// The time allotted for a prepare worker to spawn and report to the host. @@ -162,6 +163,7 @@ pub struct Config { pub prepare_workers_soft_max_num: usize, /// The absolute number of workers that can be spawned in the prepare pool. pub prepare_workers_hard_max_num: usize, + /// The path to the program that can be used to spawn the execute workers. pub execute_worker_program_path: PathBuf, /// The time allotted for an execute worker to spawn and report to the host. @@ -181,10 +183,12 @@ impl Config { Self { cache_path, node_version, + prepare_worker_program_path, prepare_worker_spawn_timeout: Duration::from_secs(3), prepare_workers_soft_max_num: 1, prepare_workers_hard_max_num: 1, + execute_worker_program_path, execute_worker_spawn_timeout: Duration::from_secs(3), execute_workers_max_num: 2, @@ -200,15 +204,24 @@ impl Config { /// The future should not return normally but if it does then that indicates an unrecoverable error. /// In that case all pending requests will be canceled, dropping the result senders and new ones /// will be rejected. -pub fn start(config: Config, metrics: Metrics) -> (ValidationHost, impl Future) { +pub async fn start(config: Config, metrics: Metrics) -> (ValidationHost, impl Future) { gum::debug!(target: LOG_TARGET, ?config, "starting PVF validation host"); // Run checks for supported security features once per host startup. Warn here if not enabled. let security_status = { - let can_enable_landlock = check_landlock(&config.prepare_worker_program_path); - let can_unshare_user_namespace_and_change_root = - check_can_unshare_user_namespace_and_change_root(&config.prepare_worker_program_path); - SecurityStatus { can_enable_landlock, can_unshare_user_namespace_and_change_root } + // TODO: add check that syslog is available and that seccomp violations are logged? + let (can_enable_landlock, can_enable_seccomp, can_unshare_user_namespace_and_change_root) = join!( + security::check_landlock(&config.prepare_worker_program_path), + security::check_seccomp(&config.prepare_worker_program_path), + security::check_can_unshare_user_namespace_and_change_root( + &config.prepare_worker_program_path + ) + ); + SecurityStatus { + can_enable_landlock, + can_enable_seccomp, + can_unshare_user_namespace_and_change_root, + } }; let (to_host_tx, to_host_rx) = mpsc::channel(10); @@ -882,105 +895,6 @@ fn pulse_every(interval: std::time::Duration) -> impl futures::Stream .map(|_| ()) } -/// Check if we can sandbox the root and emit a warning if not. -/// -/// We do this check by spawning a new process and trying to sandbox it. To get as close as possible -/// to running the check in a worker, we try it... in a worker. The expected return status is 0 on -/// success and -1 on failure. -fn check_can_unshare_user_namespace_and_change_root( - #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] - prepare_worker_program_path: &Path, -) -> bool { - cfg_if::cfg_if! { - if #[cfg(target_os = "linux")] { - let output = std::process::Command::new(prepare_worker_program_path) - .arg("--check-can-unshare-user-namespace-and-change-root") - .output(); - - match output { - Ok(output) if output.status.success() => true, - Ok(output) => { - let stderr = std::str::from_utf8(&output.stderr) - .expect("child process writes a UTF-8 string to stderr; qed") - .trim(); - gum::warn!( - target: LOG_TARGET, - ?prepare_worker_program_path, - // Docs say to always print status using `Display` implementation. - status = %output.status, - %stderr, - "Cannot unshare user namespace and change root, which are Linux-specific kernel security features. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running with support for unsharing user namespaces for maximum security." - ); - false - }, - Err(err) => { - gum::warn!( - target: LOG_TARGET, - ?prepare_worker_program_path, - "Could not start child process: {}", - err - ); - false - }, - } - } else { - gum::warn!( - target: LOG_TARGET, - "Cannot unshare user namespace and change root, which are Linux-specific kernel security features. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with support for unsharing user namespaces for maximum security." - ); - false - } - } -} - -/// Check if landlock is supported and emit a warning if not. -/// -/// We do this check by spawning a new process and trying to sandbox it. To get as close as possible -/// to running the check in a worker, we try it... in a worker. The expected return status is 0 on -/// success and -1 on failure. -fn check_landlock( - #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] - prepare_worker_program_path: &Path, -) -> bool { - cfg_if::cfg_if! { - if #[cfg(target_os = "linux")] { - match std::process::Command::new(prepare_worker_program_path) - .arg("--check-can-enable-landlock") - .status() - { - Ok(status) if status.success() => true, - Ok(status) => { - let abi = - polkadot_node_core_pvf_common::worker::security::landlock::LANDLOCK_ABI as u8; - gum::warn!( - target: LOG_TARGET, - ?prepare_worker_program_path, - ?status, - %abi, - "Cannot fully enable landlock, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security." - ); - false - }, - Err(err) => { - gum::warn!( - target: LOG_TARGET, - ?prepare_worker_program_path, - "Could not start child process: {}", - err - ); - false - }, - } - } else { - gum::warn!( - target: LOG_TARGET, - "Cannot enable landlock, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with landlock support for maximum security." - ); - false - } - } -} - #[cfg(test)] pub(crate) mod tests { use super::*; diff --git a/polkadot/node/core/pvf/src/lib.rs b/polkadot/node/core/pvf/src/lib.rs index 27630af40c2f..102a91dbdad7 100644 --- a/polkadot/node/core/pvf/src/lib.rs +++ b/polkadot/node/core/pvf/src/lib.rs @@ -97,6 +97,7 @@ mod host; mod metrics; mod prepare; mod priority; +mod security; mod worker_intf; #[cfg(feature = "test-utils")] diff --git a/polkadot/node/core/pvf/src/prepare/worker_intf.rs b/polkadot/node/core/pvf/src/prepare/worker_intf.rs index b66c36044343..1f6ab0b76556 100644 --- a/polkadot/node/core/pvf/src/prepare/worker_intf.rs +++ b/polkadot/node/core/pvf/src/prepare/worker_intf.rs @@ -18,6 +18,7 @@ use crate::{ metrics::Metrics, + security, worker_intf::{ clear_worker_dir_path, framed_recv, framed_send, spawn_with_program_path, IdleWorker, SpawnErr, WorkerDir, WorkerHandle, JOB_TIMEOUT_WALL_CLOCK_FACTOR, @@ -126,7 +127,9 @@ pub async fn start_work( pid, |tmp_artifact_file, mut stream, worker_dir| async move { let preparation_timeout = pvf.prep_timeout(); - if let Err(err) = send_request(&mut stream, pvf).await { + let audit_log_file = security::AuditLogFile::try_open_and_seek_to_end().await; + + if let Err(err) = send_request(&mut stream, pvf.clone()).await { gum::warn!( target: LOG_TARGET, worker_pid = %pid, @@ -150,7 +153,19 @@ pub async fn start_work( match result { // Received bytes from worker within the time limit. - Ok(Ok(prepare_result)) => + Ok(Ok(prepare_result)) => { + // Check if any syscall violations occurred during the job. For now this is only + // informative, as we are not enforcing the seccomp policy yet. + for syscall in security::check_seccomp_violations_for_worker(audit_log_file, pid).await { + gum::error!( + target: LOG_TARGET, + worker_pid = %pid, + %syscall, + ?pvf, + "A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!" + ); + } + handle_response( metrics, IdleWorker { stream, pid, worker_dir }, @@ -160,7 +175,8 @@ pub async fn start_work( artifact_path, preparation_timeout, ) - .await, + .await + }, Ok(Err(err)) => { // Communication error within the time limit. gum::warn!( @@ -169,6 +185,21 @@ pub async fn start_work( "failed to recv a prepare response: {:?}", err, ); + + // The worker died. Check if it was due to a seccomp violation. + // + // NOTE: Log, but don't change the outcome. Not all validators may have auditing + // enabled, so we don't want attackers to abuse a non-deterministic outcome. + for syscall in security::check_seccomp_violations_for_worker(audit_log_file, pid).await { + gum::error!( + target: LOG_TARGET, + worker_pid = %pid, + %syscall, + ?pvf, + "A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!" + ); + } + Outcome::IoErr(err.to_string()) }, Err(_) => { diff --git a/polkadot/node/core/pvf/src/security.rs b/polkadot/node/core/pvf/src/security.rs new file mode 100644 index 000000000000..decd321e415e --- /dev/null +++ b/polkadot/node/core/pvf/src/security.rs @@ -0,0 +1,312 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use crate::LOG_TARGET; +use std::path::Path; +use tokio::{ + fs::{File, OpenOptions}, + io::{AsyncReadExt, AsyncSeekExt, SeekFrom}, +}; + +/// Check if we can sandbox the root and emit a warning if not. +/// +/// We do this check by spawning a new process and trying to sandbox it. To get as close as possible +/// to running the check in a worker, we try it... in a worker. The expected return status is 0 on +/// success and -1 on failure. +pub async fn check_can_unshare_user_namespace_and_change_root( + #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] + prepare_worker_program_path: &Path, +) -> bool { + cfg_if::cfg_if! { + if #[cfg(target_os = "linux")] { + match tokio::process::Command::new(prepare_worker_program_path) + .arg("--check-can-unshare-user-namespace-and-change-root") + .output() + .await + { + Ok(output) if output.status.success() => true, + Ok(output) => { + let stderr = std::str::from_utf8(&output.stderr) + .expect("child process writes a UTF-8 string to stderr; qed") + .trim(); + gum::warn!( + target: LOG_TARGET, + ?prepare_worker_program_path, + // Docs say to always print status using `Display` implementation. + status = %output.status, + %stderr, + "Cannot unshare user namespace and change root, which are Linux-specific kernel security features. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running with support for unsharing user namespaces for maximum security." + ); + false + }, + Err(err) => { + gum::warn!( + target: LOG_TARGET, + ?prepare_worker_program_path, + "Could not start child process: {}", + err + ); + false + }, + } + } else { + gum::warn!( + target: LOG_TARGET, + "Cannot unshare user namespace and change root, which are Linux-specific kernel security features. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with support for unsharing user namespaces for maximum security." + ); + false + } + } +} + +/// Check if landlock is supported and emit a warning if not. +/// +/// We do this check by spawning a new process and trying to sandbox it. To get as close as possible +/// to running the check in a worker, we try it... in a worker. The expected return status is 0 on +/// success and -1 on failure. +pub async fn check_landlock( + #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] + prepare_worker_program_path: &Path, +) -> bool { + cfg_if::cfg_if! { + if #[cfg(target_os = "linux")] { + match tokio::process::Command::new(prepare_worker_program_path) + .arg("--check-can-enable-landlock") + .status() + .await + { + Ok(status) if status.success() => true, + Ok(status) => { + let abi = + polkadot_node_core_pvf_common::worker::security::landlock::LANDLOCK_ABI as u8; + gum::warn!( + target: LOG_TARGET, + ?prepare_worker_program_path, + ?status, + %abi, + "Cannot fully enable landlock, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security." + ); + false + }, + Err(err) => { + gum::warn!( + target: LOG_TARGET, + ?prepare_worker_program_path, + "Could not start child process: {}", + err + ); + false + }, + } + } else { + gum::warn!( + target: LOG_TARGET, + "Cannot enable landlock, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with landlock support for maximum security." + ); + false + } + } +} + +/// Check if seccomp is supported and emit a warning if not. +/// +/// We do this check by spawning a new process and trying to sandbox it. To get as close as possible +/// to running the check in a worker, we try it... in a worker. The expected return status is 0 on +/// success and -1 on failure. +pub async fn check_seccomp( + #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] + prepare_worker_program_path: &Path, +) -> bool { + cfg_if::cfg_if! { + if #[cfg(target_os = "linux")] { + match tokio::process::Command::new(prepare_worker_program_path) + .arg("--check-can-enable-seccomp") + .status() + .await + { + Ok(status) if status.success() => true, + Ok(status) => { + gum::warn!( + target: LOG_TARGET, + ?prepare_worker_program_path, + ?status, + "Cannot fully enable seccomp, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security." + ); + false + }, + Err(err) => { + gum::warn!( + target: LOG_TARGET, + ?prepare_worker_program_path, + "Could not start child process: {}", + err + ); + false + }, + } + } else { + gum::warn!( + target: LOG_TARGET, + "Cannot enable seccomp, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with seccomp support for maximum security." + ); + false + } + } +} + +const AUDIT_LOG_PATH: &'static str = "/var/log/audit/audit.log"; +const SYSLOG_PATH: &'static str = "/var/log/syslog"; + +/// System audit log. +pub struct AuditLogFile { + file: File, + path: &'static str, +} + +impl AuditLogFile { + /// Looks for an audit log file on the system and opens it, seeking to the end to skip any + /// events from before this was called. + /// + /// A bit of a verbose name, but it should clue future refactorers not to move calls closer to + /// where the `AuditLogFile` is used. + pub async fn try_open_and_seek_to_end() -> Option { + let mut path = AUDIT_LOG_PATH; + let mut file = match OpenOptions::new().read(true).open(AUDIT_LOG_PATH).await { + Ok(file) => Ok(file), + Err(_) => { + path = SYSLOG_PATH; + OpenOptions::new().read(true).open(SYSLOG_PATH).await + }, + } + .ok()?; + + let _pos = file.seek(SeekFrom::End(0)).await; + + Some(Self { file, path }) + } + + async fn read_new_since_open(mut self) -> String { + let mut buf = String::new(); + let _len = self.file.read_to_string(&mut buf).await; + buf + } +} + +/// Check if a seccomp violation occurred for the given worker. As the syslog may be in a different +/// location, or seccomp auditing may be disabled, this function provides a best-effort attempt +/// only. +/// +/// The `audit_log_file` must have been obtained before the job started. It only allows reading +/// entries that were written since it was obtained, so that we do not consider events from previous +/// processes with the same pid. This can still be racy, but it's unlikely and fine for a +/// best-effort attempt. +pub async fn check_seccomp_violations_for_worker( + audit_log_file: Option, + worker_pid: u32, +) -> Vec { + let audit_event_pid_field = format!("pid={worker_pid}"); + + let audit_log_file = match audit_log_file { + Some(file) => { + gum::debug!( + target: LOG_TARGET, + %worker_pid, + audit_log_path = ?file.path, + "checking audit log for seccomp violations", + ); + file + }, + None => { + gum::warn!( + target: LOG_TARGET, + %worker_pid, + "could not open either {AUDIT_LOG_PATH} or {SYSLOG_PATH} for reading audit logs" + ); + return vec![] + }, + }; + let events = audit_log_file.read_new_since_open().await; + + let mut violations = vec![]; + for event in events.lines() { + if let Some(syscall) = parse_audit_log_for_seccomp_event(event, &audit_event_pid_field) { + violations.push(syscall); + } + } + + violations +} + +fn parse_audit_log_for_seccomp_event(event: &str, audit_event_pid_field: &str) -> Option { + const SECCOMP_AUDIT_EVENT_TYPE: &'static str = "type=1326"; + + // Do a series of simple .contains instead of a regex, because I'm not sure if the fields are + // guaranteed to always be in the same order. + if !event.contains(SECCOMP_AUDIT_EVENT_TYPE) || !event.contains(&audit_event_pid_field) { + return None + } + + // Get the syscall. Let's avoid a dependency on regex just for this. + for field in event.split(" ") { + if let Some(syscall) = field.strip_prefix("syscall=") { + return syscall.parse::().ok() + } + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_audit_log_for_seccomp_event() { + let audit_event_pid_field = "pid=2559058"; + + assert_eq!( + parse_audit_log_for_seccomp_event( + r#"Oct 24 13:15:24 build kernel: [5883980.283910] audit: type=1326 audit(1698153324.786:23): auid=0 uid=0 gid=0 ses=2162 subj=unconfined pid=2559058 comm="polkadot-prepar" exe="/root/paritytech/polkadot-sdk-2/target/debug/polkadot-prepare-worker" sig=31 arch=c000003e syscall=53 compat=0 ip=0x7f7542c80d5e code=0x80000000"#, + audit_event_pid_field + ), + Some(53) + ); + // pid is wrong + assert_eq!( + parse_audit_log_for_seccomp_event( + r#"Oct 24 13:15:24 build kernel: [5883980.283910] audit: type=1326 audit(1698153324.786:23): auid=0 uid=0 gid=0 ses=2162 subj=unconfined pid=2559057 comm="polkadot-prepar" exe="/root/paritytech/polkadot-sdk-2/target/debug/polkadot-prepare-worker" sig=31 arch=c000003e syscall=53 compat=0 ip=0x7f7542c80d5e code=0x80000000"#, + audit_event_pid_field + ), + None + ); + // type is wrong + assert_eq!( + parse_audit_log_for_seccomp_event( + r#"Oct 24 13:15:24 build kernel: [5883980.283910] audit: type=1327 audit(1698153324.786:23): auid=0 uid=0 gid=0 ses=2162 subj=unconfined pid=2559057 comm="polkadot-prepar" exe="/root/paritytech/polkadot-sdk-2/target/debug/polkadot-prepare-worker" sig=31 arch=c000003e syscall=53 compat=0 ip=0x7f7542c80d5e code=0x80000000"#, + audit_event_pid_field + ), + None + ); + // no syscall field + assert_eq!( + parse_audit_log_for_seccomp_event( + r#"Oct 24 13:15:24 build kernel: [5883980.283910] audit: type=1327 audit(1698153324.786:23): auid=0 uid=0 gid=0 ses=2162 subj=unconfined pid=2559057 comm="polkadot-prepar" exe="/root/paritytech/polkadot-sdk-2/target/debug/polkadot-prepare-worker" sig=31 arch=c000003e compat=0 ip=0x7f7542c80d5e code=0x80000000"#, + audit_event_pid_field + ), + None + ); + } +} diff --git a/polkadot/node/core/pvf/src/worker_intf.rs b/polkadot/node/core/pvf/src/worker_intf.rs index e9382b66bf75..8f9a7de354b8 100644 --- a/polkadot/node/core/pvf/src/worker_intf.rs +++ b/polkadot/node/core/pvf/src/worker_intf.rs @@ -245,7 +245,7 @@ pub enum SpawnErr { /// has been terminated. Since the worker is running in another process it is obviously not /// necessary to poll this future to make the worker run, it's only for termination detection. /// -/// This future relies on the fact that a child process's stdout `fd` is closed upon it's +/// This future relies on the fact that a child process's stdout `fd` is closed upon its /// termination. #[pin_project] pub struct WorkerHandle { @@ -270,6 +270,9 @@ impl WorkerHandle { if security_status.can_enable_landlock { args.push("--can-enable-landlock".to_string()); } + if security_status.can_enable_seccomp { + args.push("--can-enable-seccomp".to_string()); + } if security_status.can_unshare_user_namespace_and_change_root { args.push("--can-unshare-user-namespace-and-change-root".to_string()); } diff --git a/polkadot/node/core/pvf/tests/it/adder.rs b/polkadot/node/core/pvf/tests/it/adder.rs index 8bdd09db208a..e8d8a9a6b63e 100644 --- a/polkadot/node/core/pvf/tests/it/adder.rs +++ b/polkadot/node/core/pvf/tests/it/adder.rs @@ -28,7 +28,7 @@ async fn execute_good_block_on_parent() { let block_data = BlockData { state: 0, add: 512 }; - let host = TestHost::new(); + let host = TestHost::new().await; let ret = host .validate_candidate( @@ -56,7 +56,7 @@ async fn execute_good_chain_on_parent() { let mut parent_hash = [0; 32]; let mut last_state = 0; - let host = TestHost::new(); + let host = TestHost::new().await; for (number, add) in (0..10).enumerate() { let parent_head = @@ -98,7 +98,7 @@ async fn execute_bad_block_on_parent() { add: 256, }; - let host = TestHost::new(); + let host = TestHost::new().await; let _err = host .validate_candidate( @@ -117,7 +117,7 @@ async fn execute_bad_block_on_parent() { #[tokio::test] async fn stress_spawn() { - let host = std::sync::Arc::new(TestHost::new()); + let host = std::sync::Arc::new(TestHost::new().await); async fn execute(host: std::sync::Arc) { let parent_head = HeadData { number: 0, parent_hash: [0; 32], post_state: hash_state(0) }; @@ -149,9 +149,12 @@ async fn stress_spawn() { // With one worker, run multiple execution jobs serially. They should not conflict. #[tokio::test] async fn execute_can_run_serially() { - let host = std::sync::Arc::new(TestHost::new_with_config(|cfg| { - cfg.execute_workers_max_num = 1; - })); + let host = std::sync::Arc::new( + TestHost::new_with_config(|cfg| { + cfg.execute_workers_max_num = 1; + }) + .await, + ); async fn execute(host: std::sync::Arc) { let parent_head = HeadData { number: 0, parent_hash: [0; 32], post_state: hash_state(0) }; diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index cdf8d6eb82d2..a69a488adb98 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -14,7 +14,6 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . -#[cfg(feature = "ci-only-tests")] use assert_matches::assert_matches; use parity_scale_codec::Encode as _; use polkadot_node_core_pvf::{ @@ -24,6 +23,8 @@ use polkadot_node_core_pvf::{ }; use polkadot_parachain_primitives::primitives::{BlockData, ValidationParams, ValidationResult}; use polkadot_primitives::ExecutorParams; +#[cfg(target_os = "linux")] +use rusty_fork::rusty_fork_test; #[cfg(feature = "ci-only-tests")] use polkadot_primitives::ExecutorParam; @@ -43,11 +44,11 @@ struct TestHost { } impl TestHost { - fn new() -> Self { - Self::new_with_config(|_| ()) + async fn new() -> Self { + Self::new_with_config(|_| ()).await } - fn new_with_config(f: F) -> Self + async fn new_with_config(f: F) -> Self where F: FnOnce(&mut Config), { @@ -61,7 +62,7 @@ impl TestHost { execute_worker_path, ); f(&mut config); - let (host, task) = start(config, Metrics::default()); + let (host, task) = start(config, Metrics::default()).await; let _ = tokio::task::spawn(task); Self { cache_dir, host: Mutex::new(host) } } @@ -127,7 +128,7 @@ impl TestHost { #[tokio::test] async fn terminates_on_timeout() { - let host = TestHost::new(); + let host = TestHost::new().await; let start = std::time::Instant::now(); let result = host @@ -153,11 +154,113 @@ async fn terminates_on_timeout() { assert!(duration < TEST_EXECUTION_TIMEOUT * JOB_TIMEOUT_WALL_CLOCK_FACTOR); } +#[cfg(target_os = "linux")] +fn kill_by_sid_and_name(sid: i32, exe_name: &'static str) { + use procfs::process; + + let all_processes: Vec = process::all_processes() + .expect("Can't read /proc") + .filter_map(|p| match p { + Ok(p) => Some(p), // happy path + Err(e) => match e { + // process vanished during iteration, ignore it + procfs::ProcError::NotFound(_) => None, + x => { + panic!("some unknown error: {}", x); + }, + }, + }) + .collect(); + + for process in all_processes { + if process.stat().unwrap().session == sid && + process.exe().unwrap().to_str().unwrap().contains(exe_name) + { + assert_eq!(unsafe { libc::kill(process.pid(), 9) }, 0); + } + } +} + +// Run these tests in their own processes with rusty-fork. They work by each creating a new session, +// then killing the worker process that matches the session ID and expected worker name. +#[cfg(target_os = "linux")] +rusty_fork_test! { + // What happens when the prepare worker dies in the middle of a job? + #[test] + fn prepare_worker_killed_during_job() { + const PROCESS_NAME: &'static str = "polkadot-prepare-worker"; + + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let host = TestHost::new().await; + + // Create a new session and get the session ID. + let sid = unsafe { libc::setsid() }; + assert!(sid > 0); + + let (result, _) = futures::join!( + // Choose a job that would normally take the entire timeout. + host.precheck_pvf(rococo_runtime::WASM_BINARY.unwrap(), Default::default()), + // Run a future that kills the job in the middle of the timeout. + async { + tokio::time::sleep(TEST_PREPARATION_TIMEOUT / 2).await; + kill_by_sid_and_name(sid, PROCESS_NAME); + } + ); + + assert_matches!(result, Err(PrepareError::IoErr(_))); + }) + } + + // What happens when the execute worker dies in the middle of a job? + #[test] + fn execute_worker_killed_during_job() { + const PROCESS_NAME: &'static str = "polkadot-execute-worker"; + + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let host = TestHost::new().await; + + // Create a new session and get the session ID. + let sid = unsafe { libc::setsid() }; + assert!(sid > 0); + + // Prepare the artifact ahead of time. + let binary = halt::wasm_binary_unwrap(); + host.precheck_pvf(binary, Default::default()).await.unwrap(); + + let (result, _) = futures::join!( + // Choose an job that would normally take the entire timeout. + host.validate_candidate( + binary, + ValidationParams { + block_data: BlockData(Vec::new()), + parent_head: Default::default(), + relay_parent_number: 1, + relay_parent_storage_root: Default::default(), + }, + Default::default(), + ), + // Run a future that kills the job in the middle of the timeout. + async { + tokio::time::sleep(TEST_EXECUTION_TIMEOUT / 2).await; + kill_by_sid_and_name(sid, PROCESS_NAME); + } + ); + + assert_matches!( + result, + Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath)) + ); + }) + } +} + #[cfg(feature = "ci-only-tests")] #[tokio::test] async fn ensure_parallel_execution() { // Run some jobs that do not complete, thus timing out. - let host = TestHost::new(); + let host = TestHost::new().await; let execute_pvf_future_1 = host.validate_candidate( halt::wasm_binary_unwrap(), ValidationParams { @@ -204,7 +307,8 @@ async fn ensure_parallel_execution() { async fn execute_queue_doesnt_stall_if_workers_died() { let host = TestHost::new_with_config(|cfg| { cfg.execute_workers_max_num = 5; - }); + }) + .await; // Here we spawn 8 validation jobs for the `halt` PVF and share those between 5 workers. The // first five jobs should timeout and the workers killed. For the next 3 jobs a new batch of @@ -241,7 +345,8 @@ async fn execute_queue_doesnt_stall_if_workers_died() { async fn execute_queue_doesnt_stall_with_varying_executor_params() { let host = TestHost::new_with_config(|cfg| { cfg.execute_workers_max_num = 2; - }); + }) + .await; let executor_params_1 = ExecutorParams::default(); let executor_params_2 = ExecutorParams::from(&[ExecutorParam::StackLogicalMax(1024)][..]); @@ -289,7 +394,7 @@ async fn execute_queue_doesnt_stall_with_varying_executor_params() { // Test that deleting a prepared artifact does not lead to a dispute when we try to execute it. #[tokio::test] async fn deleting_prepared_artifact_does_not_dispute() { - let host = TestHost::new(); + let host = TestHost::new().await; let cache_dir = host.cache_dir.path(); let _stats = host.precheck_pvf(halt::wasm_binary_unwrap(), Default::default()).await.unwrap(); @@ -334,7 +439,8 @@ async fn deleting_prepared_artifact_does_not_dispute() { async fn prepare_can_run_serially() { let host = TestHost::new_with_config(|cfg| { cfg.prepare_workers_hard_max_num = 1; - }); + }) + .await; let _stats = host .precheck_pvf(::adder::wasm_binary_unwrap(), Default::default()) diff --git a/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md b/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md index 6a14a3a013d4..0cefeb1f77ca 100644 --- a/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md +++ b/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md @@ -126,6 +126,19 @@ with untrusted code does not have unnecessary access to the file-system. This provides some protection against attackers accessing sensitive data or modifying data on the host machine. +*Currently this is only supported on Linux.* + + + + + + + + + + + + ### Clearing env vars We clear environment variables before handling untrusted code, because why give diff --git a/polkadot/scripts/list-syscalls/execute-worker-syscalls b/polkadot/scripts/list-syscalls/execute-worker-syscalls index 05abe9ba7368..4a7a66181299 100644 --- a/polkadot/scripts/list-syscalls/execute-worker-syscalls +++ b/polkadot/scripts/list-syscalls/execute-worker-syscalls @@ -24,10 +24,8 @@ 42 (connect) 45 (recvfrom) 46 (sendmsg) -53 (socketpair) 56 (clone) 60 (exit) -61 (wait4) 62 (kill) 72 (fcntl) 79 (getcwd) @@ -52,23 +50,16 @@ 200 (tkill) 202 (futex) 204 (sched_getaffinity) -213 (epoll_create) 217 (getdents64) 218 (set_tid_address) 228 (clock_gettime) 230 (clock_nanosleep) 231 (exit_group) -232 (epoll_wait) -233 (epoll_ctl) 257 (openat) 262 (newfstatat) 263 (unlinkat) 272 (unshare) 273 (set_robust_list) -281 (epoll_pwait) -284 (eventfd) -290 (eventfd2) -291 (epoll_create1) 302 (prlimit64) 318 (getrandom) 319 (memfd_create) diff --git a/polkadot/scripts/list-syscalls/prepare-worker-syscalls b/polkadot/scripts/list-syscalls/prepare-worker-syscalls index f1597f206756..cab58e06692b 100644 --- a/polkadot/scripts/list-syscalls/prepare-worker-syscalls +++ b/polkadot/scripts/list-syscalls/prepare-worker-syscalls @@ -24,10 +24,8 @@ 42 (connect) 45 (recvfrom) 46 (sendmsg) -53 (socketpair) 56 (clone) 60 (exit) -61 (wait4) 62 (kill) 72 (fcntl) 79 (getcwd) @@ -54,23 +52,16 @@ 202 (futex) 203 (sched_setaffinity) 204 (sched_getaffinity) -213 (epoll_create) 217 (getdents64) 218 (set_tid_address) 228 (clock_gettime) 230 (clock_nanosleep) 231 (exit_group) -232 (epoll_wait) -233 (epoll_ctl) 257 (openat) 262 (newfstatat) 263 (unlinkat) 272 (unshare) 273 (set_robust_list) -281 (epoll_pwait) -284 (eventfd) -290 (eventfd2) -291 (epoll_create1) 302 (prlimit64) 309 (getcpu) 318 (getrandom)