From 5e100f67b2c2e8ba0fa4481212365554c0208ecc Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Tue, 24 Oct 2023 16:04:02 +0200 Subject: [PATCH 01/12] PVF worker: Add seccomp restrictions (block networking) We're already working on sandboxing by [blocking all unneeded syscalls](https://github.com/paritytech/polkadot-sdk/issues/882). However, due to the wide scope it will take a while longer. This PR starts with a much smaller scope, only blocking network-related syscalls until the above is ready. For security we block the following with `seccomp`: - creation of new sockets - these are unneeded in PVF jobs, and we can safely block them without affecting consensus. - `io_uring` - as discussed [here](https://github.com/paritytech/polkadot/pull/7334#discussion_r1281611134), io_uring allows for networking and needs to be blocked. See below for a discussion on the safety of doing this. - `connect`ing to sockets - the above two points are enough for networking and is what birdcage does (or [used to do](https://github.com/phylum-dev/birdcage/pull/47)) to restrict networking. However, it is possible to [connect to abstract unix sockets](https://lore.kernel.org/landlock/20231023.ahphah4Wii4v@digikod.net/T/#u) to do some kinds of sandbox escapes, so we also block the `connect` syscall. (Intentionally left out of implementer's guide because it felt like too much detail.) `io_uring` is just a way of issuing system calls in an async manner, and there is nothing stopping wasmtime from legitimately using it. Fortunately, at the moment it does not. Generally, not many applications use `io_uring` in production yet, because of the numerous kernel CVEs discovered. It's still under a lot of development. Android outright banned `io_uring` for these reasons. Considering `io_uring`'s status, and that it very likely would get detected either by our [recently-added static analysis](https://github.com/paritytech/polkadot-sdk/pull/1663) or by testing, I think it is fairly safe to block it. If execution hits an edge case code path unique to a given machine, it's already taken a non-deterministic branch anyway. After all, we just care that the majority of validators reach the same result and preserve consensus. So worst-case scenario, there's a dispute, and we can always admit fault and refund the wrong validator. On the other hand, if all validators take the code path that results in a seccomp violation, then they would all vote against the current candidate, which is also fine. The violation would get logged (in big scary letters) and hopefully some validator reports it to us. Actually, a worst-worse-case scenario is that 50% of validators vote against, so that there is no consensus. But so many things would have to go wrong for that to happen: 1. An update to `wasmtime` is introduced that uses io_uring (unlikely as io_uring is mainly for IO-heavy applications) 2. The new syscall is not detected by our static analysis 3. It is never triggered in any of our tests 4. It then gets triggered on some super edge case in production on 50% of validators causing a stall (bad but very unlikely) 5. Or, it triggers on only a few validators causing a dispute (more likely but not as bad?) Considering how many things would have to go wrong here, we believe it's safe to block `io_uring`. Closes https://github.com/paritytech/polkadot-sdk/issues/619 Original PR in Polkadot repo: https://github.com/paritytech/polkadot/pull/7334 --- Cargo.lock | 11 + .../node/core/candidate-validation/src/lib.rs | 3 +- polkadot/node/core/pvf/common/Cargo.toml | 2 + polkadot/node/core/pvf/common/src/lib.rs | 2 + .../node/core/pvf/common/src/worker/mod.rs | 30 +- .../core/pvf/common/src/worker/security.rs | 512 ------------------ .../common/src/worker/security/landlock.rs | 327 +++++++++++ .../pvf/common/src/worker/security/mod.rs | 206 +++++++ .../pvf/common/src/worker/security/seccomp.rs | 197 +++++++ polkadot/node/core/pvf/src/host.rs | 79 ++- polkadot/node/core/pvf/src/worker_intf.rs | 3 + polkadot/node/core/pvf/tests/it/adder.rs | 17 +- polkadot/node/core/pvf/tests/it/main.rs | 20 +- .../src/node/utility/pvf-host-and-workers.md | 11 + 14 files changed, 875 insertions(+), 545 deletions(-) delete mode 100644 polkadot/node/core/pvf/common/src/worker/security.rs create mode 100644 polkadot/node/core/pvf/common/src/worker/security/landlock.rs create mode 100644 polkadot/node/core/pvf/common/src/worker/security/mod.rs create mode 100644 polkadot/node/core/pvf/common/src/worker/security/seccomp.rs diff --git a/Cargo.lock b/Cargo.lock index a8d679c6ce8b..e57e0265457c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12251,11 +12251,13 @@ dependencies = [ "sc-executor", "sc-executor-common", "sc-executor-wasmtime", + "seccompiler", "sp-core", "sp-externalities", "sp-io", "sp-tracing", "tempfile", + "thiserror", "tokio", "tracing-gum", ] @@ -16100,6 +16102,15 @@ dependencies = [ "zeroize", ] +[[package]] +name = "seccompiler" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6f6575e3c2b3a0fe2ef3e53855b6a8dead7c29f783da5e123d378c8c6a89017e" +dependencies = [ + "libc", +] + [[package]] name = "secp256k1" version = "0.24.3" diff --git a/polkadot/node/core/candidate-validation/src/lib.rs b/polkadot/node/core/candidate-validation/src/lib.rs index 21a7121d47bd..93db7d11cee8 100644 --- a/polkadot/node/core/candidate-validation/src/lib.rs +++ b/polkadot/node/core/candidate-validation/src/lib.rs @@ -149,7 +149,8 @@ async fn run( exec_worker_path, ), pvf_metrics, - ); + ) + .await; ctx.spawn_blocking("pvf-validation-host", task.boxed())?; loop { diff --git a/polkadot/node/core/pvf/common/Cargo.toml b/polkadot/node/core/pvf/common/Cargo.toml index 4bdacca72f42..e4ca20ffdb29 100644 --- a/polkadot/node/core/pvf/common/Cargo.toml +++ b/polkadot/node/core/pvf/common/Cargo.toml @@ -30,6 +30,8 @@ sp-tracing = { path = "../../../../../substrate/primitives/tracing" } [target.'cfg(target_os = "linux")'.dependencies] landlock = "0.3.0" +seccompiler = "0.3.0" +thiserror = "1.0.31" [dev-dependencies] assert_matches = "1.4.0" diff --git a/polkadot/node/core/pvf/common/src/lib.rs b/polkadot/node/core/pvf/common/src/lib.rs index 53c287ea9709..813dccadf055 100644 --- a/polkadot/node/core/pvf/common/src/lib.rs +++ b/polkadot/node/core/pvf/common/src/lib.rs @@ -50,6 +50,8 @@ pub mod tests { pub struct SecurityStatus { /// Whether the landlock features we use are fully available on this system. pub can_enable_landlock: bool, + /// Whether the seccomp features we use are fully available on this system. + pub can_enable_seccomp: bool, // Whether we are able to unshare the user namespace and change the filesystem root. pub can_unshare_user_namespace_and_change_root: bool, } diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs b/polkadot/node/core/pvf/common/src/worker/mod.rs index d0bd5b6bd7c5..d1e17bac24ff 100644 --- a/polkadot/node/core/pvf/common/src/worker/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/mod.rs @@ -75,6 +75,13 @@ macro_rules! decl_worker_main { let status = -1; std::process::exit(status) }, + "--check-can-enable-seccomp" => { + #[cfg(target_os = "linux")] + let status = if security::seccomp::check_is_fully_enabled() { 0 } else { -1 }; + #[cfg(not(target_os = "linux"))] + let status = -1; + std::process::exit(status) + }, "--check-can-unshare-user-namespace-and-change-root" => { #[cfg(target_os = "linux")] let status = if let Err(err) = security::unshare_user_namespace_and_change_root( @@ -119,6 +126,7 @@ macro_rules! decl_worker_main { let mut worker_dir_path = None; let mut node_version = None; let mut can_enable_landlock = false; + let mut can_enable_seccomp = false; let mut can_unshare_user_namespace_and_change_root = false; let mut i = 2; @@ -137,6 +145,7 @@ macro_rules! decl_worker_main { i += 1 }, "--can-enable-landlock" => can_enable_landlock = true, + "--can-enable-seccomp" => can_enable_seccomp = true, "--can-unshare-user-namespace-and-change-root" => can_unshare_user_namespace_and_change_root = true, arg => panic!("Unexpected argument found: {}", arg), @@ -151,6 +160,7 @@ macro_rules! decl_worker_main { let worker_dir_path = std::path::Path::new(worker_dir_path).to_owned(); let security_status = $crate::SecurityStatus { can_enable_landlock, + can_enable_seccomp, can_unshare_user_namespace_and_change_root, }; @@ -307,6 +317,22 @@ pub fn worker_event_loop( let landlock_status = security::landlock::enable_for_worker(worker_kind, worker_pid, &worker_dir_path); if !matches!(landlock_status, Ok(landlock::RulesetStatus::FullyEnforced)) { + // We previously were able to enable, so this should never happen. + gum::error!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + "could not fully enable landlock: {:?}. This should not happen, please report to the Polkadot devs", + landlock_status + ); + } + } + + #[cfg(target_os = "linux")] + if security_status.can_enable_seccomp { + let seccomp_status = + security::seccomp::enable_for_worker(worker_kind, worker_pid, &worker_dir_path); + if !matches!(seccomp_status, Ok(())) { // We previously were able to enable, so this should never happen. // // TODO: Make this a real error in secure-mode. See: @@ -315,8 +341,8 @@ pub fn worker_event_loop( target: LOG_TARGET, %worker_kind, %worker_pid, - "could not fully enable landlock: {:?}. This should not happen, please report to the Polkadot devs", - landlock_status + "could not fully enable seccomp: {:?}. This should not happen, please report to the Polkadot devs", + seccomp_status ); } } diff --git a/polkadot/node/core/pvf/common/src/worker/security.rs b/polkadot/node/core/pvf/common/src/worker/security.rs deleted file mode 100644 index 1b7614177448..000000000000 --- a/polkadot/node/core/pvf/common/src/worker/security.rs +++ /dev/null @@ -1,512 +0,0 @@ -// Copyright (C) Parity Technologies (UK) Ltd. -// This file is part of Polkadot. - -// Polkadot is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. - -// Polkadot is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. - -// You should have received a copy of the GNU General Public License -// along with Polkadot. If not, see . - -//! Functionality for securing workers. -//! -//! This is needed because workers are used to compile and execute untrusted code (PVFs). -//! -//! We currently employ the following security measures: -//! -//! - Restrict filesystem -//! - Use Landlock to remove all unnecessary FS access rights. -//! - Unshare the user and mount namespaces. -//! - Change the root directory to a worker-specific temporary directory. -//! - Remove env vars - -use crate::{worker::WorkerKind, LOG_TARGET}; - -/// Unshare the user namespace and change root to be the artifact directory. -/// -/// NOTE: This should not be called in a multi-threaded context. `unshare(2)`: -/// "CLONE_NEWUSER requires that the calling process is not threaded." -#[cfg(target_os = "linux")] -pub fn unshare_user_namespace_and_change_root( - worker_kind: WorkerKind, - worker_pid: u32, - worker_dir_path: &std::path::Path, -) -> Result<(), String> { - use std::{env, ffi::CString, os::unix::ffi::OsStrExt, path::Path, ptr}; - - // The following was copied from the `cstr_core` crate. - // - // TODO: Remove this once this is stable: https://github.com/rust-lang/rust/issues/105723 - #[inline] - #[doc(hidden)] - const fn cstr_is_valid(bytes: &[u8]) -> bool { - if bytes.is_empty() || bytes[bytes.len() - 1] != 0 { - return false - } - - let mut index = 0; - while index < bytes.len() - 1 { - if bytes[index] == 0 { - return false - } - index += 1; - } - true - } - - macro_rules! cstr { - ($e:expr) => {{ - const STR: &[u8] = concat!($e, "\0").as_bytes(); - const STR_VALID: bool = cstr_is_valid(STR); - let _ = [(); 0 - (!(STR_VALID) as usize)]; - #[allow(unused_unsafe)] - unsafe { - core::ffi::CStr::from_bytes_with_nul_unchecked(STR) - } - }} - } - - gum::debug!( - target: LOG_TARGET, - %worker_kind, - %worker_pid, - ?worker_dir_path, - "unsharing the user namespace and calling pivot_root", - ); - - let worker_dir_path_c = CString::new(worker_dir_path.as_os_str().as_bytes()) - .expect("on unix; the path will never contain 0 bytes; qed"); - - // Wrapper around all the work to prevent repetitive error handling. - // - // # Errors - // - // It's the caller's responsibility to call `Error::last_os_error`. Note that that alone does - // not give the context of which call failed, so we return a &str error. - || -> Result<(), &'static str> { - // SAFETY: We pass null-terminated C strings and use the APIs as documented. In fact, steps - // (2) and (3) are adapted from the example in pivot_root(2), with the additional - // change described in the `pivot_root(".", ".")` section. - unsafe { - // 1. `unshare` the user and the mount namespaces. - if libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) < 0 { - return Err("unshare user and mount namespaces") - } - - // 2. Setup mounts. - // - // Ensure that new root and its parent mount don't have shared propagation (which would - // cause pivot_root() to return an error), and prevent propagation of mount events to - // the initial mount namespace. - if libc::mount( - ptr::null(), - cstr!("/").as_ptr(), - ptr::null(), - libc::MS_REC | libc::MS_PRIVATE, - ptr::null(), - ) < 0 - { - return Err("mount MS_PRIVATE") - } - // Ensure that the new root is a mount point. - let additional_flags = - if let WorkerKind::Execute | WorkerKind::CheckPivotRoot = worker_kind { - libc::MS_RDONLY - } else { - 0 - }; - if libc::mount( - worker_dir_path_c.as_ptr(), - worker_dir_path_c.as_ptr(), - ptr::null(), // ignored when MS_BIND is used - libc::MS_BIND | - libc::MS_REC | libc::MS_NOEXEC | - libc::MS_NODEV | libc::MS_NOSUID | - libc::MS_NOATIME | additional_flags, - ptr::null(), // ignored when MS_BIND is used - ) < 0 - { - return Err("mount MS_BIND") - } - - // 3. `pivot_root` to the artifact directory. - if libc::chdir(worker_dir_path_c.as_ptr()) < 0 { - return Err("chdir to worker dir path") - } - if libc::syscall(libc::SYS_pivot_root, cstr!(".").as_ptr(), cstr!(".").as_ptr()) < 0 { - return Err("pivot_root") - } - if libc::umount2(cstr!(".").as_ptr(), libc::MNT_DETACH) < 0 { - return Err("umount the old root mount point") - } - } - - Ok(()) - }() - .map_err(|err_ctx| { - let err = std::io::Error::last_os_error(); - format!("{}: {}", err_ctx, err) - })?; - - // Do some assertions. - if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") { - return Err("expected current dir after pivot_root to be `/`".into()) - } - env::set_current_dir("..").map_err(|err| err.to_string())?; - if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") { - return Err("expected not to be able to break out of new root by doing `..`".into()) - } - - Ok(()) -} - -/// Require env vars to have been removed when spawning the process, to prevent malicious code from -/// accessing them. -pub fn check_env_vars_were_cleared(worker_kind: WorkerKind, worker_pid: u32) -> bool { - let mut ok = true; - - for (key, value) in std::env::vars_os() { - // TODO: *theoretically* the value (or mere presence) of `RUST_LOG` can be a source of - // randomness for malicious code. In the future we can remove it also and log in the host; - // see . - if key == "RUST_LOG" { - continue - } - // An exception for MacOS. This is not a secure platform anyway, so we let it slide. - #[cfg(target_os = "macos")] - if key == "__CF_USER_TEXT_ENCODING" { - continue - } - - gum::error!( - target: LOG_TARGET, - %worker_kind, - %worker_pid, - ?key, - ?value, - "env var was present that should have been removed", - ); - - ok = false; - } - - ok -} - -/// The [landlock] docs say it best: -/// -/// > "Landlock is a security feature available since Linux 5.13. The goal is to enable to restrict -/// ambient rights (e.g., global filesystem access) for a set of processes by creating safe security -/// sandboxes as new security layers in addition to the existing system-wide access-controls. This -/// kind of sandbox is expected to help mitigate the security impact of bugs, unexpected or -/// malicious behaviors in applications. Landlock empowers any process, including unprivileged ones, -/// to securely restrict themselves." -/// -/// [landlock]: https://docs.rs/landlock/latest/landlock/index.html -#[cfg(target_os = "linux")] -pub mod landlock { - pub use landlock::RulesetStatus; - - use crate::{worker::WorkerKind, LOG_TARGET}; - use landlock::*; - use std::{ - fmt, - path::{Path, PathBuf}, - }; - - /// Landlock ABI version. We use ABI V1 because: - /// - /// 1. It is supported by our reference kernel version. - /// 2. Later versions do not (yet) provide additional security that would benefit us. - /// - /// # Versions (as of October 2023) - /// - /// - Polkadot reference kernel version: 5.16+ - /// - /// - ABI V1: kernel 5.13 - Introduces landlock, including full restrictions on file reads. - /// - /// - ABI V2: kernel 5.19 - Adds ability to prevent file renaming. Does not help us. During - /// execution an attacker can only affect the name of a symlinked artifact and not the - /// original one. - /// - /// - ABI V3: kernel 6.2 - Adds ability to prevent file truncation. During execution, can - /// prevent attackers from affecting a symlinked artifact. We don't strictly need this as we - /// plan to check for file integrity anyway; see - /// . - /// - /// # Determinism - /// - /// You may wonder whether we could always use the latest ABI instead of only the ABI supported - /// by the reference kernel version. It seems plausible, since landlock provides a best-effort - /// approach to enabling sandboxing. For example, if the reference version only supported V1 and - /// we were on V2, then landlock would use V2 if it was supported on the current machine, and - /// just fall back to V1 if not. - /// - /// The issue with this is indeterminacy. If half of validators were on V2 and half were on V1, - /// they may have different semantics on some PVFs. So a malicious PVF now has a new attack - /// vector: they can exploit this indeterminism between landlock ABIs! - /// - /// On the other hand we do want validators to be as secure as possible and protect their keys - /// from attackers. And, the risk with indeterminacy is low and there are other indeterminacy - /// vectors anyway. So we will only upgrade to a new ABI if either the reference kernel version - /// supports it or if it introduces some new feature that is beneficial to security. - pub const LANDLOCK_ABI: ABI = ABI::V1; - - #[derive(Debug)] - pub enum TryRestrictError { - InvalidExceptionPath(PathBuf), - RulesetError(RulesetError), - } - - impl From for TryRestrictError { - fn from(err: RulesetError) -> Self { - Self::RulesetError(err) - } - } - - impl fmt::Display for TryRestrictError { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - match self { - Self::InvalidExceptionPath(path) => write!(f, "invalid exception path: {:?}", path), - Self::RulesetError(err) => write!(f, "ruleset error: {}", err.to_string()), - } - } - } - - impl std::error::Error for TryRestrictError {} - - /// Try to enable landlock for the given kind of worker. - pub fn enable_for_worker( - worker_kind: WorkerKind, - worker_pid: u32, - worker_dir_path: &Path, - ) -> Result> { - let exceptions: Vec<(PathBuf, BitFlags)> = match worker_kind { - WorkerKind::Prepare => { - vec![(worker_dir_path.to_owned(), AccessFs::WriteFile.into())] - }, - WorkerKind::Execute => { - vec![(worker_dir_path.to_owned(), AccessFs::ReadFile.into())] - }, - WorkerKind::CheckPivotRoot => - panic!("this should only be passed for checking pivot_root; qed"), - }; - - gum::debug!( - target: LOG_TARGET, - %worker_kind, - %worker_pid, - ?worker_dir_path, - "enabling landlock with exceptions: {:?}", - exceptions, - ); - - Ok(try_restrict(exceptions)?) - } - - // TODO: - /// Runs a check for landlock and returns a single bool indicating whether the given landlock - /// ABI is fully enabled on the current Linux environment. - pub fn check_is_fully_enabled() -> bool { - let status_from_thread: Result> = - match std::thread::spawn(|| try_restrict(std::iter::empty::<(PathBuf, AccessFs)>())) - .join() - { - Ok(Ok(status)) => Ok(status), - Ok(Err(ruleset_err)) => Err(ruleset_err.into()), - Err(_err) => Err("a panic occurred in try_restrict".into()), - }; - - matches!(status_from_thread, Ok(RulesetStatus::FullyEnforced)) - } - - /// Tries to restrict the current thread (should only be called in a process' main thread) with - /// the following landlock access controls: - /// - /// 1. all global filesystem access restricted, with optional exceptions - /// 2. ... more sandbox types (e.g. networking) may be supported in the future. - /// - /// If landlock is not supported in the current environment this is simply a noop. - /// - /// # Returns - /// - /// The status of the restriction (whether it was fully, partially, or not-at-all enforced). - fn try_restrict(fs_exceptions: I) -> Result - where - I: IntoIterator, - P: AsRef, - A: Into>, - { - let mut ruleset = - Ruleset::default().handle_access(AccessFs::from_all(LANDLOCK_ABI))?.create()?; - for (fs_path, access_bits) in fs_exceptions { - let paths = &[fs_path.as_ref().to_owned()]; - let mut rules = path_beneath_rules(paths, access_bits).peekable(); - if rules.peek().is_none() { - // `path_beneath_rules` silently ignores missing paths, so check for it manually. - return Err(TryRestrictError::InvalidExceptionPath(fs_path.as_ref().to_owned())) - } - ruleset = ruleset.add_rules(rules)?; - } - let status = ruleset.restrict_self()?; - Ok(status.ruleset) - } - - #[cfg(test)] - mod tests { - use super::*; - use std::{fs, io::ErrorKind, thread}; - - #[test] - fn restricted_thread_cannot_read_file() { - // TODO: This would be nice: . - if !check_is_fully_enabled() { - return - } - - // Restricted thread cannot read from FS. - let handle = - thread::spawn(|| { - // Create, write, and read two tmp files. This should succeed before any - // landlock restrictions are applied. - const TEXT: &str = "foo"; - let tmpfile1 = tempfile::NamedTempFile::new().unwrap(); - let path1 = tmpfile1.path(); - let tmpfile2 = tempfile::NamedTempFile::new().unwrap(); - let path2 = tmpfile2.path(); - - fs::write(path1, TEXT).unwrap(); - let s = fs::read_to_string(path1).unwrap(); - assert_eq!(s, TEXT); - fs::write(path2, TEXT).unwrap(); - let s = fs::read_to_string(path2).unwrap(); - assert_eq!(s, TEXT); - - // Apply Landlock with a read exception for only one of the files. - let status = try_restrict(vec![(path1, AccessFs::ReadFile)]); - if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { - panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); - } - - // Try to read from both files, only tmpfile1 should succeed. - let result = fs::read_to_string(path1); - assert!(matches!( - result, - Ok(s) if s == TEXT - )); - let result = fs::read_to_string(path2); - assert!(matches!( - result, - Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) - )); - - // Apply Landlock for all files. - let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>()); - if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { - panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); - } - - // Try to read from tmpfile1 after landlock, it should fail. - let result = fs::read_to_string(path1); - assert!(matches!( - result, - Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) - )); - }); - - assert!(handle.join().is_ok()); - } - - #[test] - fn restricted_thread_cannot_write_file() { - // TODO: This would be nice: . - if !check_is_fully_enabled() { - return - } - - // Restricted thread cannot write to FS. - let handle = - thread::spawn(|| { - // Create and write two tmp files. This should succeed before any landlock - // restrictions are applied. - const TEXT: &str = "foo"; - let tmpfile1 = tempfile::NamedTempFile::new().unwrap(); - let path1 = tmpfile1.path(); - let tmpfile2 = tempfile::NamedTempFile::new().unwrap(); - let path2 = tmpfile2.path(); - - fs::write(path1, TEXT).unwrap(); - fs::write(path2, TEXT).unwrap(); - - // Apply Landlock with a write exception for only one of the files. - let status = try_restrict(vec![(path1, AccessFs::WriteFile)]); - if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { - panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); - } - - // Try to write to both files, only tmpfile1 should succeed. - let result = fs::write(path1, TEXT); - assert!(matches!(result, Ok(_))); - let result = fs::write(path2, TEXT); - assert!(matches!( - result, - Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) - )); - - // Apply Landlock for all files. - let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>()); - if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { - panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); - } - - // Try to write to tmpfile1 after landlock, it should fail. - let result = fs::write(path1, TEXT); - assert!(matches!( - result, - Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) - )); - }); - - assert!(handle.join().is_ok()); - } - - // Test that checks whether landlock under our ABI version is able to truncate files. - #[test] - fn restricted_thread_can_truncate_file() { - // TODO: This would be nice: . - if !check_is_fully_enabled() { - return - } - - // Restricted thread can truncate file. - let handle = - thread::spawn(|| { - // Create and write a file. This should succeed before any landlock - // restrictions are applied. - const TEXT: &str = "foo"; - let tmpfile = tempfile::NamedTempFile::new().unwrap(); - let path = tmpfile.path(); - - fs::write(path, TEXT).unwrap(); - - // Apply Landlock with all exceptions under the current ABI. - let status = try_restrict(vec![(path, AccessFs::from_all(LANDLOCK_ABI))]); - if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { - panic!("Ruleset should be enforced since we checked if landlock is enabled: {:?}", status); - } - - // Try to truncate the file. - let result = tmpfile.as_file().set_len(0); - assert!(result.is_ok()); - }); - - assert!(handle.join().is_ok()); - } - } -} diff --git a/polkadot/node/core/pvf/common/src/worker/security/landlock.rs b/polkadot/node/core/pvf/common/src/worker/security/landlock.rs new file mode 100644 index 000000000000..5444b1933b23 --- /dev/null +++ b/polkadot/node/core/pvf/common/src/worker/security/landlock.rs @@ -0,0 +1,327 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! The [landlock] docs say it best: +//! +//! > "Landlock is a security feature available since Linux 5.13. The goal is to enable to restrict +//! ambient rights (e.g., global filesystem access) for a set of processes by creating safe security +//! sandboxes as new security layers in addition to the existing system-wide access-controls. This +//! kind of sandbox is expected to help mitigate the security impact of bugs, unexpected or +//! malicious behaviors in applications. Landlock empowers any process, including unprivileged ones, +//! to securely restrict themselves." +//! +//! [landlock]: https://docs.rs/landlock/latest/landlock/index.html + +pub use landlock::RulesetStatus; + +use crate::{ + worker::{stringify_panic_payload, WorkerKind}, + LOG_TARGET, +}; +use landlock::*; +use std::{ + path::{Path, PathBuf}, +}; + +/// Landlock ABI version. We use ABI V1 because: +/// +/// 1. It is supported by our reference kernel version. +/// 2. Later versions do not (yet) provide additional security that would benefit us. +/// +/// # Versions (as of October 2023) +/// +/// - Polkadot reference kernel version: 5.16+ +/// +/// - ABI V1: kernel 5.13 - Introduces landlock, including full restrictions on file reads. +/// +/// - ABI V2: kernel 5.19 - Adds ability to prevent file renaming. Does not help us. During +/// execution an attacker can only affect the name of a symlinked artifact and not the original +/// one. +/// +/// - ABI V3: kernel 6.2 - Adds ability to prevent file truncation. During execution, can +/// prevent attackers from affecting a symlinked artifact. We don't strictly need this as we +/// plan to check for file integrity anyway; see +/// . +/// +/// # Determinism +/// +/// You may wonder whether we could always use the latest ABI instead of only the ABI supported +/// by the reference kernel version. It seems plausible, since landlock provides a best-effort +/// approach to enabling sandboxing. For example, if the reference version only supported V1 and +/// we were on V2, then landlock would use V2 if it was supported on the current machine, and +/// just fall back to V1 if not. +/// +/// The issue with this is indeterminacy. If half of validators were on V2 and half were on V1, +/// they may have different semantics on some PVFs. So a malicious PVF now has a new attack +/// vector: they can exploit this indeterminism between landlock ABIs! +/// +/// On the other hand we do want validators to be as secure as possible and protect their keys +/// from attackers. And, the risk with indeterminacy is low and there are other indeterminacy +/// vectors anyway. So we will only upgrade to a new ABI if either the reference kernel version +/// supports it or if it introduces some new feature that is beneficial to security. +pub const LANDLOCK_ABI: ABI = ABI::V1; + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error("Invalid exception path: {0:?}")] + InvalidExceptionPath(PathBuf), + #[error(transparent)] + RulesetError(#[from] RulesetError), + #[error("A panic occurred in try_restrict: {0}")] + Panic(String), +} + +pub type Result = std::result::Result; + +/// Try to enable landlock for the given kind of worker. +pub fn enable_for_worker( + worker_kind: WorkerKind, + worker_pid: u32, + worker_dir_path: &Path, +) -> Result { + let exceptions: Vec<(PathBuf, BitFlags)> = match worker_kind { + WorkerKind::Prepare => { + vec![(worker_dir_path.to_owned(), AccessFs::WriteFile.into())] + }, + WorkerKind::Execute => { + vec![(worker_dir_path.to_owned(), AccessFs::ReadFile.into())] + }, + WorkerKind::CheckPivotRoot => + panic!("this should only be passed for checking pivot_root; qed"), + }; + + gum::debug!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + ?worker_dir_path, + "enabling landlock with exceptions: {:?}", + exceptions, + ); + + try_restrict(exceptions) +} + +// TODO: +/// Runs a check for landlock and returns a single bool indicating whether the given landlock +/// ABI is fully enabled on the current Linux environment. +pub fn check_is_fully_enabled() -> bool { + let status_from_thread: Result = + match std::thread::spawn(|| try_restrict(std::iter::empty::<(PathBuf, AccessFs)>())).join() + { + Ok(Ok(status)) => Ok(status), + Ok(Err(ruleset_err)) => Err(ruleset_err.into()), + Err(err) => Err(Error::Panic(stringify_panic_payload(err))), + }; + + matches!(status_from_thread, Ok(RulesetStatus::FullyEnforced)) +} + +/// Tries to restrict the current thread (should only be called in a process' main thread) with +/// the following landlock access controls: +/// +/// 1. all global filesystem access restricted, with optional exceptions +/// 2. ... more sandbox types (e.g. networking) may be supported in the future. +/// +/// If landlock is not supported in the current environment this is simply a noop. +/// +/// # Returns +/// +/// The status of the restriction (whether it was fully, partially, or not-at-all enforced). +fn try_restrict(fs_exceptions: I) -> Result +where + I: IntoIterator, + P: AsRef, + A: Into>, +{ + let mut ruleset = + Ruleset::default().handle_access(AccessFs::from_all(LANDLOCK_ABI))?.create()?; + for (fs_path, access_bits) in fs_exceptions { + let paths = &[fs_path.as_ref().to_owned()]; + let mut rules = path_beneath_rules(paths, access_bits).peekable(); + if rules.peek().is_none() { + // `path_beneath_rules` silently ignores missing paths, so check for it manually. + return Err(Error::InvalidExceptionPath(fs_path.as_ref().to_owned())) + } + ruleset = ruleset.add_rules(rules)?; + } + let status = ruleset.restrict_self()?; + Ok(status.ruleset) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::{fs, io::ErrorKind, thread}; + + #[test] + fn restricted_thread_cannot_read_file() { + // TODO: This would be nice: . + if !check_is_fully_enabled() { + return + } + + // Restricted thread cannot read from FS. + let handle = thread::spawn(|| { + // Create, write, and read two tmp files. This should succeed before any + // landlock restrictions are applied. + const TEXT: &str = "foo"; + let tmpfile1 = tempfile::NamedTempFile::new().unwrap(); + let path1 = tmpfile1.path(); + let tmpfile2 = tempfile::NamedTempFile::new().unwrap(); + let path2 = tmpfile2.path(); + + fs::write(path1, TEXT).unwrap(); + let s = fs::read_to_string(path1).unwrap(); + assert_eq!(s, TEXT); + fs::write(path2, TEXT).unwrap(); + let s = fs::read_to_string(path2).unwrap(); + assert_eq!(s, TEXT); + + // Apply Landlock with a read exception for only one of the files. + let status = try_restrict(vec![(path1, AccessFs::ReadFile)]); + if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { + panic!( + "Ruleset should be enforced since we checked if landlock is enabled: {:?}", + status + ); + } + + // Try to read from both files, only tmpfile1 should succeed. + let result = fs::read_to_string(path1); + assert!(matches!( + result, + Ok(s) if s == TEXT + )); + let result = fs::read_to_string(path2); + assert!(matches!( + result, + Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) + )); + + // Apply Landlock for all files. + let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>()); + if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { + panic!( + "Ruleset should be enforced since we checked if landlock is enabled: {:?}", + status + ); + } + + // Try to read from tmpfile1 after landlock, it should fail. + let result = fs::read_to_string(path1); + assert!(matches!( + result, + Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) + )); + }); + + assert!(handle.join().is_ok()); + } + + #[test] + fn restricted_thread_cannot_write_file() { + // TODO: This would be nice: . + if !check_is_fully_enabled() { + return + } + + // Restricted thread cannot write to FS. + let handle = thread::spawn(|| { + // Create and write two tmp files. This should succeed before any landlock + // restrictions are applied. + const TEXT: &str = "foo"; + let tmpfile1 = tempfile::NamedTempFile::new().unwrap(); + let path1 = tmpfile1.path(); + let tmpfile2 = tempfile::NamedTempFile::new().unwrap(); + let path2 = tmpfile2.path(); + + fs::write(path1, TEXT).unwrap(); + fs::write(path2, TEXT).unwrap(); + + // Apply Landlock with a write exception for only one of the files. + let status = try_restrict(vec![(path1, AccessFs::WriteFile)]); + if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { + panic!( + "Ruleset should be enforced since we checked if landlock is enabled: {:?}", + status + ); + } + + // Try to write to both files, only tmpfile1 should succeed. + let result = fs::write(path1, TEXT); + assert!(matches!(result, Ok(_))); + let result = fs::write(path2, TEXT); + assert!(matches!( + result, + Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) + )); + + // Apply Landlock for all files. + let status = try_restrict(std::iter::empty::<(PathBuf, AccessFs)>()); + if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { + panic!( + "Ruleset should be enforced since we checked if landlock is enabled: {:?}", + status + ); + } + + // Try to write to tmpfile1 after landlock, it should fail. + let result = fs::write(path1, TEXT); + assert!(matches!( + result, + Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) + )); + }); + + assert!(handle.join().is_ok()); + } + + // Test that checks whether landlock under our ABI version is able to truncate files. + #[test] + fn restricted_thread_can_truncate_file() { + // TODO: This would be nice: . + if !check_is_fully_enabled() { + return + } + + // Restricted thread can truncate file. + let handle = thread::spawn(|| { + // Create and write a file. This should succeed before any landlock + // restrictions are applied. + const TEXT: &str = "foo"; + let tmpfile = tempfile::NamedTempFile::new().unwrap(); + let path = tmpfile.path(); + + fs::write(path, TEXT).unwrap(); + + // Apply Landlock with all exceptions under the current ABI. + let status = try_restrict(vec![(path, AccessFs::from_all(LANDLOCK_ABI))]); + if !matches!(status, Ok(RulesetStatus::FullyEnforced)) { + panic!( + "Ruleset should be enforced since we checked if landlock is enabled: {:?}", + status + ); + } + + // Try to truncate the file. + let result = tmpfile.as_file().set_len(0); + assert!(result.is_ok()); + }); + + assert!(handle.join().is_ok()); + } +} diff --git a/polkadot/node/core/pvf/common/src/worker/security/mod.rs b/polkadot/node/core/pvf/common/src/worker/security/mod.rs new file mode 100644 index 000000000000..0f74df148707 --- /dev/null +++ b/polkadot/node/core/pvf/common/src/worker/security/mod.rs @@ -0,0 +1,206 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! Functionality for securing workers. +//! +//! This is needed because workers are used to compile and execute untrusted code (PVFs). +//! +//! We currently employ the following security measures: +//! +//! - Restrict filesystem +//! - Use Landlock to remove all unnecessary FS access rights. +//! - Unshare the user and mount namespaces. +//! - Change the root directory to a worker-specific temporary directory. +//! - Remove env vars + +use crate::{worker::WorkerKind, LOG_TARGET}; + +#[cfg(target_os = "linux")] +pub mod landlock; + +#[cfg(all(target_os = "linux", target_arch = "x86_64"))] +pub mod seccomp; + +/// Unshare the user namespace and change root to be the artifact directory. +/// +/// NOTE: This should not be called in a multi-threaded context. `unshare(2)`: +/// "CLONE_NEWUSER requires that the calling process is not threaded." +#[cfg(target_os = "linux")] +pub fn unshare_user_namespace_and_change_root( + worker_kind: WorkerKind, + worker_pid: u32, + worker_dir_path: &std::path::Path, +) -> Result<(), String> { + use std::{env, ffi::CString, os::unix::ffi::OsStrExt, path::Path, ptr}; + + // The following was copied from the `cstr_core` crate. + // + // TODO: Remove this once this is stable: https://github.com/rust-lang/rust/issues/105723 + #[inline] + #[doc(hidden)] + const fn cstr_is_valid(bytes: &[u8]) -> bool { + if bytes.is_empty() || bytes[bytes.len() - 1] != 0 { + return false + } + + let mut index = 0; + while index < bytes.len() - 1 { + if bytes[index] == 0 { + return false + } + index += 1; + } + true + } + + macro_rules! cstr { + ($e:expr) => {{ + const STR: &[u8] = concat!($e, "\0").as_bytes(); + const STR_VALID: bool = cstr_is_valid(STR); + let _ = [(); 0 - (!(STR_VALID) as usize)]; + #[allow(unused_unsafe)] + unsafe { + core::ffi::CStr::from_bytes_with_nul_unchecked(STR) + } + }} + } + + gum::debug!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + ?worker_dir_path, + "unsharing the user namespace and calling pivot_root", + ); + + let worker_dir_path_c = CString::new(worker_dir_path.as_os_str().as_bytes()) + .expect("on unix; the path will never contain 0 bytes; qed"); + + // Wrapper around all the work to prevent repetitive error handling. + // + // # Errors + // + // It's the caller's responsibility to call `Error::last_os_error`. Note that that alone does + // not give the context of which call failed, so we return a &str error. + || -> Result<(), &'static str> { + // SAFETY: We pass null-terminated C strings and use the APIs as documented. In fact, steps + // (2) and (3) are adapted from the example in pivot_root(2), with the additional + // change described in the `pivot_root(".", ".")` section. + unsafe { + // 1. `unshare` the user and the mount namespaces. + if libc::unshare(libc::CLONE_NEWUSER | libc::CLONE_NEWNS) < 0 { + return Err("unshare user and mount namespaces") + } + + // 2. Setup mounts. + // + // Ensure that new root and its parent mount don't have shared propagation (which would + // cause pivot_root() to return an error), and prevent propagation of mount events to + // the initial mount namespace. + if libc::mount( + ptr::null(), + cstr!("/").as_ptr(), + ptr::null(), + libc::MS_REC | libc::MS_PRIVATE, + ptr::null(), + ) < 0 + { + return Err("mount MS_PRIVATE") + } + // Ensure that the new root is a mount point. + let additional_flags = + if let WorkerKind::Execute | WorkerKind::CheckPivotRoot = worker_kind { + libc::MS_RDONLY + } else { + 0 + }; + if libc::mount( + worker_dir_path_c.as_ptr(), + worker_dir_path_c.as_ptr(), + ptr::null(), // ignored when MS_BIND is used + libc::MS_BIND | + libc::MS_REC | libc::MS_NOEXEC | + libc::MS_NODEV | libc::MS_NOSUID | + libc::MS_NOATIME | additional_flags, + ptr::null(), // ignored when MS_BIND is used + ) < 0 + { + return Err("mount MS_BIND") + } + + // 3. `pivot_root` to the artifact directory. + if libc::chdir(worker_dir_path_c.as_ptr()) < 0 { + return Err("chdir to worker dir path") + } + if libc::syscall(libc::SYS_pivot_root, cstr!(".").as_ptr(), cstr!(".").as_ptr()) < 0 { + return Err("pivot_root") + } + if libc::umount2(cstr!(".").as_ptr(), libc::MNT_DETACH) < 0 { + return Err("umount the old root mount point") + } + } + + Ok(()) + }() + .map_err(|err_ctx| { + let err = std::io::Error::last_os_error(); + format!("{}: {}", err_ctx, err) + })?; + + // Do some assertions. + if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") { + return Err("expected current dir after pivot_root to be `/`".into()) + } + env::set_current_dir("..").map_err(|err| err.to_string())?; + if env::current_dir().map_err(|err| err.to_string())? != Path::new("/") { + return Err("expected not to be able to break out of new root by doing `..`".into()) + } + + Ok(()) +} + +/// Require env vars to have been removed when spawning the process, to prevent malicious code from +/// accessing them. +pub fn check_env_vars_were_cleared(worker_kind: WorkerKind, worker_pid: u32) -> bool { + let mut ok = true; + + for (key, value) in std::env::vars_os() { + // TODO: *theoretically* the value (or mere presence) of `RUST_LOG` can be a source of + // randomness for malicious code. In the future we can remove it also and log in the host; + // see . + if key == "RUST_LOG" { + continue + } + // An exception for MacOS. This is not a secure platform anyway, so we let it slide. + #[cfg(target_os = "macos")] + if key == "__CF_USER_TEXT_ENCODING" { + continue + } + + gum::error!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + ?key, + ?value, + "env var was present that should have been removed", + ); + + ok = false; + } + + ok +} diff --git a/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs b/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs new file mode 100644 index 000000000000..f3c312cc4116 --- /dev/null +++ b/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs @@ -0,0 +1,197 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! Functionality for sandboxing workers by restricting their capabilities by blocking certain +//! syscalls with seccomp. +//! +//! For security we block the following: +//! +//! - creation of new sockets - these are unneeded in PVF jobs, and we can safely block them without +//! affecting consensus. +//! +//! - `io_uring` - allows for networking and needs to be blocked. See below for a discussion on the +//! safety of doing this. +//! +//! - `connect`ing to sockets - the above two points are enough for networking. However, it is +//! possible to [connect to abstract unix +//! sockets](https://lore.kernel.org/landlock/20231023.ahphah4Wii4v@digikod.net/T/#u) to do some +//! kinds of sandbox escapes, so we also block the `connect` syscall. +//! +//! # Safety of blocking io_uring +//! +//! `io_uring` is just a way of issuing system calls in an async manner, and there is nothing +//! stopping wasmtime from legitimately using it. Fortunately, at the moment it does not. Generally, +//! not many applications use `io_uring` in production yet, because of the numerous kernel CVEs +//! discovered. It's still under a lot of development. Android outright banned `io_uring` for these +//! reasons. +//! +//! Considering `io_uring`'s status discussed above, and that it very likely would get detected +//! either by our [static analysis](https://github.com/paritytech/polkadot-sdk/pull/1663) or by +//! testing, we think it is safe to block it. +//! +//! ## Consensus analysis +//! +//! If execution hits an edge case code path unique to a given machine, it's already taken a +//! non-deterministic branch anyway. After all, we just care that the majority of validators reach +//! the same result and preserve consensus. So worst-case scenario, there's a dispute, and we can +//! always admit fault and refund the wrong validator. On the other hand, if all validators take the +//! code path that results in a seccomp violation, then they would all vote against the current +//! candidate, which is also fine. The violation would get logged (in big scary letters) and +//! hopefully some validator reports it to us. +//! +//! Actually, a worst-worse-case scenario is that 50% of validators vote against, so that there is +//! no consensus. But so many things would have to go wrong for that to happen: +//! +//! 1. An update to `wasmtime` is introduced that uses io_uring (unlikely as io_uring is mainly for +//! IO-heavy applications) +//! +//! 2. The new syscall is not detected by our static analysis +//! +//! 3. It is never triggered in any of our tests +//! +//! 4. It then gets triggered on some super edge case in production on 50% of validators causing a +//! stall (bad but very unlikely) +//! +//! 5. Or, it triggers on only a few validators causing a dispute (more likely but not as bad) +//! +//! Considering how many things would have to go wrong here, we believe it's safe to block +//! `io_uring`. +//! +//! # Action on caught syscall +//! +//! TODO + +use crate::{ + worker::{stringify_panic_payload, WorkerKind}, + LOG_TARGET, +}; +use seccompiler::*; +use std::{collections::BTreeMap, path::Path}; + +/// The action to take on caught syscalls. +#[cfg(not(test))] +const CAUGHT_ACTION: SeccompAction = SeccompAction::KillProcess; +/// Don't kill the process when testing. +#[cfg(test)] +const CAUGHT_ACTION: SeccompAction = SeccompAction::Errno(libc::EACCES as u32); + +#[derive(thiserror::Error, Debug)] +pub enum Error { + #[error(transparent)] + Seccomp(#[from] seccompiler::Error), + #[error(transparent)] + Backend(#[from] seccompiler::BackendError), + #[error("A panic occurred in try_restrict: {0}")] + Panic(String), +} + +pub type Result = std::result::Result; + +/// Try to enable seccomp for the given kind of worker. +pub fn enable_for_worker( + worker_kind: WorkerKind, + worker_pid: u32, + worker_dir_path: &Path, +) -> Result<()> { + gum::debug!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + ?worker_dir_path, + "enabling seccomp", + ); + + try_restrict() +} + +/// Runs a check for seccomp and returns a single bool indicating whether seccomp with our rules is +/// fully enabled on the current Linux environment. +pub fn check_is_fully_enabled() -> bool { + let status_from_thread: Result<()> = match std::thread::spawn(|| try_restrict()).join() { + Ok(Ok(())) => Ok(()), + Ok(Err(err)) => Err(err.into()), + Err(err) => Err(Error::Panic(stringify_panic_payload(err))), + }; + + matches!(status_from_thread, Ok(())) +} + +/// Applies a `seccomp` filter to disable networking for the PVF threads. +pub fn try_restrict() -> Result<()> { + // Build a `seccomp` filter which by default allows all syscalls except those blocked in the + // blacklist. + let mut blacklisted_rules = BTreeMap::default(); + + // Restrict the creation of sockets. + blacklisted_rules.insert(libc::SYS_socketpair, vec![]); + blacklisted_rules.insert(libc::SYS_socket, vec![]); + + // Restrict io_uring. + blacklisted_rules.insert(libc::SYS_io_uring_setup, vec![]); + blacklisted_rules.insert(libc::SYS_io_uring_enter, vec![]); + blacklisted_rules.insert(libc::SYS_io_uring_register, vec![]); + + // Restrict connecting to abstract unix sockets. + blacklisted_rules.insert(libc::SYS_connect, vec![]); + + let filter = SeccompFilter::new( + blacklisted_rules, + // Mismatch action: what to do if not in rule list. + SeccompAction::Allow, + // Match action: what to do if in rule list. + CAUGHT_ACTION, + TargetArch::x86_64, + )?; + + let bpf_prog: BpfProgram = filter.try_into()?; + + // Applies filter (runs seccomp) to the calling thread. + seccompiler::apply_filter(&bpf_prog)?; + + Ok(()) +} + +#[cfg(test)] +mod tests { + use super::*; + use std::{io::ErrorKind, net::TcpListener, thread}; + + #[test] + fn sandboxed_thread_cannot_use_sockets() { + // TODO: This would be nice: . + if !check_is_fully_enabled() { + return + } + + let handle = thread::spawn(|| { + // Open a socket, this should succeed before seccomp is applied. + TcpListener::bind("127.0.0.1:0").unwrap(); + + let status = try_restrict(); + if !matches!(status, Ok(())) { + panic!("Ruleset should be enforced since we checked if seccomp is enabled"); + } + + // Try to open a socket after seccomp. + assert!(matches!( + TcpListener::bind("127.0.0.1:0"), + Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) + )); + }); + + assert!(handle.join().is_ok()); + } +} diff --git a/polkadot/node/core/pvf/src/host.rs b/polkadot/node/core/pvf/src/host.rs index 81695829122b..e3fcc294298e 100644 --- a/polkadot/node/core/pvf/src/host.rs +++ b/polkadot/node/core/pvf/src/host.rs @@ -29,7 +29,7 @@ use crate::{ use always_assert::never; use futures::{ channel::{mpsc, oneshot}, - Future, FutureExt, SinkExt, StreamExt, + join, Future, FutureExt, SinkExt, StreamExt, }; use polkadot_node_core_pvf_common::{ error::{PrepareError, PrepareResult}, @@ -200,15 +200,21 @@ impl Config { /// The future should not return normally but if it does then that indicates an unrecoverable error. /// In that case all pending requests will be canceled, dropping the result senders and new ones /// will be rejected. -pub fn start(config: Config, metrics: Metrics) -> (ValidationHost, impl Future) { +pub async fn start(config: Config, metrics: Metrics) -> (ValidationHost, impl Future) { gum::debug!(target: LOG_TARGET, ?config, "starting PVF validation host"); // Run checks for supported security features once per host startup. Warn here if not enabled. let security_status = { - let can_enable_landlock = check_landlock(&config.prepare_worker_program_path); - let can_unshare_user_namespace_and_change_root = - check_can_unshare_user_namespace_and_change_root(&config.prepare_worker_program_path); - SecurityStatus { can_enable_landlock, can_unshare_user_namespace_and_change_root } + let (can_enable_landlock, can_enable_seccomp, can_unshare_user_namespace_and_change_root) = join!( + check_landlock(&config.prepare_worker_program_path), + check_seccomp(&config.prepare_worker_program_path), + check_can_unshare_user_namespace_and_change_root(&config.prepare_worker_program_path) + ); + SecurityStatus { + can_enable_landlock, + can_enable_seccomp, + can_unshare_user_namespace_and_change_root, + } }; let (to_host_tx, to_host_rx) = mpsc::channel(10); @@ -887,17 +893,17 @@ fn pulse_every(interval: std::time::Duration) -> impl futures::Stream /// We do this check by spawning a new process and trying to sandbox it. To get as close as possible /// to running the check in a worker, we try it... in a worker. The expected return status is 0 on /// success and -1 on failure. -fn check_can_unshare_user_namespace_and_change_root( +async fn check_can_unshare_user_namespace_and_change_root( #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] prepare_worker_program_path: &Path, ) -> bool { cfg_if::cfg_if! { if #[cfg(target_os = "linux")] { - let output = std::process::Command::new(prepare_worker_program_path) + match tokio::process::Command::new(prepare_worker_program_path) .arg("--check-can-unshare-user-namespace-and-change-root") - .output(); - - match output { + .output() + .await + { Ok(output) if output.status.success() => true, Ok(output) => { let stderr = std::str::from_utf8(&output.stderr) @@ -938,15 +944,16 @@ fn check_can_unshare_user_namespace_and_change_root( /// We do this check by spawning a new process and trying to sandbox it. To get as close as possible /// to running the check in a worker, we try it... in a worker. The expected return status is 0 on /// success and -1 on failure. -fn check_landlock( +async fn check_landlock( #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] prepare_worker_program_path: &Path, ) -> bool { cfg_if::cfg_if! { if #[cfg(target_os = "linux")] { - match std::process::Command::new(prepare_worker_program_path) + match tokio::process::Command::new(prepare_worker_program_path) .arg("--check-can-enable-landlock") .status() + .await { Ok(status) if status.success() => true, Ok(status) => { @@ -981,6 +988,52 @@ fn check_landlock( } } +/// Check if seccomp is supported and emit a warning if not. +/// +/// We do this check by spawning a new process and trying to sandbox it. To get as close as possible +/// to running the check in a worker, we try it... in a worker. The expected return status is 0 on +/// success and -1 on failure. +async fn check_seccomp( + #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] + prepare_worker_program_path: &Path, +) -> bool { + cfg_if::cfg_if! { + if #[cfg(target_os = "linux")] { + match tokio::process::Command::new(prepare_worker_program_path) + .arg("--check-can-enable-seccomp") + .status() + .await + { + Ok(status) if status.success() => true, + Ok(status) => { + gum::warn!( + target: LOG_TARGET, + ?prepare_worker_program_path, + ?status, + "Cannot fully enable seccomp, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security." + ); + false + }, + Err(err) => { + gum::warn!( + target: LOG_TARGET, + ?prepare_worker_program_path, + "Could not start child process: {}", + err + ); + false + }, + } + } else { + gum::warn!( + target: LOG_TARGET, + "Cannot enable seccomp, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with seccomp support for maximum security." + ); + false + } + } +} + #[cfg(test)] pub(crate) mod tests { use super::*; diff --git a/polkadot/node/core/pvf/src/worker_intf.rs b/polkadot/node/core/pvf/src/worker_intf.rs index bd85d84055ce..906ef8401168 100644 --- a/polkadot/node/core/pvf/src/worker_intf.rs +++ b/polkadot/node/core/pvf/src/worker_intf.rs @@ -270,6 +270,9 @@ impl WorkerHandle { if security_status.can_enable_landlock { args.push("--can-enable-landlock".to_string()); } + if security_status.can_enable_seccomp { + args.push("--can-enable-seccomp".to_string()); + } if security_status.can_unshare_user_namespace_and_change_root { args.push("--can-unshare-user-namespace-and-change-root".to_string()); } diff --git a/polkadot/node/core/pvf/tests/it/adder.rs b/polkadot/node/core/pvf/tests/it/adder.rs index 8bdd09db208a..e8d8a9a6b63e 100644 --- a/polkadot/node/core/pvf/tests/it/adder.rs +++ b/polkadot/node/core/pvf/tests/it/adder.rs @@ -28,7 +28,7 @@ async fn execute_good_block_on_parent() { let block_data = BlockData { state: 0, add: 512 }; - let host = TestHost::new(); + let host = TestHost::new().await; let ret = host .validate_candidate( @@ -56,7 +56,7 @@ async fn execute_good_chain_on_parent() { let mut parent_hash = [0; 32]; let mut last_state = 0; - let host = TestHost::new(); + let host = TestHost::new().await; for (number, add) in (0..10).enumerate() { let parent_head = @@ -98,7 +98,7 @@ async fn execute_bad_block_on_parent() { add: 256, }; - let host = TestHost::new(); + let host = TestHost::new().await; let _err = host .validate_candidate( @@ -117,7 +117,7 @@ async fn execute_bad_block_on_parent() { #[tokio::test] async fn stress_spawn() { - let host = std::sync::Arc::new(TestHost::new()); + let host = std::sync::Arc::new(TestHost::new().await); async fn execute(host: std::sync::Arc) { let parent_head = HeadData { number: 0, parent_hash: [0; 32], post_state: hash_state(0) }; @@ -149,9 +149,12 @@ async fn stress_spawn() { // With one worker, run multiple execution jobs serially. They should not conflict. #[tokio::test] async fn execute_can_run_serially() { - let host = std::sync::Arc::new(TestHost::new_with_config(|cfg| { - cfg.execute_workers_max_num = 1; - })); + let host = std::sync::Arc::new( + TestHost::new_with_config(|cfg| { + cfg.execute_workers_max_num = 1; + }) + .await, + ); async fn execute(host: std::sync::Arc) { let parent_head = HeadData { number: 0, parent_hash: [0; 32], post_state: hash_state(0) }; diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index f699b5840d8f..c3b338538361 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -42,11 +42,11 @@ struct TestHost { } impl TestHost { - fn new() -> Self { - Self::new_with_config(|_| ()) + async fn new() -> Self { + Self::new_with_config(|_| ()).await } - fn new_with_config(f: F) -> Self + async fn new_with_config(f: F) -> Self where F: FnOnce(&mut Config), { @@ -65,7 +65,7 @@ impl TestHost { execute_worker_path, ); f(&mut config); - let (host, task) = start(config, Metrics::default()); + let (host, task) = start(config, Metrics::default()).await; let _ = tokio::task::spawn(task); Self { cache_dir, host: Mutex::new(host) } } @@ -131,7 +131,7 @@ impl TestHost { #[tokio::test] async fn terminates_on_timeout() { - let host = TestHost::new(); + let host = TestHost::new().await; let start = std::time::Instant::now(); let result = host @@ -161,7 +161,7 @@ async fn terminates_on_timeout() { #[tokio::test] async fn ensure_parallel_execution() { // Run some jobs that do not complete, thus timing out. - let host = TestHost::new(); + let host = TestHost::new().await; let execute_pvf_future_1 = host.validate_candidate( halt::wasm_binary_unwrap(), ValidationParams { @@ -208,7 +208,7 @@ async fn ensure_parallel_execution() { async fn execute_queue_doesnt_stall_if_workers_died() { let host = TestHost::new_with_config(|cfg| { cfg.execute_workers_max_num = 5; - }); + }).await; // Here we spawn 8 validation jobs for the `halt` PVF and share those between 5 workers. The // first five jobs should timeout and the workers killed. For the next 3 jobs a new batch of @@ -245,7 +245,7 @@ async fn execute_queue_doesnt_stall_if_workers_died() { async fn execute_queue_doesnt_stall_with_varying_executor_params() { let host = TestHost::new_with_config(|cfg| { cfg.execute_workers_max_num = 2; - }); + }).await; let executor_params_1 = ExecutorParams::default(); let executor_params_2 = ExecutorParams::from(&[ExecutorParam::StackLogicalMax(1024)][..]); @@ -293,7 +293,7 @@ async fn execute_queue_doesnt_stall_with_varying_executor_params() { // Test that deleting a prepared artifact does not lead to a dispute when we try to execute it. #[tokio::test] async fn deleting_prepared_artifact_does_not_dispute() { - let host = TestHost::new(); + let host = TestHost::new().await; let cache_dir = host.cache_dir.path(); let result = host @@ -354,7 +354,7 @@ async fn deleting_prepared_artifact_does_not_dispute() { async fn prepare_can_run_serially() { let host = TestHost::new_with_config(|cfg| { cfg.prepare_workers_hard_max_num = 1; - }); + }).await; let _stats = host .precheck_pvf(::adder::wasm_binary_unwrap(), Default::default()) diff --git a/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md b/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md index 6a14a3a013d4..41ee02028b0d 100644 --- a/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md +++ b/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md @@ -126,6 +126,17 @@ with untrusted code does not have unnecessary access to the file-system. This provides some protection against attackers accessing sensitive data or modifying data on the host machine. +*Currently this is only supported on Linux.* + +### Restricting networking + +We also disable networking on PVF threads by disabling certain syscalls, such as +the creation of sockets. This prevents attackers from either downloading +payloads or communicating sensitive data from the validator's machine to the +outside world. + +*Currently this is only supported on Linux.* + ### Clearing env vars We clear environment variables before handling untrusted code, because why give From 2bae7a622a4c3f75ccded9d408a6a5ecbe3a787e Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Tue, 24 Oct 2023 16:14:48 +0200 Subject: [PATCH 02/12] Remove tokio from workers Starting the tokio runtime was calling `socketpair` and triggering the new seccomp filter. Removed tokio since we wanted to do it soon anyway as part of https://github.com/paritytech/polkadot-sdk/issues/649. --- Cargo.lock | 3 --- polkadot/node/core/pvf/common/Cargo.toml | 1 - polkadot/node/core/pvf/common/src/lib.rs | 3 +-- .../node/core/pvf/common/src/worker/mod.rs | 25 ++++++------------- .../common/src/worker/security/landlock.rs | 6 ++--- .../pvf/common/src/worker/security/mod.rs | 9 ++++++- .../pvf/common/src/worker/security/seccomp.rs | 2 +- .../node/core/pvf/execute-worker/Cargo.toml | 1 - .../node/core/pvf/execute-worker/src/lib.rs | 4 +-- .../node/core/pvf/prepare-worker/Cargo.toml | 1 - .../node/core/pvf/prepare-worker/src/lib.rs | 9 +++---- .../pvf/prepare-worker/src/memory_stats.rs | 2 +- polkadot/node/core/pvf/tests/it/main.rs | 9 ++++--- 13 files changed, 33 insertions(+), 42 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e57e0265457c..10249c478a60 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -12258,7 +12258,6 @@ dependencies = [ "sp-tracing", "tempfile", "thiserror", - "tokio", "tracing-gum", ] @@ -12276,7 +12275,6 @@ dependencies = [ "sp-core", "sp-maybe-compressed-blob", "sp-tracing", - "tokio", "tracing-gum", ] @@ -12299,7 +12297,6 @@ dependencies = [ "sp-maybe-compressed-blob", "sp-tracing", "tikv-jemalloc-ctl", - "tokio", "tracing-gum", ] diff --git a/polkadot/node/core/pvf/common/Cargo.toml b/polkadot/node/core/pvf/common/Cargo.toml index e4ca20ffdb29..9287263ab50c 100644 --- a/polkadot/node/core/pvf/common/Cargo.toml +++ b/polkadot/node/core/pvf/common/Cargo.toml @@ -12,7 +12,6 @@ cpu-time = "1.0.0" futures = "0.3.21" gum = { package = "tracing-gum", path = "../../../gum" } libc = "0.2.139" -tokio = { version = "1.24.2", features = ["fs", "process", "io-util"] } parity-scale-codec = { version = "3.6.1", default-features = false, features = ["derive"] } diff --git a/polkadot/node/core/pvf/common/src/lib.rs b/polkadot/node/core/pvf/common/src/lib.rs index 813dccadf055..e2211b97d87b 100644 --- a/polkadot/node/core/pvf/common/src/lib.rs +++ b/polkadot/node/core/pvf/common/src/lib.rs @@ -32,10 +32,9 @@ pub use sp_tracing; const LOG_TARGET: &str = "parachain::pvf-common"; use std::{ - io::{Read, Write}, + io::{self, Read, Write}, mem, }; -use tokio::io; #[cfg(feature = "test-utils")] pub mod tests { diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs b/polkadot/node/core/pvf/common/src/worker/mod.rs index d1e17bac24ff..f0a3e7e5e98a 100644 --- a/polkadot/node/core/pvf/common/src/worker/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/mod.rs @@ -23,13 +23,12 @@ use cpu_time::ProcessTime; use futures::never::Never; use std::{ any::Any, - fmt, + fmt, io, os::unix::net::UnixStream, path::PathBuf, sync::mpsc::{Receiver, RecvTimeoutError}, time::Duration, }; -use tokio::{io, runtime::Runtime}; /// Use this macro to declare a `fn main() {}` that will create an executable that can be used for /// spawning the desired worker. @@ -198,7 +197,7 @@ impl fmt::Display for WorkerKind { // The worker version must be passed in so that we accurately get the version of the worker, and not // the version that this crate was compiled with. -pub fn worker_event_loop( +pub fn worker_event_loop( worker_kind: WorkerKind, socket_path: PathBuf, #[cfg_attr(not(target_os = "linux"), allow(unused_mut))] mut worker_dir_path: PathBuf, @@ -207,8 +206,7 @@ pub fn worker_event_loop( #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] security_status: &SecurityStatus, mut event_loop: F, ) where - F: FnMut(UnixStream, PathBuf) -> Fut, - Fut: futures::Future>, + F: FnMut(UnixStream, PathBuf) -> io::Result, { let worker_pid = std::process::id(); gum::debug!( @@ -262,7 +260,7 @@ pub fn worker_event_loop( } // Connect to the socket. - let stream = || -> std::io::Result { + let stream = || -> io::Result { let stream = UnixStream::connect(&socket_path)?; let _ = std::fs::remove_file(&socket_path); Ok(stream) @@ -362,18 +360,11 @@ pub fn worker_event_loop( } // Run the main worker loop. - let rt = Runtime::new().expect("Creates tokio runtime. If this panics the worker will die and the host will detect that and deal with it."); - let err = rt - .block_on(event_loop(stream, worker_dir_path)) + let err = event_loop(stream, worker_dir_path) // It's never `Ok` because it's `Ok(Never)`. .unwrap_err(); worker_shutdown_message(worker_kind, worker_pid, &err.to_string()); - - // We don't want tokio to wait for the tasks to finish. We want to bring down the worker as fast - // as possible and not wait for stalled validation to finish. This isn't strictly necessary now, - // but may be in the future. - rt.shutdown_background(); } /// Provide a consistent message on worker shutdown. @@ -454,7 +445,7 @@ fn kill_parent_node_in_emergency() { /// The motivation for this module is to coordinate worker threads without using async Rust. pub mod thread { use std::{ - panic, + io, panic, sync::{Arc, Condvar, Mutex}, thread, time::Duration, @@ -495,7 +486,7 @@ pub mod thread { f: F, cond: Cond, outcome: WaitOutcome, - ) -> std::io::Result> + ) -> io::Result> where F: FnOnce() -> R, F: Send + 'static + panic::UnwindSafe, @@ -513,7 +504,7 @@ pub mod thread { cond: Cond, outcome: WaitOutcome, stack_size: usize, - ) -> std::io::Result> + ) -> io::Result> where F: FnOnce() -> R, F: Send + 'static + panic::UnwindSafe, diff --git a/polkadot/node/core/pvf/common/src/worker/security/landlock.rs b/polkadot/node/core/pvf/common/src/worker/security/landlock.rs index 5444b1933b23..51500c733b8c 100644 --- a/polkadot/node/core/pvf/common/src/worker/security/landlock.rs +++ b/polkadot/node/core/pvf/common/src/worker/security/landlock.rs @@ -32,9 +32,7 @@ use crate::{ LOG_TARGET, }; use landlock::*; -use std::{ - path::{Path, PathBuf}, -}; +use std::path::{Path, PathBuf}; /// Landlock ABI version. We use ABI V1 because: /// @@ -103,7 +101,7 @@ pub fn enable_for_worker( panic!("this should only be passed for checking pivot_root; qed"), }; - gum::debug!( + gum::trace!( target: LOG_TARGET, %worker_kind, %worker_pid, diff --git a/polkadot/node/core/pvf/common/src/worker/security/mod.rs b/polkadot/node/core/pvf/common/src/worker/security/mod.rs index 0f74df148707..1c0d80237c77 100644 --- a/polkadot/node/core/pvf/common/src/worker/security/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/security/mod.rs @@ -78,7 +78,7 @@ pub fn unshare_user_namespace_and_change_root( }} } - gum::debug!( + gum::trace!( target: LOG_TARGET, %worker_kind, %worker_pid, @@ -175,6 +175,13 @@ pub fn unshare_user_namespace_and_change_root( /// Require env vars to have been removed when spawning the process, to prevent malicious code from /// accessing them. pub fn check_env_vars_were_cleared(worker_kind: WorkerKind, worker_pid: u32) -> bool { + gum::trace!( + target: LOG_TARGET, + %worker_kind, + %worker_pid, + "clearing env vars in worker", + ); + let mut ok = true; for (key, value) in std::env::vars_os() { diff --git a/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs b/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs index f3c312cc4116..38cdae249b86 100644 --- a/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs +++ b/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs @@ -106,7 +106,7 @@ pub fn enable_for_worker( worker_pid: u32, worker_dir_path: &Path, ) -> Result<()> { - gum::debug!( + gum::trace!( target: LOG_TARGET, %worker_kind, %worker_pid, diff --git a/polkadot/node/core/pvf/execute-worker/Cargo.toml b/polkadot/node/core/pvf/execute-worker/Cargo.toml index 23678d95696e..203bbd0e7859 100644 --- a/polkadot/node/core/pvf/execute-worker/Cargo.toml +++ b/polkadot/node/core/pvf/execute-worker/Cargo.toml @@ -11,7 +11,6 @@ cpu-time = "1.0.0" futures = "0.3.21" gum = { package = "tracing-gum", path = "../../../gum" } rayon = "1.5.1" -tokio = { version = "1.24.2", features = ["fs", "process"] } parity-scale-codec = { version = "3.6.1", default-features = false, features = ["derive"] } diff --git a/polkadot/node/core/pvf/execute-worker/src/lib.rs b/polkadot/node/core/pvf/execute-worker/src/lib.rs index af73eb16e685..8872f9bc8dd3 100644 --- a/polkadot/node/core/pvf/execute-worker/src/lib.rs +++ b/polkadot/node/core/pvf/execute-worker/src/lib.rs @@ -39,12 +39,12 @@ use polkadot_node_core_pvf_common::{ use polkadot_parachain_primitives::primitives::ValidationResult; use polkadot_primitives::{executor_params::DEFAULT_NATIVE_STACK_MAX, ExecutorParams}; use std::{ + io, os::unix::net::UnixStream, path::PathBuf, sync::{mpsc::channel, Arc}, time::Duration, }; -use tokio::io; // Wasmtime powers the Substrate Executor. It compiles the wasm bytecode into native code. // That native code does not create any stacks and just reuses the stack of the thread that @@ -138,7 +138,7 @@ pub fn worker_entrypoint( node_version, worker_version, &security_status, - |mut stream, worker_dir_path| async move { + |mut stream, worker_dir_path| { let worker_pid = std::process::id(); let artifact_path = worker_dir::execute_artifact(&worker_dir_path); diff --git a/polkadot/node/core/pvf/prepare-worker/Cargo.toml b/polkadot/node/core/pvf/prepare-worker/Cargo.toml index 886209b78c32..eb53ebdc941b 100644 --- a/polkadot/node/core/pvf/prepare-worker/Cargo.toml +++ b/polkadot/node/core/pvf/prepare-worker/Cargo.toml @@ -13,7 +13,6 @@ gum = { package = "tracing-gum", path = "../../../gum" } libc = "0.2.139" rayon = "1.5.1" tikv-jemalloc-ctl = { version = "0.5.0", optional = true } -tokio = { version = "1.24.2", features = ["fs", "process"] } parity-scale-codec = { version = "3.6.1", default-features = false, features = ["derive"] } diff --git a/polkadot/node/core/pvf/prepare-worker/src/lib.rs b/polkadot/node/core/pvf/prepare-worker/src/lib.rs index fa5d3656a35e..926d9cabe182 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/lib.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/lib.rs @@ -45,12 +45,12 @@ use polkadot_node_core_pvf_common::{ }; use polkadot_primitives::ExecutorParams; use std::{ + fs, io, os::unix::net::UnixStream, path::PathBuf, sync::{mpsc::channel, Arc}, time::Duration, }; -use tokio::io; /// Contains the bytes for a successfully compiled artifact. pub struct CompiledArtifact(Vec); @@ -131,7 +131,7 @@ pub fn worker_entrypoint( node_version, worker_version, &security_status, - |mut stream, worker_dir_path| async move { + |mut stream, worker_dir_path| { let worker_pid = std::process::id(); let temp_artifact_dest = worker_dir::prepare_tmp_artifact(&worker_dir_path); @@ -229,8 +229,7 @@ pub fn worker_entrypoint( // Stop the memory stats worker and get its observed memory stats. #[cfg(any(target_os = "linux", feature = "jemalloc-allocator"))] - let memory_tracker_stats = get_memory_tracker_loop_stats(memory_tracker_thread, worker_pid) - .await; + let memory_tracker_stats = get_memory_tracker_loop_stats(memory_tracker_thread, worker_pid); let memory_stats = MemoryStats { #[cfg(any( target_os = "linux", @@ -255,7 +254,7 @@ pub fn worker_entrypoint( "worker: writing artifact to {}", temp_artifact_dest.display(), ); - tokio::fs::write(&temp_artifact_dest, &artifact).await?; + fs::write(&temp_artifact_dest, &artifact)?; Ok(PrepareStats { cpu_time_elapsed, memory_stats }) }, diff --git a/polkadot/node/core/pvf/prepare-worker/src/memory_stats.rs b/polkadot/node/core/pvf/prepare-worker/src/memory_stats.rs index c70ff56fc84d..5f577b0901c2 100644 --- a/polkadot/node/core/pvf/prepare-worker/src/memory_stats.rs +++ b/polkadot/node/core/pvf/prepare-worker/src/memory_stats.rs @@ -122,7 +122,7 @@ pub mod memory_tracker { } /// Helper function to get the stats from the memory tracker. Helps isolate this error handling. - pub async fn get_memory_tracker_loop_stats( + pub fn get_memory_tracker_loop_stats( thread: JoinHandle>, worker_pid: u32, ) -> Option { diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index c3b338538361..af6a5f09ba17 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -208,7 +208,8 @@ async fn ensure_parallel_execution() { async fn execute_queue_doesnt_stall_if_workers_died() { let host = TestHost::new_with_config(|cfg| { cfg.execute_workers_max_num = 5; - }).await; + }) + .await; // Here we spawn 8 validation jobs for the `halt` PVF and share those between 5 workers. The // first five jobs should timeout and the workers killed. For the next 3 jobs a new batch of @@ -245,7 +246,8 @@ async fn execute_queue_doesnt_stall_if_workers_died() { async fn execute_queue_doesnt_stall_with_varying_executor_params() { let host = TestHost::new_with_config(|cfg| { cfg.execute_workers_max_num = 2; - }).await; + }) + .await; let executor_params_1 = ExecutorParams::default(); let executor_params_2 = ExecutorParams::from(&[ExecutorParam::StackLogicalMax(1024)][..]); @@ -354,7 +356,8 @@ async fn deleting_prepared_artifact_does_not_dispute() { async fn prepare_can_run_serially() { let host = TestHost::new_with_config(|cfg| { cfg.prepare_workers_hard_max_num = 1; - }).await; + }) + .await; let _stats = host .precheck_pvf(::adder::wasm_binary_unwrap(), Default::default()) From a7ce198c044c542560ebe6808e01848e5f2d9716 Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Wed, 25 Oct 2023 10:51:59 +0200 Subject: [PATCH 03/12] Address review --- Cargo.lock | 4 ++-- polkadot/node/core/pvf/common/Cargo.toml | 2 +- polkadot/node/core/pvf/common/src/worker/mod.rs | 4 ++-- .../core/pvf/common/src/worker/security/mod.rs | 1 + .../pvf/common/src/worker/security/seccomp.rs | 16 ++++++++-------- 5 files changed, 14 insertions(+), 13 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 10249c478a60..8c9635c54879 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -16101,9 +16101,9 @@ dependencies = [ [[package]] name = "seccompiler" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6f6575e3c2b3a0fe2ef3e53855b6a8dead7c29f783da5e123d378c8c6a89017e" +checksum = "345a3e4dddf721a478089d4697b83c6c0a8f5bf16086f6c13397e4534eb6e2e5" dependencies = [ "libc", ] diff --git a/polkadot/node/core/pvf/common/Cargo.toml b/polkadot/node/core/pvf/common/Cargo.toml index 9287263ab50c..dacdd1fcba2b 100644 --- a/polkadot/node/core/pvf/common/Cargo.toml +++ b/polkadot/node/core/pvf/common/Cargo.toml @@ -29,7 +29,7 @@ sp-tracing = { path = "../../../../../substrate/primitives/tracing" } [target.'cfg(target_os = "linux")'.dependencies] landlock = "0.3.0" -seccompiler = "0.3.0" +seccompiler = "0.4.0" thiserror = "1.0.31" [dev-dependencies] diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs b/polkadot/node/core/pvf/common/src/worker/mod.rs index f0a3e7e5e98a..4d87797ab25e 100644 --- a/polkadot/node/core/pvf/common/src/worker/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/mod.rs @@ -320,7 +320,7 @@ pub fn worker_event_loop( target: LOG_TARGET, %worker_kind, %worker_pid, - "could not fully enable landlock: {:?}. This should not happen, please report to the Polkadot devs", + "could not fully enable landlock: {:?}. This should not happen, please report an issue", landlock_status ); } @@ -339,7 +339,7 @@ pub fn worker_event_loop( target: LOG_TARGET, %worker_kind, %worker_pid, - "could not fully enable seccomp: {:?}. This should not happen, please report to the Polkadot devs", + "could not fully enable seccomp: {:?}. This should not happen, please report an issue", seccomp_status ); } diff --git a/polkadot/node/core/pvf/common/src/worker/security/mod.rs b/polkadot/node/core/pvf/common/src/worker/security/mod.rs index 1c0d80237c77..63c5bb603120 100644 --- a/polkadot/node/core/pvf/common/src/worker/security/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/security/mod.rs @@ -24,6 +24,7 @@ //! - Use Landlock to remove all unnecessary FS access rights. //! - Unshare the user and mount namespaces. //! - Change the root directory to a worker-specific temporary directory. +//! - Restrict networking by blocking socket creation and io_uring. //! - Remove env vars use crate::{worker::WorkerKind, LOG_TARGET}; diff --git a/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs b/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs index 38cdae249b86..de719d93e31d 100644 --- a/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs +++ b/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs @@ -25,11 +25,6 @@ //! - `io_uring` - allows for networking and needs to be blocked. See below for a discussion on the //! safety of doing this. //! -//! - `connect`ing to sockets - the above two points are enough for networking. However, it is -//! possible to [connect to abstract unix -//! sockets](https://lore.kernel.org/landlock/20231023.ahphah4Wii4v@digikod.net/T/#u) to do some -//! kinds of sandbox escapes, so we also block the `connect` syscall. -//! //! # Safety of blocking io_uring //! //! `io_uring` is just a way of issuing system calls in an async manner, and there is nothing @@ -139,14 +134,14 @@ pub fn try_restrict() -> Result<()> { blacklisted_rules.insert(libc::SYS_socketpair, vec![]); blacklisted_rules.insert(libc::SYS_socket, vec![]); + // Prevent connecting to sockets for extra safety. + blacklisted_rules.insert(libc::SYS_connect, vec![]); + // Restrict io_uring. blacklisted_rules.insert(libc::SYS_io_uring_setup, vec![]); blacklisted_rules.insert(libc::SYS_io_uring_enter, vec![]); blacklisted_rules.insert(libc::SYS_io_uring_register, vec![]); - // Restrict connecting to abstract unix sockets. - blacklisted_rules.insert(libc::SYS_connect, vec![]); - let filter = SeccompFilter::new( blacklisted_rules, // Mismatch action: what to do if not in rule list. @@ -190,6 +185,11 @@ mod tests { TcpListener::bind("127.0.0.1:0"), Err(err) if matches!(err.kind(), ErrorKind::PermissionDenied) )); + + // Other syscalls should still work. + unsafe { + assert!(libc::getppid() > 0); + } }); assert!(handle.join().is_ok()); From cd10afac1ddf2b738b944586cfba217d56fc0949 Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Wed, 25 Oct 2023 10:54:19 +0200 Subject: [PATCH 04/12] Update syscall lists --- polkadot/scripts/list-syscalls/execute-worker-syscalls | 9 --------- polkadot/scripts/list-syscalls/prepare-worker-syscalls | 9 --------- 2 files changed, 18 deletions(-) diff --git a/polkadot/scripts/list-syscalls/execute-worker-syscalls b/polkadot/scripts/list-syscalls/execute-worker-syscalls index 05abe9ba7368..4a7a66181299 100644 --- a/polkadot/scripts/list-syscalls/execute-worker-syscalls +++ b/polkadot/scripts/list-syscalls/execute-worker-syscalls @@ -24,10 +24,8 @@ 42 (connect) 45 (recvfrom) 46 (sendmsg) -53 (socketpair) 56 (clone) 60 (exit) -61 (wait4) 62 (kill) 72 (fcntl) 79 (getcwd) @@ -52,23 +50,16 @@ 200 (tkill) 202 (futex) 204 (sched_getaffinity) -213 (epoll_create) 217 (getdents64) 218 (set_tid_address) 228 (clock_gettime) 230 (clock_nanosleep) 231 (exit_group) -232 (epoll_wait) -233 (epoll_ctl) 257 (openat) 262 (newfstatat) 263 (unlinkat) 272 (unshare) 273 (set_robust_list) -281 (epoll_pwait) -284 (eventfd) -290 (eventfd2) -291 (epoll_create1) 302 (prlimit64) 318 (getrandom) 319 (memfd_create) diff --git a/polkadot/scripts/list-syscalls/prepare-worker-syscalls b/polkadot/scripts/list-syscalls/prepare-worker-syscalls index f1597f206756..cab58e06692b 100644 --- a/polkadot/scripts/list-syscalls/prepare-worker-syscalls +++ b/polkadot/scripts/list-syscalls/prepare-worker-syscalls @@ -24,10 +24,8 @@ 42 (connect) 45 (recvfrom) 46 (sendmsg) -53 (socketpair) 56 (clone) 60 (exit) -61 (wait4) 62 (kill) 72 (fcntl) 79 (getcwd) @@ -54,23 +52,16 @@ 202 (futex) 203 (sched_setaffinity) 204 (sched_getaffinity) -213 (epoll_create) 217 (getdents64) 218 (set_tid_address) 228 (clock_gettime) 230 (clock_nanosleep) 231 (exit_group) -232 (epoll_wait) -233 (epoll_ctl) 257 (openat) 262 (newfstatat) 263 (unlinkat) 272 (unshare) 273 (set_robust_list) -281 (epoll_pwait) -284 (eventfd) -290 (eventfd2) -291 (epoll_create1) 302 (prlimit64) 309 (getcpu) 318 (getrandom) From f0b41bdf7e3896119fc27da0d6a1204fcc1931cd Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Wed, 25 Oct 2023 11:24:05 +0200 Subject: [PATCH 05/12] Fix benchmark --- .../pvf/benches/host_prepare_rococo_runtime.rs | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/polkadot/node/core/pvf/benches/host_prepare_rococo_runtime.rs b/polkadot/node/core/pvf/benches/host_prepare_rococo_runtime.rs index 3069fa2b194b..acd80526262c 100644 --- a/polkadot/node/core/pvf/benches/host_prepare_rococo_runtime.rs +++ b/polkadot/node/core/pvf/benches/host_prepare_rococo_runtime.rs @@ -17,18 +17,15 @@ //! Benchmarks for preparation through the host. We use a real PVF to get realistic results. use criterion::{criterion_group, criterion_main, BatchSize, Criterion, SamplingMode}; -use parity_scale_codec::Encode; use polkadot_node_core_pvf::{ start, testing, Config, Metrics, PrepareError, PrepareJobKind, PrepareStats, PvfPrepData, - ValidationError, ValidationHost, + ValidationHost, }; -use polkadot_parachain_primitives::primitives::{BlockData, ValidationParams, ValidationResult}; use polkadot_primitives::ExecutorParams; use rococo_runtime::WASM_BINARY; use std::time::Duration; use tokio::{runtime::Handle, sync::Mutex}; -const TEST_EXECUTION_TIMEOUT: Duration = Duration::from_secs(3); const TEST_PREPARATION_TIMEOUT: Duration = Duration::from_secs(30); struct TestHost { @@ -36,7 +33,7 @@ struct TestHost { } impl TestHost { - fn new_with_config(handle: &Handle, f: F) -> Self + async fn new_with_config(handle: &Handle, f: F) -> Self where F: FnOnce(&mut Config), { @@ -50,7 +47,7 @@ impl TestHost { execute_worker_path, ); f(&mut config); - let (host, task) = start(config, Metrics::default()); + let (host, task) = start(config, Metrics::default()).await; let _ = handle.spawn(task); Self { host: Mutex::new(host) } } @@ -107,15 +104,18 @@ fn host_prepare_rococo_runtime(c: &mut Criterion) { group.measurement_time(Duration::from_secs(240)); group.bench_function("host: prepare Rococo runtime", |b| { b.to_async(&rt).iter_batched( - || { + || async { ( TestHost::new_with_config(rt.handle(), |cfg| { cfg.prepare_workers_hard_max_num = 1; - }), + }) + .await, pvf.clone().code(), ) }, - |(host, pvf_code)| async move { + |result| async move { + let (host, pvf_code) = result.await; + // `PvfPrepData` is designed to be cheap to clone, so cloning shouldn't affect the // benchmark accuracy. let _stats = host.precheck_pvf(&pvf_code, Default::default()).await.unwrap(); From 122eeb48364fcbe1428cb64a087a924423ba9eb4 Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Wed, 25 Oct 2023 15:05:19 +0200 Subject: [PATCH 06/12] Address some comments --- .../node/core/pvf/common/src/worker/mod.rs | 8 ++-- .../pvf/common/src/worker/security/mod.rs | 39 ++++--------------- 2 files changed, 12 insertions(+), 35 deletions(-) diff --git a/polkadot/node/core/pvf/common/src/worker/mod.rs b/polkadot/node/core/pvf/common/src/worker/mod.rs index 25c75d24849c..274a2fc80397 100644 --- a/polkadot/node/core/pvf/common/src/worker/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/mod.rs @@ -85,9 +85,9 @@ macro_rules! decl_worker_main { std::process::exit(status) }, "--check-can-enable-seccomp" => { - #[cfg(target_os = "linux")] + #[cfg(all(target_os = "linux", target_arch = "x86_64"))] let status = if security::seccomp::check_is_fully_enabled() { 0 } else { -1 }; - #[cfg(not(target_os = "linux"))] + #[cfg(not(all(target_os = "linux", target_arch = "x86_64")))] let status = -1; std::process::exit(status) }, @@ -336,7 +336,9 @@ pub fn worker_event_loop( } } - #[cfg(target_os = "linux")] + // TODO: We can enable the seccomp networking blacklist on aarch64 as well, but we need a CI + // job to catch regressions. See . + #[cfg(all(target_os = "linux", target_arch = "x86_64"))] if security_status.can_enable_seccomp { let seccomp_status = security::seccomp::enable_for_worker(worker_kind, worker_pid, &worker_dir_path); diff --git a/polkadot/node/core/pvf/common/src/worker/security/mod.rs b/polkadot/node/core/pvf/common/src/worker/security/mod.rs index 63c5bb603120..9a38ed172773 100644 --- a/polkadot/node/core/pvf/common/src/worker/security/mod.rs +++ b/polkadot/node/core/pvf/common/src/worker/security/mod.rs @@ -47,36 +47,11 @@ pub fn unshare_user_namespace_and_change_root( ) -> Result<(), String> { use std::{env, ffi::CString, os::unix::ffi::OsStrExt, path::Path, ptr}; - // The following was copied from the `cstr_core` crate. - // // TODO: Remove this once this is stable: https://github.com/rust-lang/rust/issues/105723 - #[inline] - #[doc(hidden)] - const fn cstr_is_valid(bytes: &[u8]) -> bool { - if bytes.is_empty() || bytes[bytes.len() - 1] != 0 { - return false - } - - let mut index = 0; - while index < bytes.len() - 1 { - if bytes[index] == 0 { - return false - } - index += 1; - } - true - } - - macro_rules! cstr { - ($e:expr) => {{ - const STR: &[u8] = concat!($e, "\0").as_bytes(); - const STR_VALID: bool = cstr_is_valid(STR); - let _ = [(); 0 - (!(STR_VALID) as usize)]; - #[allow(unused_unsafe)] - unsafe { - core::ffi::CStr::from_bytes_with_nul_unchecked(STR) - } - }} + macro_rules! cstr_ptr { + ($e:expr) => { + concat!($e, "\0").as_ptr().cast::() + }; } gum::trace!( @@ -113,7 +88,7 @@ pub fn unshare_user_namespace_and_change_root( // the initial mount namespace. if libc::mount( ptr::null(), - cstr!("/").as_ptr(), + cstr_ptr!("/"), ptr::null(), libc::MS_REC | libc::MS_PRIVATE, ptr::null(), @@ -146,10 +121,10 @@ pub fn unshare_user_namespace_and_change_root( if libc::chdir(worker_dir_path_c.as_ptr()) < 0 { return Err("chdir to worker dir path") } - if libc::syscall(libc::SYS_pivot_root, cstr!(".").as_ptr(), cstr!(".").as_ptr()) < 0 { + if libc::syscall(libc::SYS_pivot_root, cstr_ptr!("."), cstr_ptr!(".")) < 0 { return Err("pivot_root") } - if libc::umount2(cstr!(".").as_ptr(), libc::MNT_DETACH) < 0 { + if libc::umount2(cstr_ptr!("."), libc::MNT_DETACH) < 0 { return Err("umount the old root mount point") } } From f792d88365aaf5c35f14d133b338fc3423d3bbe2 Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Thu, 26 Oct 2023 13:25:58 +0200 Subject: [PATCH 07/12] Move security checks to a separate module --- polkadot/node/core/pvf/src/host.rs | 157 ++--------------------- polkadot/node/core/pvf/src/lib.rs | 1 + polkadot/node/core/pvf/src/security.rs | 164 +++++++++++++++++++++++++ 3 files changed, 172 insertions(+), 150 deletions(-) create mode 100644 polkadot/node/core/pvf/src/security.rs diff --git a/polkadot/node/core/pvf/src/host.rs b/polkadot/node/core/pvf/src/host.rs index dea9fee27ad3..450be69f5c58 100644 --- a/polkadot/node/core/pvf/src/host.rs +++ b/polkadot/node/core/pvf/src/host.rs @@ -24,7 +24,7 @@ use crate::{ artifacts::{ArtifactId, ArtifactPathId, ArtifactState, Artifacts}, execute::{self, PendingExecutionRequest}, metrics::Metrics, - prepare, Priority, ValidationError, LOG_TARGET, + prepare, security, Priority, ValidationError, LOG_TARGET, }; use always_assert::never; use futures::{ @@ -205,10 +205,13 @@ pub async fn start(config: Config, metrics: Metrics) -> (ValidationHost, impl Fu // Run checks for supported security features once per host startup. Warn here if not enabled. let security_status = { + // TODO: add check that syslog is available and that seccomp violations are logged? let (can_enable_landlock, can_enable_seccomp, can_unshare_user_namespace_and_change_root) = join!( - check_landlock(&config.prepare_worker_program_path), - check_seccomp(&config.prepare_worker_program_path), - check_can_unshare_user_namespace_and_change_root(&config.prepare_worker_program_path) + security::check_landlock(&config.prepare_worker_program_path), + security::check_seccomp(&config.prepare_worker_program_path), + security::check_can_unshare_user_namespace_and_change_root( + &config.prepare_worker_program_path + ) ); SecurityStatus { can_enable_landlock, @@ -888,152 +891,6 @@ fn pulse_every(interval: std::time::Duration) -> impl futures::Stream .map(|_| ()) } -/// Check if we can sandbox the root and emit a warning if not. -/// -/// We do this check by spawning a new process and trying to sandbox it. To get as close as possible -/// to running the check in a worker, we try it... in a worker. The expected return status is 0 on -/// success and -1 on failure. -async fn check_can_unshare_user_namespace_and_change_root( - #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] - prepare_worker_program_path: &Path, -) -> bool { - cfg_if::cfg_if! { - if #[cfg(target_os = "linux")] { - match tokio::process::Command::new(prepare_worker_program_path) - .arg("--check-can-unshare-user-namespace-and-change-root") - .output() - .await - { - Ok(output) if output.status.success() => true, - Ok(output) => { - let stderr = std::str::from_utf8(&output.stderr) - .expect("child process writes a UTF-8 string to stderr; qed") - .trim(); - gum::warn!( - target: LOG_TARGET, - ?prepare_worker_program_path, - // Docs say to always print status using `Display` implementation. - status = %output.status, - %stderr, - "Cannot unshare user namespace and change root, which are Linux-specific kernel security features. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running with support for unsharing user namespaces for maximum security." - ); - false - }, - Err(err) => { - gum::warn!( - target: LOG_TARGET, - ?prepare_worker_program_path, - "Could not start child process: {}", - err - ); - false - }, - } - } else { - gum::warn!( - target: LOG_TARGET, - "Cannot unshare user namespace and change root, which are Linux-specific kernel security features. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with support for unsharing user namespaces for maximum security." - ); - false - } - } -} - -/// Check if landlock is supported and emit a warning if not. -/// -/// We do this check by spawning a new process and trying to sandbox it. To get as close as possible -/// to running the check in a worker, we try it... in a worker. The expected return status is 0 on -/// success and -1 on failure. -async fn check_landlock( - #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] - prepare_worker_program_path: &Path, -) -> bool { - cfg_if::cfg_if! { - if #[cfg(target_os = "linux")] { - match tokio::process::Command::new(prepare_worker_program_path) - .arg("--check-can-enable-landlock") - .status() - .await - { - Ok(status) if status.success() => true, - Ok(status) => { - let abi = - polkadot_node_core_pvf_common::worker::security::landlock::LANDLOCK_ABI as u8; - gum::warn!( - target: LOG_TARGET, - ?prepare_worker_program_path, - ?status, - %abi, - "Cannot fully enable landlock, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security." - ); - false - }, - Err(err) => { - gum::warn!( - target: LOG_TARGET, - ?prepare_worker_program_path, - "Could not start child process: {}", - err - ); - false - }, - } - } else { - gum::warn!( - target: LOG_TARGET, - "Cannot enable landlock, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with landlock support for maximum security." - ); - false - } - } -} - -/// Check if seccomp is supported and emit a warning if not. -/// -/// We do this check by spawning a new process and trying to sandbox it. To get as close as possible -/// to running the check in a worker, we try it... in a worker. The expected return status is 0 on -/// success and -1 on failure. -async fn check_seccomp( - #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] - prepare_worker_program_path: &Path, -) -> bool { - cfg_if::cfg_if! { - if #[cfg(target_os = "linux")] { - match tokio::process::Command::new(prepare_worker_program_path) - .arg("--check-can-enable-seccomp") - .status() - .await - { - Ok(status) if status.success() => true, - Ok(status) => { - gum::warn!( - target: LOG_TARGET, - ?prepare_worker_program_path, - ?status, - "Cannot fully enable seccomp, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security." - ); - false - }, - Err(err) => { - gum::warn!( - target: LOG_TARGET, - ?prepare_worker_program_path, - "Could not start child process: {}", - err - ); - false - }, - } - } else { - gum::warn!( - target: LOG_TARGET, - "Cannot enable seccomp, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with seccomp support for maximum security." - ); - false - } - } -} - #[cfg(test)] pub(crate) mod tests { use super::*; diff --git a/polkadot/node/core/pvf/src/lib.rs b/polkadot/node/core/pvf/src/lib.rs index 27630af40c2f..102a91dbdad7 100644 --- a/polkadot/node/core/pvf/src/lib.rs +++ b/polkadot/node/core/pvf/src/lib.rs @@ -97,6 +97,7 @@ mod host; mod metrics; mod prepare; mod priority; +mod security; mod worker_intf; #[cfg(feature = "test-utils")] diff --git a/polkadot/node/core/pvf/src/security.rs b/polkadot/node/core/pvf/src/security.rs new file mode 100644 index 000000000000..4b9b1f077474 --- /dev/null +++ b/polkadot/node/core/pvf/src/security.rs @@ -0,0 +1,164 @@ +// Copyright (C) Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use crate::LOG_TARGET; +use std::path::Path; + +/// Check if we can sandbox the root and emit a warning if not. +/// +/// We do this check by spawning a new process and trying to sandbox it. To get as close as possible +/// to running the check in a worker, we try it... in a worker. The expected return status is 0 on +/// success and -1 on failure. +pub async fn check_can_unshare_user_namespace_and_change_root( + #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] + prepare_worker_program_path: &Path, +) -> bool { + cfg_if::cfg_if! { + if #[cfg(target_os = "linux")] { + match tokio::process::Command::new(prepare_worker_program_path) + .arg("--check-can-unshare-user-namespace-and-change-root") + .output() + .await + { + Ok(output) if output.status.success() => true, + Ok(output) => { + let stderr = std::str::from_utf8(&output.stderr) + .expect("child process writes a UTF-8 string to stderr; qed") + .trim(); + gum::warn!( + target: LOG_TARGET, + ?prepare_worker_program_path, + // Docs say to always print status using `Display` implementation. + status = %output.status, + %stderr, + "Cannot unshare user namespace and change root, which are Linux-specific kernel security features. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running with support for unsharing user namespaces for maximum security." + ); + false + }, + Err(err) => { + gum::warn!( + target: LOG_TARGET, + ?prepare_worker_program_path, + "Could not start child process: {}", + err + ); + false + }, + } + } else { + gum::warn!( + target: LOG_TARGET, + "Cannot unshare user namespace and change root, which are Linux-specific kernel security features. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with support for unsharing user namespaces for maximum security." + ); + false + } + } +} + +/// Check if landlock is supported and emit a warning if not. +/// +/// We do this check by spawning a new process and trying to sandbox it. To get as close as possible +/// to running the check in a worker, we try it... in a worker. The expected return status is 0 on +/// success and -1 on failure. +pub async fn check_landlock( + #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] + prepare_worker_program_path: &Path, +) -> bool { + cfg_if::cfg_if! { + if #[cfg(target_os = "linux")] { + match tokio::process::Command::new(prepare_worker_program_path) + .arg("--check-can-enable-landlock") + .status() + .await + { + Ok(status) if status.success() => true, + Ok(status) => { + let abi = + polkadot_node_core_pvf_common::worker::security::landlock::LANDLOCK_ABI as u8; + gum::warn!( + target: LOG_TARGET, + ?prepare_worker_program_path, + ?status, + %abi, + "Cannot fully enable landlock, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security." + ); + false + }, + Err(err) => { + gum::warn!( + target: LOG_TARGET, + ?prepare_worker_program_path, + "Could not start child process: {}", + err + ); + false + }, + } + } else { + gum::warn!( + target: LOG_TARGET, + "Cannot enable landlock, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with landlock support for maximum security." + ); + false + } + } +} + +/// Check if seccomp is supported and emit a warning if not. +/// +/// We do this check by spawning a new process and trying to sandbox it. To get as close as possible +/// to running the check in a worker, we try it... in a worker. The expected return status is 0 on +/// success and -1 on failure. +pub async fn check_seccomp( + #[cfg_attr(not(target_os = "linux"), allow(unused_variables))] + prepare_worker_program_path: &Path, +) -> bool { + cfg_if::cfg_if! { + if #[cfg(target_os = "linux")] { + match tokio::process::Command::new(prepare_worker_program_path) + .arg("--check-can-enable-seccomp") + .status() + .await + { + Ok(status) if status.success() => true, + Ok(status) => { + gum::warn!( + target: LOG_TARGET, + ?prepare_worker_program_path, + ?status, + "Cannot fully enable seccomp, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider upgrading the kernel version for maximum security." + ); + false + }, + Err(err) => { + gum::warn!( + target: LOG_TARGET, + ?prepare_worker_program_path, + "Could not start child process: {}", + err + ); + false + }, + } + } else { + gum::warn!( + target: LOG_TARGET, + "Cannot enable seccomp, a Linux-specific kernel security feature. Running validation of malicious PVF code has a higher risk of compromising this machine. Consider running on Linux with seccomp support for maximum security." + ); + false + } + } +} From 99cb3b2a0e654c959e596738c26491bcc60898e4 Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Fri, 27 Oct 2023 11:46:40 +0200 Subject: [PATCH 08/12] Make best-effort attempt to check audit logs for seccomp violations NOTE: Log, but don't change the outcome. Not all validators may have auditing enabled, so we don't want attackers to abuse a non-deterministic outcome. TESTING: Some manual testing where seccomp events were triggered confirmed that the logs are parsed correctly: Prepare worker: ``` Oct 27 09:15:42.725 WARN parachain::pvf: failed to recv a prepare response: Custom { kind: UnexpectedEof, error: "early eof" } worker_pid=2691819 Oct 27 09:15:42.726 DEBUG parachain::pvf: checking audit log for seccomp violations worker_pid=2691819 audit_log_path="/var/log/syslog" Oct 27 09:15:42.727 ERROR parachain::pvf: A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP! worker_pid=2691819 syscall=41 pvf=Pvf { code, code_hash: 0xdc4dc649bdebd4a529b6c3eb17da74b36afa9728eb7a0d834443510382490a35, executor_params: ExecutorParams([]), prep_timeout: 3s } ``` Execute worker: ``` Oct 27 09:17:28.006 WARN parachain::pvf: failed to recv an execute response worker_pid=2692712 validation_code_hash=0xdc4dc649bdebd4a529b6c3eb17da74b36afa9728eb7a0d834443510382490a35 error=Custom { kind: UnexpectedEof, error: "early eof" } Oct 27 09:17:28.006 DEBUG parachain::pvf: checking audit log for seccomp violations worker_pid=2692712 audit_log_path="/var/log/syslog" Oct 27 09:17:28.007 ERROR parachain::pvf: A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP! worker_pid=2692712 syscall=41 validation_code_hash=0xdc4dc649bdebd4a529b6c3eb17da74b36afa9728eb7a0d834443510382490a35 artifact_path="/tmp/.tmp4p6bN0/wasmtime_polkadot_v1.1.0_0xdc4dc649bdebd4a529b6c3eb17da74b36afa9728eb7a0d834443510382490a35_0x03170a2e7597b7b7e3d84c05391d139a62b157e78786d8c082f29dcf4c111314" ``` --- .../node/core/pvf/src/execute/worker_intf.rs | 21 ++- .../node/core/pvf/src/prepare/worker_intf.rs | 20 ++- polkadot/node/core/pvf/src/security.rs | 148 ++++++++++++++++++ polkadot/node/core/pvf/src/worker_intf.rs | 2 +- polkadot/node/core/pvf/tests/it/main.rs | 65 +++++++- 5 files changed, 251 insertions(+), 5 deletions(-) diff --git a/polkadot/node/core/pvf/src/execute/worker_intf.rs b/polkadot/node/core/pvf/src/execute/worker_intf.rs index 783c7c7abbc8..8bb09916c0f8 100644 --- a/polkadot/node/core/pvf/src/execute/worker_intf.rs +++ b/polkadot/node/core/pvf/src/execute/worker_intf.rs @@ -18,6 +18,7 @@ use crate::{ artifacts::ArtifactPathId, + security, worker_intf::{ clear_worker_dir_path, framed_recv, framed_send, spawn_with_program_path, IdleWorker, SpawnErr, WorkerDir, WorkerHandle, JOB_TIMEOUT_WALL_CLOCK_FACTOR, @@ -106,7 +107,7 @@ pub enum Outcome { /// returns the outcome. /// /// NOTE: Not returning the idle worker token in `Outcome` will trigger the child process being -/// killed. +/// killed, if it's still alive. pub async fn start_work( worker: IdleWorker, artifact: ArtifactPathId, @@ -124,7 +125,10 @@ pub async fn start_work( artifact.path.display(), ); + let artifact_path = artifact.path.clone(); with_worker_dir_setup(worker_dir, pid, &artifact.path, |worker_dir| async move { + let audit_log_file = security::AuditLogFile::try_open_and_seek_to_end().await; + if let Err(error) = send_request(&mut stream, &validation_params, execution_timeout).await { gum::warn!( target: LOG_TARGET, @@ -153,6 +157,21 @@ pub async fn start_work( ?error, "failed to recv an execute response", ); + // The worker died. Check if it was due to a seccomp violation. + if let Some(syscall) = security::check_seccomp_violation_for_worker(audit_log_file, pid).await { + // NOTE: Log, but don't change the outcome. Not all validators may have + // auditing enabled, so we don't want attackers to abuse a + // non-deterministic outcome. + gum::error!( + target: LOG_TARGET, + worker_pid = %pid, + %syscall, + validation_code_hash = ?artifact.id.code_hash, + ?artifact_path, + "A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!" + ); + } + return Outcome::IoErr }, Ok(response) => { diff --git a/polkadot/node/core/pvf/src/prepare/worker_intf.rs b/polkadot/node/core/pvf/src/prepare/worker_intf.rs index b66c36044343..9fbc7c81774f 100644 --- a/polkadot/node/core/pvf/src/prepare/worker_intf.rs +++ b/polkadot/node/core/pvf/src/prepare/worker_intf.rs @@ -18,6 +18,7 @@ use crate::{ metrics::Metrics, + security, worker_intf::{ clear_worker_dir_path, framed_recv, framed_send, spawn_with_program_path, IdleWorker, SpawnErr, WorkerDir, WorkerHandle, JOB_TIMEOUT_WALL_CLOCK_FACTOR, @@ -126,7 +127,9 @@ pub async fn start_work( pid, |tmp_artifact_file, mut stream, worker_dir| async move { let preparation_timeout = pvf.prep_timeout(); - if let Err(err) = send_request(&mut stream, pvf).await { + let audit_log_file = security::AuditLogFile::try_open_and_seek_to_end().await; + + if let Err(err) = send_request(&mut stream, pvf.clone()).await { gum::warn!( target: LOG_TARGET, worker_pid = %pid, @@ -169,6 +172,21 @@ pub async fn start_work( "failed to recv a prepare response: {:?}", err, ); + + // The worker died. Check if it was due to a seccomp violation. + if let Some(syscall) = security::check_seccomp_violation_for_worker(audit_log_file, pid).await { + // NOTE: Log, but don't change the outcome. Not all validators may have + // auditing enabled, so we don't want attackers to abuse a non-deterministic + // outcome. + gum::error!( + target: LOG_TARGET, + worker_pid = %pid, + %syscall, + ?pvf, + "A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!" + ); + } + Outcome::IoErr(err.to_string()) }, Err(_) => { diff --git a/polkadot/node/core/pvf/src/security.rs b/polkadot/node/core/pvf/src/security.rs index 4b9b1f077474..7656477deacd 100644 --- a/polkadot/node/core/pvf/src/security.rs +++ b/polkadot/node/core/pvf/src/security.rs @@ -16,6 +16,10 @@ use crate::LOG_TARGET; use std::path::Path; +use tokio::{ + fs::{File, OpenOptions}, + io::{AsyncReadExt, AsyncSeekExt, SeekFrom}, +}; /// Check if we can sandbox the root and emit a warning if not. /// @@ -162,3 +166,147 @@ pub async fn check_seccomp( } } } + +const AUDIT_LOG_PATH: &'static str = "/var/log/audit/audit.log"; +const SYSLOG_PATH: &'static str = "/var/log/syslog"; + +/// System audit log. +pub struct AuditLogFile { + file: File, + path: &'static str, +} + +impl AuditLogFile { + /// Looks for an audit log file on the system and opens it, seeking to the end to skip any + /// events from before this was called. + /// + /// A bit of a verbose name, but it should clue future refactorers not to move calls closer to + /// where the `AuditLogFile` is used. + pub async fn try_open_and_seek_to_end() -> Option { + let mut path = AUDIT_LOG_PATH; + let mut file = match OpenOptions::new().read(true).open(AUDIT_LOG_PATH).await { + Ok(file) => Ok(file), + Err(_) => { + path = SYSLOG_PATH; + OpenOptions::new().read(true).open(SYSLOG_PATH).await + }, + } + .ok()?; + + let _pos = file.seek(SeekFrom::End(0)).await; + + Some(Self { file, path }) + } + + async fn read_new_since_open(mut self) -> String { + let mut buf = String::new(); + let _len = self.file.read_to_string(&mut buf).await; + buf + } +} + +/// Check if a seccomp violation occurred for the given worker. As the syslog may be in a different +/// location, or seccomp auditing may be disabled, this function provides a best-effort attempt +/// only. +/// +/// The `audit_log_file` must have been obtained before the job started. It only allows reading +/// entries that were written since it was obtained, so that we do not consider events from previous +/// processes with the same pid. This can still be racy, but it's unlikely and fine for a +/// best-effort attempt. +pub async fn check_seccomp_violation_for_worker( + audit_log_file: Option, + worker_pid: u32, +) -> Option { + let audit_event_pid_field = format!("pid={worker_pid}"); + + let audit_log_file = match audit_log_file { + Some(file) => { + gum::debug!( + target: LOG_TARGET, + %worker_pid, + audit_log_path = ?file.path, + "checking audit log for seccomp violations", + ); + file + }, + None => { + gum::warn!( + target: LOG_TARGET, + %worker_pid, + "could not open either {AUDIT_LOG_PATH} or {SYSLOG_PATH} for reading audit logs" + ); + return None + }, + }; + let events = audit_log_file.read_new_since_open().await; + + for event in events.lines() { + let res = parse_audit_log_for_seccomp_event(event, &audit_event_pid_field); + if res.is_some() { + return res + } + } + + None +} + +fn parse_audit_log_for_seccomp_event(event: &str, audit_event_pid_field: &str) -> Option { + const SECCOMP_AUDIT_EVENT_TYPE: &'static str = "type=1326"; + + // Do a series of simple .contains instead of a regex, because I'm not sure if the fields are + // guaranteed to always be in the same order. + if !event.contains(SECCOMP_AUDIT_EVENT_TYPE) || !event.contains(&audit_event_pid_field) { + return None + } + + // Get the syscall. Let's avoid a dependency on regex just for this. + for field in event.split(" ") { + if let Some(syscall) = field.strip_prefix("syscall=") { + return syscall.parse::().ok() + } + } + + None +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_audit_log_for_seccomp_event() { + let audit_event_pid_field = "pid=2559058"; + + assert_eq!( + parse_audit_log_for_seccomp_event( + r#"Oct 24 13:15:24 build kernel: [5883980.283910] audit: type=1326 audit(1698153324.786:23): auid=0 uid=0 gid=0 ses=2162 subj=unconfined pid=2559058 comm="polkadot-prepar" exe="/root/paritytech/polkadot-sdk-2/target/debug/polkadot-prepare-worker" sig=31 arch=c000003e syscall=53 compat=0 ip=0x7f7542c80d5e code=0x80000000"#, + audit_event_pid_field + ), + Some(53) + ); + // pid is wrong + assert_eq!( + parse_audit_log_for_seccomp_event( + r#"Oct 24 13:15:24 build kernel: [5883980.283910] audit: type=1326 audit(1698153324.786:23): auid=0 uid=0 gid=0 ses=2162 subj=unconfined pid=2559057 comm="polkadot-prepar" exe="/root/paritytech/polkadot-sdk-2/target/debug/polkadot-prepare-worker" sig=31 arch=c000003e syscall=53 compat=0 ip=0x7f7542c80d5e code=0x80000000"#, + audit_event_pid_field + ), + None + ); + // type is wrong + assert_eq!( + parse_audit_log_for_seccomp_event( + r#"Oct 24 13:15:24 build kernel: [5883980.283910] audit: type=1327 audit(1698153324.786:23): auid=0 uid=0 gid=0 ses=2162 subj=unconfined pid=2559057 comm="polkadot-prepar" exe="/root/paritytech/polkadot-sdk-2/target/debug/polkadot-prepare-worker" sig=31 arch=c000003e syscall=53 compat=0 ip=0x7f7542c80d5e code=0x80000000"#, + audit_event_pid_field + ), + None + ); + // no syscall field + assert_eq!( + parse_audit_log_for_seccomp_event( + r#"Oct 24 13:15:24 build kernel: [5883980.283910] audit: type=1327 audit(1698153324.786:23): auid=0 uid=0 gid=0 ses=2162 subj=unconfined pid=2559057 comm="polkadot-prepar" exe="/root/paritytech/polkadot-sdk-2/target/debug/polkadot-prepare-worker" sig=31 arch=c000003e compat=0 ip=0x7f7542c80d5e code=0x80000000"#, + audit_event_pid_field + ), + None + ); + } +} diff --git a/polkadot/node/core/pvf/src/worker_intf.rs b/polkadot/node/core/pvf/src/worker_intf.rs index 39096cad7d64..8f9a7de354b8 100644 --- a/polkadot/node/core/pvf/src/worker_intf.rs +++ b/polkadot/node/core/pvf/src/worker_intf.rs @@ -245,7 +245,7 @@ pub enum SpawnErr { /// has been terminated. Since the worker is running in another process it is obviously not /// necessary to poll this future to make the worker run, it's only for termination detection. /// -/// This future relies on the fact that a child process's stdout `fd` is closed upon it's +/// This future relies on the fact that a child process's stdout `fd` is closed upon its /// termination. #[pin_project] pub struct WorkerHandle { diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index b4692ea05371..f3deec5992c0 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -14,7 +14,6 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . -#[cfg(feature = "ci-only-tests")] use assert_matches::assert_matches; use parity_scale_codec::Encode as _; use polkadot_node_core_pvf::{ @@ -28,7 +27,7 @@ use polkadot_primitives::ExecutorParams; #[cfg(feature = "ci-only-tests")] use polkadot_primitives::ExecutorParam; -use std::time::Duration; +use std::{process::Command, time::Duration}; use tokio::sync::Mutex; mod adder; @@ -153,6 +152,68 @@ async fn terminates_on_timeout() { assert!(duration < TEST_EXECUTION_TIMEOUT * JOB_TIMEOUT_WALL_CLOCK_FACTOR); } +// What happens when the prepare worker dies in the middle of a job? +// +// To avoid interfering with other running tests or processes, this test is ignored and must be run +// by itself: +// $ cargo test prepare_worker_killed_during_job -- --include-ignored +#[ignore] +#[tokio::test] +async fn prepare_worker_killed_during_job() { + let host = TestHost::new().await; + + let (result, _) = futures::join!( + // Choose a job that would normally take the entire timeout. + host.precheck_pvf(rococo_runtime::WASM_BINARY.unwrap(), Default::default()), + // Run a future that kills the job in the middle of the timeout. + async { + tokio::time::sleep(TEST_PREPARATION_TIMEOUT / 2).await; + Command::new("killall").args(["-9", "polkadot-prepare-worker"]).spawn().unwrap(); + } + ); + + assert_matches!(result, Err(PrepareError::IoErr(_))); +} + +// What happens when the execute worker dies in the middle of a job? +// +// To avoid interfering with other running tests or processes, this test is ignored and must be run +// by itself: +// $ cargo test execute_worker_killed_during_job -- --include-ignored +#[ignore] +#[tokio::test] +async fn execute_worker_killed_during_job() { + let host = TestHost::new().await; + + // Prepare the artifact ahead of time. + let binary = halt::wasm_binary_unwrap(); + host.precheck_pvf(binary, Default::default()).await.unwrap(); + + let (result, _) = futures::join!( + // Choose an job that would normally take the entire timeout. + host.validate_candidate( + binary, + ValidationParams { + block_data: BlockData(Vec::new()), + parent_head: Default::default(), + relay_parent_number: 1, + relay_parent_storage_root: Default::default(), + }, + Default::default(), + ), + // Run a future that kills the job in the middle of the timeout. + async { + tokio::time::sleep(TEST_EXECUTION_TIMEOUT / 2).await; + Command::new("killall").args(["-9", "polkadot-execute-worker"]).spawn().unwrap(); + } + ); + + assert_matches!( + result, + Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath)) + ); +} + #[cfg(feature = "ci-only-tests")] #[tokio::test] async fn ensure_parallel_execution() { From 38012ec8dc5125da3d56c36213dc222c0025834c Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Fri, 27 Oct 2023 11:59:27 +0200 Subject: [PATCH 09/12] Update doc --- .../node/core/pvf/common/src/worker/security/seccomp.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs b/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs index de719d93e31d..c3822d3c4c69 100644 --- a/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs +++ b/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs @@ -65,9 +65,11 @@ //! Considering how many things would have to go wrong here, we believe it's safe to block //! `io_uring`. //! -//! # Action on caught syscall +//! # Action on syscall violations //! -//! TODO +//! When a forbidden syscall is attempted we immediately kill the process in order to prevent the +//! attacker from doing anything else. In execution, this will result in voting against the +//! candidate. use crate::{ worker::{stringify_panic_payload, WorkerKind}, From c9b0aaae80936a4fe557f0efbf40da203b0da8f6 Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Mon, 30 Oct 2023 10:42:35 +0100 Subject: [PATCH 10/12] Log violations, don't kill process yet --- .../pvf/common/src/worker/security/seccomp.rs | 10 +++++--- .../node/core/pvf/src/execute/worker_intf.rs | 22 +++++++++++++--- .../node/core/pvf/src/prepare/worker_intf.rs | 25 ++++++++++++++----- polkadot/node/core/pvf/src/security.rs | 14 +++++------ 4 files changed, 50 insertions(+), 21 deletions(-) diff --git a/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs b/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs index c3822d3c4c69..5539ad284400 100644 --- a/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs +++ b/polkadot/node/core/pvf/common/src/worker/security/seccomp.rs @@ -67,9 +67,11 @@ //! //! # Action on syscall violations //! -//! When a forbidden syscall is attempted we immediately kill the process in order to prevent the -//! attacker from doing anything else. In execution, this will result in voting against the -//! candidate. +//! On syscall violations we currently only log, to make sure this works correctly before enforcing. +//! +//! In the future, when a forbidden syscall is attempted we immediately kill the process in order to +//! prevent the attacker from doing anything else. In execution, this will result in voting against +//! the candidate. use crate::{ worker::{stringify_panic_payload, WorkerKind}, @@ -80,7 +82,7 @@ use std::{collections::BTreeMap, path::Path}; /// The action to take on caught syscalls. #[cfg(not(test))] -const CAUGHT_ACTION: SeccompAction = SeccompAction::KillProcess; +const CAUGHT_ACTION: SeccompAction = SeccompAction::Log; /// Don't kill the process when testing. #[cfg(test)] const CAUGHT_ACTION: SeccompAction = SeccompAction::Errno(libc::EACCES as u32); diff --git a/polkadot/node/core/pvf/src/execute/worker_intf.rs b/polkadot/node/core/pvf/src/execute/worker_intf.rs index 8bb09916c0f8..61264f7d517d 100644 --- a/polkadot/node/core/pvf/src/execute/worker_intf.rs +++ b/polkadot/node/core/pvf/src/execute/worker_intf.rs @@ -158,10 +158,11 @@ pub async fn start_work( "failed to recv an execute response", ); // The worker died. Check if it was due to a seccomp violation. - if let Some(syscall) = security::check_seccomp_violation_for_worker(audit_log_file, pid).await { - // NOTE: Log, but don't change the outcome. Not all validators may have - // auditing enabled, so we don't want attackers to abuse a - // non-deterministic outcome. + // + // NOTE: Log, but don't change the outcome. Not all validators may have + // auditing enabled, so we don't want attackers to abuse a non-deterministic + // outcome. + for syscall in security::check_seccomp_violations_for_worker(audit_log_file, pid).await { gum::error!( target: LOG_TARGET, worker_pid = %pid, @@ -175,6 +176,19 @@ pub async fn start_work( return Outcome::IoErr }, Ok(response) => { + // Check if any syscall violations occurred during the job. For now this is + // only informative, as we are not enforcing the seccomp policy yet. + for syscall in security::check_seccomp_violations_for_worker(audit_log_file, pid).await { + gum::error!( + target: LOG_TARGET, + worker_pid = %pid, + %syscall, + validation_code_hash = ?artifact.id.code_hash, + ?artifact_path, + "A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!" + ); + } + if let Response::Ok{duration, ..} = response { if duration > execution_timeout { // The job didn't complete within the timeout. diff --git a/polkadot/node/core/pvf/src/prepare/worker_intf.rs b/polkadot/node/core/pvf/src/prepare/worker_intf.rs index 9fbc7c81774f..1f6ab0b76556 100644 --- a/polkadot/node/core/pvf/src/prepare/worker_intf.rs +++ b/polkadot/node/core/pvf/src/prepare/worker_intf.rs @@ -153,7 +153,19 @@ pub async fn start_work( match result { // Received bytes from worker within the time limit. - Ok(Ok(prepare_result)) => + Ok(Ok(prepare_result)) => { + // Check if any syscall violations occurred during the job. For now this is only + // informative, as we are not enforcing the seccomp policy yet. + for syscall in security::check_seccomp_violations_for_worker(audit_log_file, pid).await { + gum::error!( + target: LOG_TARGET, + worker_pid = %pid, + %syscall, + ?pvf, + "A forbidden syscall was attempted! This is a violation of our seccomp security policy. Report an issue ASAP!" + ); + } + handle_response( metrics, IdleWorker { stream, pid, worker_dir }, @@ -163,7 +175,8 @@ pub async fn start_work( artifact_path, preparation_timeout, ) - .await, + .await + }, Ok(Err(err)) => { // Communication error within the time limit. gum::warn!( @@ -174,10 +187,10 @@ pub async fn start_work( ); // The worker died. Check if it was due to a seccomp violation. - if let Some(syscall) = security::check_seccomp_violation_for_worker(audit_log_file, pid).await { - // NOTE: Log, but don't change the outcome. Not all validators may have - // auditing enabled, so we don't want attackers to abuse a non-deterministic - // outcome. + // + // NOTE: Log, but don't change the outcome. Not all validators may have auditing + // enabled, so we don't want attackers to abuse a non-deterministic outcome. + for syscall in security::check_seccomp_violations_for_worker(audit_log_file, pid).await { gum::error!( target: LOG_TARGET, worker_pid = %pid, diff --git a/polkadot/node/core/pvf/src/security.rs b/polkadot/node/core/pvf/src/security.rs index 7656477deacd..decd321e415e 100644 --- a/polkadot/node/core/pvf/src/security.rs +++ b/polkadot/node/core/pvf/src/security.rs @@ -213,10 +213,10 @@ impl AuditLogFile { /// entries that were written since it was obtained, so that we do not consider events from previous /// processes with the same pid. This can still be racy, but it's unlikely and fine for a /// best-effort attempt. -pub async fn check_seccomp_violation_for_worker( +pub async fn check_seccomp_violations_for_worker( audit_log_file: Option, worker_pid: u32, -) -> Option { +) -> Vec { let audit_event_pid_field = format!("pid={worker_pid}"); let audit_log_file = match audit_log_file { @@ -235,19 +235,19 @@ pub async fn check_seccomp_violation_for_worker( %worker_pid, "could not open either {AUDIT_LOG_PATH} or {SYSLOG_PATH} for reading audit logs" ); - return None + return vec![] }, }; let events = audit_log_file.read_new_since_open().await; + let mut violations = vec![]; for event in events.lines() { - let res = parse_audit_log_for_seccomp_event(event, &audit_event_pid_field); - if res.is_some() { - return res + if let Some(syscall) = parse_audit_log_for_seccomp_event(event, &audit_event_pid_field) { + violations.push(syscall); } } - None + violations } fn parse_audit_log_for_seccomp_event(event: &str, audit_event_pid_field: &str) -> Option { From 54178182b900c66fc7b7da2f5f80f09ea810383f Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Mon, 30 Oct 2023 12:55:56 +0100 Subject: [PATCH 11/12] Un-ignore tests Use a combination of rusty-fork (separate processes in rust tests) and new sessions to safely kill child workers in tests. --- Cargo.lock | 49 ++++++-- polkadot/node/core/pvf/Cargo.toml | 5 + polkadot/node/core/pvf/src/host.rs | 4 + polkadot/node/core/pvf/tests/it/main.rs | 156 +++++++++++++++--------- 4 files changed, 147 insertions(+), 67 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 735d2bf08df4..117c00abe9ff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5614,7 +5614,7 @@ version = "0.6.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2eeb4ed9e12f43b7fa0baae3f9cdda28352770132ef2e09a23760c29cae8bd47" dependencies = [ - "rustix 0.38.8", + "rustix 0.38.21", "windows-sys 0.48.0", ] @@ -6568,7 +6568,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b" dependencies = [ "hermit-abi 0.3.2", - "rustix 0.38.8", + "rustix 0.38.21", "windows-sys 0.48.0", ] @@ -7021,9 +7021,9 @@ checksum = "884e2677b40cc8c339eaefcb701c32ef1fd2493d71118dc0ca4b6a736c93bd67" [[package]] name = "libc" -version = "0.2.147" +version = "0.2.149" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" +checksum = "a08173bc88b7955d1b3145aa561539096c421ac8debde8cbc3612ec635fee29b" [[package]] name = "libflate" @@ -7607,9 +7607,9 @@ checksum = "ef53942eb7bf7ff43a617b3e2c1c4a5ecf5944a7c1bc12d7ee39bbb15e5c1519" [[package]] name = "linux-raw-sys" -version = "0.4.5" +version = "0.4.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "57bcfdad1b858c2db7c38303a6d2ad4dfaf5eb53dfeb0910128b2c26d6158503" +checksum = "da2479e8c062e40bf0066ffa0bc823de0a9368974af99c9f6df941d2c231e03f" [[package]] name = "lioness" @@ -12201,8 +12201,10 @@ dependencies = [ "polkadot-node-primitives", "polkadot-parachain-primitives", "polkadot-primitives", + "procfs", "rand 0.8.5", "rococo-runtime", + "rusty-fork", "slotmap", "sp-core", "sp-maybe-compressed-blob", @@ -13446,6 +13448,32 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "procfs" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "731e0d9356b0c25f16f33b5be79b1c57b562f141ebfcdb0ad8ac2c13a24293b4" +dependencies = [ + "bitflags 2.4.0", + "chrono", + "flate2", + "hex", + "lazy_static", + "procfs-core", + "rustix 0.38.21", +] + +[[package]] +name = "procfs-core" +version = "0.16.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2d3554923a69f4ce04c4a754260c338f505ce22642d3830e049a399fc2059a29" +dependencies = [ + "bitflags 2.4.0", + "chrono", + "hex", +] + [[package]] name = "prometheus" version = "0.13.3" @@ -14373,14 +14401,14 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.8" +version = "0.38.21" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "19ed4fa021d81c8392ce04db050a3da9a60299050b7ae1cf482d862b54a7218f" +checksum = "2b426b0506e5d50a7d8dafcf2e81471400deb602392c7dd110815afb4eaf02a3" dependencies = [ "bitflags 2.4.0", "errno", "libc", - "linux-raw-sys 0.4.5", + "linux-raw-sys 0.4.10", "windows-sys 0.48.0", ] @@ -14477,6 +14505,7 @@ dependencies = [ "fnv", "quick-error", "tempfile", + "wait-timeout", ] [[package]] @@ -18333,7 +18362,7 @@ dependencies = [ "cfg-if", "fastrand 2.0.0", "redox_syscall 0.3.5", - "rustix 0.38.8", + "rustix 0.38.21", "windows-sys 0.48.0", ] diff --git a/polkadot/node/core/pvf/Cargo.toml b/polkadot/node/core/pvf/Cargo.toml index bfd70c6fbd41..430f7cd5e8ef 100644 --- a/polkadot/node/core/pvf/Cargo.toml +++ b/polkadot/node/core/pvf/Cargo.toml @@ -39,6 +39,7 @@ polkadot-node-core-pvf-execute-worker = { path = "execute-worker", optional = tr assert_matches = "1.4.0" criterion = { version = "0.4.0", default-features = false, features = ["cargo_bench_support", "async_tokio"] } hex-literal = "0.4.1" + polkadot-node-core-pvf-common = { path = "common", features = ["test-utils"] } # For benches and integration tests, depend on ourselves with the test-utils # feature. @@ -48,6 +49,10 @@ rococo-runtime = { path = "../../../runtime/rococo" } adder = { package = "test-parachain-adder", path = "../../../parachain/test-parachains/adder" } halt = { package = "test-parachain-halt", path = "../../../parachain/test-parachains/halt" } +[target.'cfg(target_os = "linux")'.dev-dependencies] +procfs = "0.16.0" +rusty-fork = "0.3.0" + [[bench]] name = "host_prepare_rococo_runtime" harness = false diff --git a/polkadot/node/core/pvf/src/host.rs b/polkadot/node/core/pvf/src/host.rs index 450be69f5c58..dd0bd8581985 100644 --- a/polkadot/node/core/pvf/src/host.rs +++ b/polkadot/node/core/pvf/src/host.rs @@ -153,6 +153,7 @@ pub struct Config { pub cache_path: PathBuf, /// The version of the node. `None` can be passed to skip the version check (only for tests). pub node_version: Option, + /// The path to the program that can be used to spawn the prepare workers. pub prepare_worker_program_path: PathBuf, /// The time allotted for a prepare worker to spawn and report to the host. @@ -162,6 +163,7 @@ pub struct Config { pub prepare_workers_soft_max_num: usize, /// The absolute number of workers that can be spawned in the prepare pool. pub prepare_workers_hard_max_num: usize, + /// The path to the program that can be used to spawn the execute workers. pub execute_worker_program_path: PathBuf, /// The time allotted for an execute worker to spawn and report to the host. @@ -181,10 +183,12 @@ impl Config { Self { cache_path, node_version, + prepare_worker_program_path, prepare_worker_spawn_timeout: Duration::from_secs(3), prepare_workers_soft_max_num: 1, prepare_workers_hard_max_num: 1, + execute_worker_program_path, execute_worker_spawn_timeout: Duration::from_secs(3), execute_workers_max_num: 2, diff --git a/polkadot/node/core/pvf/tests/it/main.rs b/polkadot/node/core/pvf/tests/it/main.rs index f3deec5992c0..a69a488adb98 100644 --- a/polkadot/node/core/pvf/tests/it/main.rs +++ b/polkadot/node/core/pvf/tests/it/main.rs @@ -23,11 +23,13 @@ use polkadot_node_core_pvf::{ }; use polkadot_parachain_primitives::primitives::{BlockData, ValidationParams, ValidationResult}; use polkadot_primitives::ExecutorParams; +#[cfg(target_os = "linux")] +use rusty_fork::rusty_fork_test; #[cfg(feature = "ci-only-tests")] use polkadot_primitives::ExecutorParam; -use std::{process::Command, time::Duration}; +use std::time::Duration; use tokio::sync::Mutex; mod adder; @@ -152,66 +154,106 @@ async fn terminates_on_timeout() { assert!(duration < TEST_EXECUTION_TIMEOUT * JOB_TIMEOUT_WALL_CLOCK_FACTOR); } -// What happens when the prepare worker dies in the middle of a job? -// -// To avoid interfering with other running tests or processes, this test is ignored and must be run -// by itself: -// $ cargo test prepare_worker_killed_during_job -- --include-ignored -#[ignore] -#[tokio::test] -async fn prepare_worker_killed_during_job() { - let host = TestHost::new().await; - - let (result, _) = futures::join!( - // Choose a job that would normally take the entire timeout. - host.precheck_pvf(rococo_runtime::WASM_BINARY.unwrap(), Default::default()), - // Run a future that kills the job in the middle of the timeout. - async { - tokio::time::sleep(TEST_PREPARATION_TIMEOUT / 2).await; - Command::new("killall").args(["-9", "polkadot-prepare-worker"]).spawn().unwrap(); +#[cfg(target_os = "linux")] +fn kill_by_sid_and_name(sid: i32, exe_name: &'static str) { + use procfs::process; + + let all_processes: Vec = process::all_processes() + .expect("Can't read /proc") + .filter_map(|p| match p { + Ok(p) => Some(p), // happy path + Err(e) => match e { + // process vanished during iteration, ignore it + procfs::ProcError::NotFound(_) => None, + x => { + panic!("some unknown error: {}", x); + }, + }, + }) + .collect(); + + for process in all_processes { + if process.stat().unwrap().session == sid && + process.exe().unwrap().to_str().unwrap().contains(exe_name) + { + assert_eq!(unsafe { libc::kill(process.pid(), 9) }, 0); } - ); - - assert_matches!(result, Err(PrepareError::IoErr(_))); + } } -// What happens when the execute worker dies in the middle of a job? -// -// To avoid interfering with other running tests or processes, this test is ignored and must be run -// by itself: -// $ cargo test execute_worker_killed_during_job -- --include-ignored -#[ignore] -#[tokio::test] -async fn execute_worker_killed_during_job() { - let host = TestHost::new().await; - - // Prepare the artifact ahead of time. - let binary = halt::wasm_binary_unwrap(); - host.precheck_pvf(binary, Default::default()).await.unwrap(); - - let (result, _) = futures::join!( - // Choose an job that would normally take the entire timeout. - host.validate_candidate( - binary, - ValidationParams { - block_data: BlockData(Vec::new()), - parent_head: Default::default(), - relay_parent_number: 1, - relay_parent_storage_root: Default::default(), - }, - Default::default(), - ), - // Run a future that kills the job in the middle of the timeout. - async { - tokio::time::sleep(TEST_EXECUTION_TIMEOUT / 2).await; - Command::new("killall").args(["-9", "polkadot-execute-worker"]).spawn().unwrap(); - } - ); +// Run these tests in their own processes with rusty-fork. They work by each creating a new session, +// then killing the worker process that matches the session ID and expected worker name. +#[cfg(target_os = "linux")] +rusty_fork_test! { + // What happens when the prepare worker dies in the middle of a job? + #[test] + fn prepare_worker_killed_during_job() { + const PROCESS_NAME: &'static str = "polkadot-prepare-worker"; + + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let host = TestHost::new().await; + + // Create a new session and get the session ID. + let sid = unsafe { libc::setsid() }; + assert!(sid > 0); + + let (result, _) = futures::join!( + // Choose a job that would normally take the entire timeout. + host.precheck_pvf(rococo_runtime::WASM_BINARY.unwrap(), Default::default()), + // Run a future that kills the job in the middle of the timeout. + async { + tokio::time::sleep(TEST_PREPARATION_TIMEOUT / 2).await; + kill_by_sid_and_name(sid, PROCESS_NAME); + } + ); + + assert_matches!(result, Err(PrepareError::IoErr(_))); + }) + } - assert_matches!( - result, - Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath)) - ); + // What happens when the execute worker dies in the middle of a job? + #[test] + fn execute_worker_killed_during_job() { + const PROCESS_NAME: &'static str = "polkadot-execute-worker"; + + let rt = tokio::runtime::Runtime::new().unwrap(); + rt.block_on(async { + let host = TestHost::new().await; + + // Create a new session and get the session ID. + let sid = unsafe { libc::setsid() }; + assert!(sid > 0); + + // Prepare the artifact ahead of time. + let binary = halt::wasm_binary_unwrap(); + host.precheck_pvf(binary, Default::default()).await.unwrap(); + + let (result, _) = futures::join!( + // Choose an job that would normally take the entire timeout. + host.validate_candidate( + binary, + ValidationParams { + block_data: BlockData(Vec::new()), + parent_head: Default::default(), + relay_parent_number: 1, + relay_parent_storage_root: Default::default(), + }, + Default::default(), + ), + // Run a future that kills the job in the middle of the timeout. + async { + tokio::time::sleep(TEST_EXECUTION_TIMEOUT / 2).await; + kill_by_sid_and_name(sid, PROCESS_NAME); + } + ); + + assert_matches!( + result, + Err(ValidationError::InvalidCandidate(InvalidCandidate::AmbiguousWorkerDeath)) + ); + }) + } } #[cfg(feature = "ci-only-tests")] From e89873ccaee89390421bc0c0d12203282413788d Mon Sep 17 00:00:00 2001 From: "Marcin S." Date: Tue, 31 Oct 2023 09:18:25 +0100 Subject: [PATCH 12/12] Comment out section in impl guide --- .../src/node/utility/pvf-host-and-workers.md | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md b/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md index 41ee02028b0d..0cefeb1f77ca 100644 --- a/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md +++ b/polkadot/roadmap/implementers-guide/src/node/utility/pvf-host-and-workers.md @@ -128,14 +128,16 @@ data on the host machine. *Currently this is only supported on Linux.* -### Restricting networking + -We also disable networking on PVF threads by disabling certain syscalls, such as -the creation of sockets. This prevents attackers from either downloading -payloads or communicating sensitive data from the validator's machine to the -outside world. + -*Currently this is only supported on Linux.* + + + + + + ### Clearing env vars