Skip to content

Commit

Permalink
runtime: do fewer syscalls in remap_append_vec_file (solana-labs#336)
Browse files Browse the repository at this point in the history
* runtime: do fewer syscalls in remap_append_vec_file

Use renameat2(src, dest, NOREPLACE) as an atomic version of if
statx(dest).is_err() { rename(src, dest) }.

We have high inode contention during storage rebuild and this saves 1
fs syscall for each appendvec.

* Address review feedback
  • Loading branch information
alessandrod authored Mar 21, 2024
1 parent 11aa06d commit 9818815
Show file tree
Hide file tree
Showing 5 changed files with 137 additions and 18 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions programs/sbf/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions runtime/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ im = { workspace = true, features = ["rayon", "serde"] }
index_list = { workspace = true }
itertools = { workspace = true }
lazy_static = { workspace = true }
libc = { workspace = true }
log = { workspace = true }
lru = { workspace = true }
lz4 = { workspace = true }
Expand Down
89 changes: 74 additions & 15 deletions runtime/src/serde_snapshot.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#[cfg(target_os = "linux")]
use std::ffi::{CStr, CString};
use {
crate::{
bank::{builtins::BuiltinPrototype, Bank, BankFieldsToDeserialize, BankRc},
Expand Down Expand Up @@ -655,38 +657,66 @@ pub(crate) fn reconstruct_single_storage(
)))
}

fn remap_append_vec_file(
// Remap the AppendVec ID to handle any duplicate IDs that may previously existed
// due to full snapshots and incremental snapshots generated from different
// nodes
pub(crate) fn remap_append_vec_file(
slot: Slot,
old_append_vec_id: SerializedAppendVecId,
append_vec_path: &Path,
next_append_vec_id: &AtomicAppendVecId,
num_collisions: &AtomicUsize,
) -> io::Result<(AppendVecId, PathBuf)> {
// Remap the AppendVec ID to handle any duplicate IDs that may previously existed
// due to full snapshots and incremental snapshots generated from different nodes
#[cfg(target_os = "linux")]
let append_vec_path_cstr = cstring_from_path(append_vec_path)?;

let mut remapped_append_vec_path = append_vec_path.to_path_buf();

// Break out of the loop in the following situations:
// 1. The new ID is the same as the original ID. This means we do not need to
// rename the file, since the ID is the "correct" one already.
// 2. There is not a file already at the new path. This means it is safe to
// rename the file to this new path.
let (remapped_append_vec_id, remapped_append_vec_path) = loop {
let remapped_append_vec_id = next_append_vec_id.fetch_add(1, Ordering::AcqRel);

// this can only happen in the first iteration of the loop
if old_append_vec_id == remapped_append_vec_id as SerializedAppendVecId {
break (remapped_append_vec_id, remapped_append_vec_path);
}

let remapped_file_name = AccountsFile::file_name(slot, remapped_append_vec_id);
let remapped_append_vec_path = append_vec_path.parent().unwrap().join(remapped_file_name);

// Break out of the loop in the following situations:
// 1. The new ID is the same as the original ID. This means we do not need to
// rename the file, since the ID is the "correct" one already.
// 2. There is not a file already at the new path. This means it is safe to
// rename the file to this new path.
// **DEVELOPER NOTE:** Keep this check last so that it can short-circuit if
// possible.
if old_append_vec_id == remapped_append_vec_id as SerializedAppendVecId
|| std::fs::metadata(&remapped_append_vec_path).is_err()
remapped_append_vec_path = append_vec_path.parent().unwrap().join(remapped_file_name);

#[cfg(target_os = "linux")]
{
let remapped_append_vec_path_cstr = cstring_from_path(&remapped_append_vec_path)?;

// On linux we use renameat2(NO_REPLACE) instead of IF metadata(path).is_err() THEN
// rename() in order to save a statx() syscall.
match rename_no_replace(&append_vec_path_cstr, &remapped_append_vec_path_cstr) {
// If the file was successfully renamed, break out of the loop
Ok(_) => break (remapped_append_vec_id, remapped_append_vec_path),
// If there's already a file at the new path, continue so we try
// the next ID
Err(e) if e.kind() == io::ErrorKind::AlreadyExists => {}
Err(e) => return Err(e),
}
}

#[cfg(not(target_os = "linux"))]
if std::fs::metadata(&remapped_append_vec_path).is_err() {
break (remapped_append_vec_id, remapped_append_vec_path);
}

// If we made it this far, a file exists at the new path. Record the collision
// and try again.
num_collisions.fetch_add(1, Ordering::Relaxed);
};
// Only rename the file if the new ID is actually different from the original.

// Only rename the file if the new ID is actually different from the original. In the target_os
// = linux case, we have already renamed if necessary.
#[cfg(not(target_os = "linux"))]
if old_append_vec_id != remapped_append_vec_id as SerializedAppendVecId {
std::fs::rename(append_vec_path, &remapped_append_vec_path)?;
}
Expand Down Expand Up @@ -953,3 +983,32 @@ where
ReconstructedAccountsDbInfo { accounts_data_len },
))
}

// Rename `src` to `dest` only if `dest` doesn't already exist.
#[cfg(target_os = "linux")]
fn rename_no_replace(src: &CStr, dest: &CStr) -> io::Result<()> {
let ret = unsafe {
libc::renameat2(
libc::AT_FDCWD,
src.as_ptr() as *const _,
libc::AT_FDCWD,
dest.as_ptr() as *const _,
libc::RENAME_NOREPLACE,
)
};
if ret == -1 {
return Err(io::Error::last_os_error());
}

Ok(())
}

#[cfg(target_os = "linux")]
fn cstring_from_path(path: &Path) -> io::Result<CString> {
// It is better to allocate here than use the stack. Jemalloc is going to give us a chunk of a
// preallocated small arena anyway. Instead if we used the stack since PATH_MAX=4096 it would
// result in LLVM inserting a stack probe, see
// https://docs.rs/compiler_builtins/latest/compiler_builtins/probestack/index.html.
CString::new(path.as_os_str().as_encoded_bytes())
.map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))
}
63 changes: 60 additions & 3 deletions runtime/src/serde_snapshot/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ mod serde_snapshot_tests {
use {
crate::{
serde_snapshot::{
newer, reconstruct_accountsdb_from_fields, SerdeStyle, SerializableAccountsDb,
SnapshotAccountsDbFields, TypeContext,
newer, reconstruct_accountsdb_from_fields, remap_append_vec_file, SerdeStyle,
SerializableAccountsDb, SnapshotAccountsDbFields, TypeContext,
},
snapshot_utils::{get_storages_to_serialize, StorageAndNextAppendVecId},
},
Expand Down Expand Up @@ -34,12 +34,17 @@ mod serde_snapshot_tests {
rent_collector::RentCollector,
},
std::{
fs::File,
io::{BufReader, Cursor, Read, Write},
ops::RangeFull,
path::{Path, PathBuf},
sync::{atomic::Ordering, Arc},
sync::{
atomic::{AtomicUsize, Ordering},
Arc,
},
},
tempfile::TempDir,
test_case::test_case,
};

fn linear_ancestors(end_slot: u64) -> Ancestors {
Expand Down Expand Up @@ -845,4 +850,56 @@ mod serde_snapshot_tests {
);
}
}

// no remap needed
#[test_case(456, 456, 456, 0, |_| {})]
// remap from 456 to 457, no collisions
#[test_case(456, 457, 457, 0, |_| {})]
// attempt to remap from 456 to 457, but there's a collision, so we get 458
#[test_case(456, 457, 458, 1, |tmp| {
File::create(tmp.join("123.457")).unwrap();
})]
fn test_remap_append_vec_file(
old_id: usize,
next_id: usize,
expected_remapped_id: usize,
expected_collisions: usize,
become_ungovernable: impl FnOnce(&Path),
) {
let tmp = tempfile::tempdir().unwrap();
let old_path = tmp.path().join(format!("123.{old_id}"));
let expected_remapped_path = tmp.path().join(format!("123.{expected_remapped_id}"));
File::create(&old_path).unwrap();

become_ungovernable(tmp.path());

let next_append_vec_id = AtomicAppendVecId::new(next_id as u32);
let num_collisions = AtomicUsize::new(0);
let (remapped_id, remapped_path) =
remap_append_vec_file(123, old_id, &old_path, &next_append_vec_id, &num_collisions)
.unwrap();
assert_eq!(remapped_id as usize, expected_remapped_id);
assert_eq!(&remapped_path, &expected_remapped_path);
assert_eq!(num_collisions.load(Ordering::Relaxed), expected_collisions);
}

#[test]
#[should_panic(expected = "No such file or directory")]
fn test_remap_append_vec_file_error() {
let tmp = tempfile::tempdir().unwrap();
let original_path = tmp.path().join("123.456");

// In remap_append_vec() we want to handle EEXIST (collisions), but we want to return all
// other errors
let next_append_vec_id = AtomicAppendVecId::new(457);
let num_collisions = AtomicUsize::new(0);
remap_append_vec_file(
123,
456,
&original_path,
&next_append_vec_id,
&num_collisions,
)
.unwrap();
}
}

0 comments on commit 9818815

Please sign in to comment.