runtime: do fewer syscalls in remap_append_vec_file (#336)

* runtime: do fewer syscalls in remap_append_vec_file

Use renameat2(src, dest, NOREPLACE) as an atomic version of if
statx(dest).is_err() { rename(src, dest) }.

We have high inode contention during storage rebuild and this saves 1
fs syscall for each appendvec.

* Address review feedback
This commit is contained in:
Alessandro Decina 2024-03-21 14:28:23 +11:00 committed by GitHub
parent 11aa06d24f
commit 981881544c
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 137 additions and 18 deletions

1
Cargo.lock generated
View File

@ -6997,6 +6997,7 @@ dependencies = [
"index_list",
"itertools",
"lazy_static",
"libc",
"libsecp256k1",
"log",
"lru",

View File

@ -5695,6 +5695,7 @@ dependencies = [
"index_list",
"itertools",
"lazy_static",
"libc",
"log",
"lru",
"lz4",

View File

@ -28,6 +28,7 @@ im = { workspace = true, features = ["rayon", "serde"] }
index_list = { workspace = true }
itertools = { workspace = true }
lazy_static = { workspace = true }
libc = { workspace = true }
log = { workspace = true }
lru = { workspace = true }
lz4 = { workspace = true }

View File

@ -1,3 +1,5 @@
#[cfg(target_os = "linux")]
use std::ffi::{CStr, CString};
use {
crate::{
bank::{builtins::BuiltinPrototype, Bank, BankFieldsToDeserialize, BankRc},
@ -655,30 +657,55 @@ pub(crate) fn reconstruct_single_storage(
)))
}
fn remap_append_vec_file(
// Remap the AppendVec ID to handle any duplicate IDs that may previously existed
// due to full snapshots and incremental snapshots generated from different
// nodes
pub(crate) fn remap_append_vec_file(
slot: Slot,
old_append_vec_id: SerializedAppendVecId,
append_vec_path: &Path,
next_append_vec_id: &AtomicAppendVecId,
num_collisions: &AtomicUsize,
) -> io::Result<(AppendVecId, PathBuf)> {
// Remap the AppendVec ID to handle any duplicate IDs that may previously existed
// due to full snapshots and incremental snapshots generated from different nodes
#[cfg(target_os = "linux")]
let append_vec_path_cstr = cstring_from_path(append_vec_path)?;
let mut remapped_append_vec_path = append_vec_path.to_path_buf();
// Break out of the loop in the following situations:
// 1. The new ID is the same as the original ID. This means we do not need to
// rename the file, since the ID is the "correct" one already.
// 2. There is not a file already at the new path. This means it is safe to
// rename the file to this new path.
let (remapped_append_vec_id, remapped_append_vec_path) = loop {
let remapped_append_vec_id = next_append_vec_id.fetch_add(1, Ordering::AcqRel);
let remapped_file_name = AccountsFile::file_name(slot, remapped_append_vec_id);
let remapped_append_vec_path = append_vec_path.parent().unwrap().join(remapped_file_name);
// Break out of the loop in the following situations:
// 1. The new ID is the same as the original ID. This means we do not need to
// rename the file, since the ID is the "correct" one already.
// 2. There is not a file already at the new path. This means it is safe to
// rename the file to this new path.
// **DEVELOPER NOTE:** Keep this check last so that it can short-circuit if
// possible.
if old_append_vec_id == remapped_append_vec_id as SerializedAppendVecId
|| std::fs::metadata(&remapped_append_vec_path).is_err()
// this can only happen in the first iteration of the loop
if old_append_vec_id == remapped_append_vec_id as SerializedAppendVecId {
break (remapped_append_vec_id, remapped_append_vec_path);
}
let remapped_file_name = AccountsFile::file_name(slot, remapped_append_vec_id);
remapped_append_vec_path = append_vec_path.parent().unwrap().join(remapped_file_name);
#[cfg(target_os = "linux")]
{
let remapped_append_vec_path_cstr = cstring_from_path(&remapped_append_vec_path)?;
// On linux we use renameat2(NO_REPLACE) instead of IF metadata(path).is_err() THEN
// rename() in order to save a statx() syscall.
match rename_no_replace(&append_vec_path_cstr, &remapped_append_vec_path_cstr) {
// If the file was successfully renamed, break out of the loop
Ok(_) => break (remapped_append_vec_id, remapped_append_vec_path),
// If there's already a file at the new path, continue so we try
// the next ID
Err(e) if e.kind() == io::ErrorKind::AlreadyExists => {}
Err(e) => return Err(e),
}
}
#[cfg(not(target_os = "linux"))]
if std::fs::metadata(&remapped_append_vec_path).is_err() {
break (remapped_append_vec_id, remapped_append_vec_path);
}
@ -686,7 +713,10 @@ fn remap_append_vec_file(
// and try again.
num_collisions.fetch_add(1, Ordering::Relaxed);
};
// Only rename the file if the new ID is actually different from the original.
// Only rename the file if the new ID is actually different from the original. In the target_os
// = linux case, we have already renamed if necessary.
#[cfg(not(target_os = "linux"))]
if old_append_vec_id != remapped_append_vec_id as SerializedAppendVecId {
std::fs::rename(append_vec_path, &remapped_append_vec_path)?;
}
@ -953,3 +983,32 @@ where
ReconstructedAccountsDbInfo { accounts_data_len },
))
}
// Rename `src` to `dest` only if `dest` doesn't already exist.
#[cfg(target_os = "linux")]
fn rename_no_replace(src: &CStr, dest: &CStr) -> io::Result<()> {
let ret = unsafe {
libc::renameat2(
libc::AT_FDCWD,
src.as_ptr() as *const _,
libc::AT_FDCWD,
dest.as_ptr() as *const _,
libc::RENAME_NOREPLACE,
)
};
if ret == -1 {
return Err(io::Error::last_os_error());
}
Ok(())
}
#[cfg(target_os = "linux")]
fn cstring_from_path(path: &Path) -> io::Result<CString> {
// It is better to allocate here than use the stack. Jemalloc is going to give us a chunk of a
// preallocated small arena anyway. Instead if we used the stack since PATH_MAX=4096 it would
// result in LLVM inserting a stack probe, see
// https://docs.rs/compiler_builtins/latest/compiler_builtins/probestack/index.html.
CString::new(path.as_os_str().as_encoded_bytes())
.map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))
}

View File

@ -3,8 +3,8 @@ mod serde_snapshot_tests {
use {
crate::{
serde_snapshot::{
newer, reconstruct_accountsdb_from_fields, SerdeStyle, SerializableAccountsDb,
SnapshotAccountsDbFields, TypeContext,
newer, reconstruct_accountsdb_from_fields, remap_append_vec_file, SerdeStyle,
SerializableAccountsDb, SnapshotAccountsDbFields, TypeContext,
},
snapshot_utils::{get_storages_to_serialize, StorageAndNextAppendVecId},
},
@ -34,12 +34,17 @@ mod serde_snapshot_tests {
rent_collector::RentCollector,
},
std::{
fs::File,
io::{BufReader, Cursor, Read, Write},
ops::RangeFull,
path::{Path, PathBuf},
sync::{atomic::Ordering, Arc},
sync::{
atomic::{AtomicUsize, Ordering},
Arc,
},
},
tempfile::TempDir,
test_case::test_case,
};
fn linear_ancestors(end_slot: u64) -> Ancestors {
@ -845,4 +850,56 @@ mod serde_snapshot_tests {
);
}
}
// no remap needed
#[test_case(456, 456, 456, 0, |_| {})]
// remap from 456 to 457, no collisions
#[test_case(456, 457, 457, 0, |_| {})]
// attempt to remap from 456 to 457, but there's a collision, so we get 458
#[test_case(456, 457, 458, 1, |tmp| {
File::create(tmp.join("123.457")).unwrap();
})]
fn test_remap_append_vec_file(
old_id: usize,
next_id: usize,
expected_remapped_id: usize,
expected_collisions: usize,
become_ungovernable: impl FnOnce(&Path),
) {
let tmp = tempfile::tempdir().unwrap();
let old_path = tmp.path().join(format!("123.{old_id}"));
let expected_remapped_path = tmp.path().join(format!("123.{expected_remapped_id}"));
File::create(&old_path).unwrap();
become_ungovernable(tmp.path());
let next_append_vec_id = AtomicAppendVecId::new(next_id as u32);
let num_collisions = AtomicUsize::new(0);
let (remapped_id, remapped_path) =
remap_append_vec_file(123, old_id, &old_path, &next_append_vec_id, &num_collisions)
.unwrap();
assert_eq!(remapped_id as usize, expected_remapped_id);
assert_eq!(&remapped_path, &expected_remapped_path);
assert_eq!(num_collisions.load(Ordering::Relaxed), expected_collisions);
}
#[test]
#[should_panic(expected = "No such file or directory")]
fn test_remap_append_vec_file_error() {
let tmp = tempfile::tempdir().unwrap();
let original_path = tmp.path().join("123.456");
// In remap_append_vec() we want to handle EEXIST (collisions), but we want to return all
// other errors
let next_append_vec_id = AtomicAppendVecId::new(457);
let num_collisions = AtomicUsize::new(0);
remap_append_vec_file(
123,
456,
&original_path,
&next_append_vec_id,
&num_collisions,
)
.unwrap();
}
}