runtime: do fewer syscalls in remap_append_vec_file (#336)
* runtime: do fewer syscalls in remap_append_vec_file Use renameat2(src, dest, NOREPLACE) as an atomic version of if statx(dest).is_err() { rename(src, dest) }. We have high inode contention during storage rebuild and this saves 1 fs syscall for each appendvec. * Address review feedback
This commit is contained in:
parent
11aa06d24f
commit
981881544c
|
@ -6997,6 +6997,7 @@ dependencies = [
|
|||
"index_list",
|
||||
"itertools",
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"libsecp256k1",
|
||||
"log",
|
||||
"lru",
|
||||
|
|
|
@ -5695,6 +5695,7 @@ dependencies = [
|
|||
"index_list",
|
||||
"itertools",
|
||||
"lazy_static",
|
||||
"libc",
|
||||
"log",
|
||||
"lru",
|
||||
"lz4",
|
||||
|
|
|
@ -28,6 +28,7 @@ im = { workspace = true, features = ["rayon", "serde"] }
|
|||
index_list = { workspace = true }
|
||||
itertools = { workspace = true }
|
||||
lazy_static = { workspace = true }
|
||||
libc = { workspace = true }
|
||||
log = { workspace = true }
|
||||
lru = { workspace = true }
|
||||
lz4 = { workspace = true }
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
#[cfg(target_os = "linux")]
|
||||
use std::ffi::{CStr, CString};
|
||||
use {
|
||||
crate::{
|
||||
bank::{builtins::BuiltinPrototype, Bank, BankFieldsToDeserialize, BankRc},
|
||||
|
@ -655,30 +657,55 @@ pub(crate) fn reconstruct_single_storage(
|
|||
)))
|
||||
}
|
||||
|
||||
fn remap_append_vec_file(
|
||||
// Remap the AppendVec ID to handle any duplicate IDs that may previously existed
|
||||
// due to full snapshots and incremental snapshots generated from different
|
||||
// nodes
|
||||
pub(crate) fn remap_append_vec_file(
|
||||
slot: Slot,
|
||||
old_append_vec_id: SerializedAppendVecId,
|
||||
append_vec_path: &Path,
|
||||
next_append_vec_id: &AtomicAppendVecId,
|
||||
num_collisions: &AtomicUsize,
|
||||
) -> io::Result<(AppendVecId, PathBuf)> {
|
||||
// Remap the AppendVec ID to handle any duplicate IDs that may previously existed
|
||||
// due to full snapshots and incremental snapshots generated from different nodes
|
||||
#[cfg(target_os = "linux")]
|
||||
let append_vec_path_cstr = cstring_from_path(append_vec_path)?;
|
||||
|
||||
let mut remapped_append_vec_path = append_vec_path.to_path_buf();
|
||||
|
||||
// Break out of the loop in the following situations:
|
||||
// 1. The new ID is the same as the original ID. This means we do not need to
|
||||
// rename the file, since the ID is the "correct" one already.
|
||||
// 2. There is not a file already at the new path. This means it is safe to
|
||||
// rename the file to this new path.
|
||||
let (remapped_append_vec_id, remapped_append_vec_path) = loop {
|
||||
let remapped_append_vec_id = next_append_vec_id.fetch_add(1, Ordering::AcqRel);
|
||||
let remapped_file_name = AccountsFile::file_name(slot, remapped_append_vec_id);
|
||||
let remapped_append_vec_path = append_vec_path.parent().unwrap().join(remapped_file_name);
|
||||
|
||||
// Break out of the loop in the following situations:
|
||||
// 1. The new ID is the same as the original ID. This means we do not need to
|
||||
// rename the file, since the ID is the "correct" one already.
|
||||
// 2. There is not a file already at the new path. This means it is safe to
|
||||
// rename the file to this new path.
|
||||
// **DEVELOPER NOTE:** Keep this check last so that it can short-circuit if
|
||||
// possible.
|
||||
if old_append_vec_id == remapped_append_vec_id as SerializedAppendVecId
|
||||
|| std::fs::metadata(&remapped_append_vec_path).is_err()
|
||||
// this can only happen in the first iteration of the loop
|
||||
if old_append_vec_id == remapped_append_vec_id as SerializedAppendVecId {
|
||||
break (remapped_append_vec_id, remapped_append_vec_path);
|
||||
}
|
||||
|
||||
let remapped_file_name = AccountsFile::file_name(slot, remapped_append_vec_id);
|
||||
remapped_append_vec_path = append_vec_path.parent().unwrap().join(remapped_file_name);
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
{
|
||||
let remapped_append_vec_path_cstr = cstring_from_path(&remapped_append_vec_path)?;
|
||||
|
||||
// On linux we use renameat2(NO_REPLACE) instead of IF metadata(path).is_err() THEN
|
||||
// rename() in order to save a statx() syscall.
|
||||
match rename_no_replace(&append_vec_path_cstr, &remapped_append_vec_path_cstr) {
|
||||
// If the file was successfully renamed, break out of the loop
|
||||
Ok(_) => break (remapped_append_vec_id, remapped_append_vec_path),
|
||||
// If there's already a file at the new path, continue so we try
|
||||
// the next ID
|
||||
Err(e) if e.kind() == io::ErrorKind::AlreadyExists => {}
|
||||
Err(e) => return Err(e),
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(not(target_os = "linux"))]
|
||||
if std::fs::metadata(&remapped_append_vec_path).is_err() {
|
||||
break (remapped_append_vec_id, remapped_append_vec_path);
|
||||
}
|
||||
|
||||
|
@ -686,7 +713,10 @@ fn remap_append_vec_file(
|
|||
// and try again.
|
||||
num_collisions.fetch_add(1, Ordering::Relaxed);
|
||||
};
|
||||
// Only rename the file if the new ID is actually different from the original.
|
||||
|
||||
// Only rename the file if the new ID is actually different from the original. In the target_os
|
||||
// = linux case, we have already renamed if necessary.
|
||||
#[cfg(not(target_os = "linux"))]
|
||||
if old_append_vec_id != remapped_append_vec_id as SerializedAppendVecId {
|
||||
std::fs::rename(append_vec_path, &remapped_append_vec_path)?;
|
||||
}
|
||||
|
@ -953,3 +983,32 @@ where
|
|||
ReconstructedAccountsDbInfo { accounts_data_len },
|
||||
))
|
||||
}
|
||||
|
||||
// Rename `src` to `dest` only if `dest` doesn't already exist.
|
||||
#[cfg(target_os = "linux")]
|
||||
fn rename_no_replace(src: &CStr, dest: &CStr) -> io::Result<()> {
|
||||
let ret = unsafe {
|
||||
libc::renameat2(
|
||||
libc::AT_FDCWD,
|
||||
src.as_ptr() as *const _,
|
||||
libc::AT_FDCWD,
|
||||
dest.as_ptr() as *const _,
|
||||
libc::RENAME_NOREPLACE,
|
||||
)
|
||||
};
|
||||
if ret == -1 {
|
||||
return Err(io::Error::last_os_error());
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
#[cfg(target_os = "linux")]
|
||||
fn cstring_from_path(path: &Path) -> io::Result<CString> {
|
||||
// It is better to allocate here than use the stack. Jemalloc is going to give us a chunk of a
|
||||
// preallocated small arena anyway. Instead if we used the stack since PATH_MAX=4096 it would
|
||||
// result in LLVM inserting a stack probe, see
|
||||
// https://docs.rs/compiler_builtins/latest/compiler_builtins/probestack/index.html.
|
||||
CString::new(path.as_os_str().as_encoded_bytes())
|
||||
.map_err(|e| io::Error::new(io::ErrorKind::InvalidInput, e))
|
||||
}
|
||||
|
|
|
@ -3,8 +3,8 @@ mod serde_snapshot_tests {
|
|||
use {
|
||||
crate::{
|
||||
serde_snapshot::{
|
||||
newer, reconstruct_accountsdb_from_fields, SerdeStyle, SerializableAccountsDb,
|
||||
SnapshotAccountsDbFields, TypeContext,
|
||||
newer, reconstruct_accountsdb_from_fields, remap_append_vec_file, SerdeStyle,
|
||||
SerializableAccountsDb, SnapshotAccountsDbFields, TypeContext,
|
||||
},
|
||||
snapshot_utils::{get_storages_to_serialize, StorageAndNextAppendVecId},
|
||||
},
|
||||
|
@ -34,12 +34,17 @@ mod serde_snapshot_tests {
|
|||
rent_collector::RentCollector,
|
||||
},
|
||||
std::{
|
||||
fs::File,
|
||||
io::{BufReader, Cursor, Read, Write},
|
||||
ops::RangeFull,
|
||||
path::{Path, PathBuf},
|
||||
sync::{atomic::Ordering, Arc},
|
||||
sync::{
|
||||
atomic::{AtomicUsize, Ordering},
|
||||
Arc,
|
||||
},
|
||||
},
|
||||
tempfile::TempDir,
|
||||
test_case::test_case,
|
||||
};
|
||||
|
||||
fn linear_ancestors(end_slot: u64) -> Ancestors {
|
||||
|
@ -845,4 +850,56 @@ mod serde_snapshot_tests {
|
|||
);
|
||||
}
|
||||
}
|
||||
|
||||
// no remap needed
|
||||
#[test_case(456, 456, 456, 0, |_| {})]
|
||||
// remap from 456 to 457, no collisions
|
||||
#[test_case(456, 457, 457, 0, |_| {})]
|
||||
// attempt to remap from 456 to 457, but there's a collision, so we get 458
|
||||
#[test_case(456, 457, 458, 1, |tmp| {
|
||||
File::create(tmp.join("123.457")).unwrap();
|
||||
})]
|
||||
fn test_remap_append_vec_file(
|
||||
old_id: usize,
|
||||
next_id: usize,
|
||||
expected_remapped_id: usize,
|
||||
expected_collisions: usize,
|
||||
become_ungovernable: impl FnOnce(&Path),
|
||||
) {
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let old_path = tmp.path().join(format!("123.{old_id}"));
|
||||
let expected_remapped_path = tmp.path().join(format!("123.{expected_remapped_id}"));
|
||||
File::create(&old_path).unwrap();
|
||||
|
||||
become_ungovernable(tmp.path());
|
||||
|
||||
let next_append_vec_id = AtomicAppendVecId::new(next_id as u32);
|
||||
let num_collisions = AtomicUsize::new(0);
|
||||
let (remapped_id, remapped_path) =
|
||||
remap_append_vec_file(123, old_id, &old_path, &next_append_vec_id, &num_collisions)
|
||||
.unwrap();
|
||||
assert_eq!(remapped_id as usize, expected_remapped_id);
|
||||
assert_eq!(&remapped_path, &expected_remapped_path);
|
||||
assert_eq!(num_collisions.load(Ordering::Relaxed), expected_collisions);
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[should_panic(expected = "No such file or directory")]
|
||||
fn test_remap_append_vec_file_error() {
|
||||
let tmp = tempfile::tempdir().unwrap();
|
||||
let original_path = tmp.path().join("123.456");
|
||||
|
||||
// In remap_append_vec() we want to handle EEXIST (collisions), but we want to return all
|
||||
// other errors
|
||||
let next_append_vec_id = AtomicAppendVecId::new(457);
|
||||
let num_collisions = AtomicUsize::new(0);
|
||||
remap_append_vec_file(
|
||||
123,
|
||||
456,
|
||||
&original_path,
|
||||
&next_append_vec_id,
|
||||
&num_collisions,
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue