// solana/runtime/src/bucket_map_holder.rs

use {
crate::{
accounts_index::{AccountsIndexConfig, IndexLimitMb, IndexValue},
bucket_map_holder_stats::BucketMapHolderStats,
in_mem_accounts_index::InMemAccountsIndex,
waitable_condvar::WaitableCondvar,
},
solana_bucket_map::bucket_map::{BucketMap, BucketMapConfig},
solana_measure::measure::Measure,
solana_sdk::{
clock::{Slot, DEFAULT_MS_PER_SLOT},
timing::AtomicInterval,
},
std::{
fmt::Debug,
sync::{
atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering},
Arc,
},
time::Duration,
},
};
/// Rolling age counter; advances roughly once per `AGE_MS` and wraps at
/// `u8::MAX`, so all age arithmetic in this file uses wrapping operations.
pub type Age = u8;
const AGE_MS: u64 = DEFAULT_MS_PER_SLOT; // match one age per slot time
// 10 GB limit for in-mem idx. In practice, we don't get this high. This tunes how aggressively to save items we expect to use soon.
pub const DEFAULT_DISK_INDEX: Option<usize> = Some(10_000);
/// Shared state coordinating the accounts index's optional disk backing and the
/// rolling 'age' that bg threads use to decide when in-mem entries are flushed.
pub struct BucketMapHolder<T: IndexValue> {
    /// disk-backed buckets; `None` when the index is configured in-mem only
    pub disk: Option<BucketMap<(Slot, T)>>,
    /// number of buckets that have reported a completed flush at the current age
    pub count_buckets_flushed: AtomicUsize,
    /// These three ages are individual atomics because their values are read many times from code during runtime.
    /// Instead of accessing the single age and doing math each time, each value is incremented each time the age occurs, which is ~400ms.
    /// Callers can ask for the precomputed value they already want.
    /// rolling 'current' age
    pub age: AtomicU8,
    /// rolling age that is 'ages_to_stay_in_cache' + 'age'
    pub future_age_to_flush: AtomicU8,
    /// rolling age that is effectively 'age' - 1
    /// these items are expected to be flushed from the accounts write cache or otherwise modified before this age occurs
    pub future_age_to_flush_cached: AtomicU8,
    /// metrics for bg flushing and index activity
    pub stats: BucketMapHolderStats,
    /// tracks elapsed time since the age last advanced
    age_timer: AtomicInterval,
    // used by bg processing to know when any bucket has become dirty
    pub wait_dirty_or_aged: Arc<WaitableCondvar>,
    /// round-robin cursor handing out the next bin to a flushing thread
    next_bucket_to_flush: AtomicUsize,
    /// number of bins the index is divided into
    bins: usize,
    /// thread count supplied at construction (not read within this file)
    pub threads: usize,
    // how much mb are we allowed to keep in the in-mem index?
    // Rest goes to disk.
    pub mem_budget_mb: Option<usize>,
    /// how many ages should elapse from the last time an item is used where the item will remain in the cache
    pub ages_to_stay_in_cache: Age,
    /// startup is a special time for flush to focus on moving everything to disk as fast and efficiently as possible
    /// with less thread count limitations. LRU and access patterns are not important. Freeing memory
    /// and writing to disk in parallel are.
    /// Note startup is an optimization and is not required for correctness.
    startup: AtomicBool,
}
impl<T: IndexValue> Debug for BucketMapHolder<T> {
    // Deliberately a no-op: emits nothing. Presumably exists only so types
    // containing a BucketMapHolder can derive/implement Debug — confirm before
    // changing to real output.
    fn fmt(&self, _f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        Ok(())
    }
}
#[allow(clippy::mutex_atomic)]
impl<T: IndexValue> BucketMapHolder<T> {
/// is the accounts index using disk as a backing store?
/// (true exactly when `disk` buckets were allocated in `new`)
pub fn is_disk_index_enabled(&self) -> bool {
    self.disk.is_some()
}
/// Advance the rolling age by one (wrapping at u8::MAX), zeroing the
/// flushed-bucket count and waking all bg threads so they can scan at the
/// new age.
///
/// Panics if fewer than `bins` buckets were flushed at the previous age —
/// callers must only advance after the previous age was fully flushed.
pub fn increment_age(&self) {
    // since we are about to change age, there are now 0 buckets that have been flushed at this age
    // this should happen before the age.fetch_add
    // Otherwise, as soon as we increment the age, a thread could race us and flush before we swap this out since it detects the age has moved forward and a bucket will be eligible for flushing.
    let previous = self.count_buckets_flushed.swap(0, Ordering::AcqRel);
    // fetch_add is defined to wrap.
    // That's what we want. 0..255, then back to 0.
    self.age.fetch_add(1, Ordering::Release);
    self.future_age_to_flush.fetch_add(1, Ordering::Release);
    self.future_age_to_flush_cached
        .fetch_add(1, Ordering::Release);
    assert!(
        previous >= self.bins,
        "previous: {}, bins: {}",
        previous,
        self.bins
    ); // we should not have increased age before previous age was fully flushed
    self.wait_dirty_or_aged.notify_all(); // notify all because we can age scan in parallel
}
/// The precomputed future age at which an item observed now is expected to
/// be flushed. `is_cached` selects the variant for items held in the
/// accounts write cache (effectively current age - 1) versus normal items
/// (current age + ages_to_stay_in_cache).
pub fn future_age_to_flush(&self, is_cached: bool) -> Age {
    let source = if is_cached {
        &self.future_age_to_flush_cached
    } else {
        &self.future_age_to_flush
    };
    source.load(Ordering::Acquire)
}
/// Has at least one age interval elapsed since the timer last fired?
// note that when this returns true, state of age_timer is modified,
// so a true result must be acted upon (the caller advances the age).
fn has_age_interval_elapsed(&self) -> bool {
    self.age_timer.should_update(self.age_interval_ms())
}
/// used by bg processes to determine # active threads and how aggressively to flush
pub fn get_startup(&self) -> bool {
    self.startup.load(Ordering::Relaxed)
}
/// startup=true causes:
///      in mem to act in a way that flushes to disk asap
/// startup=false is 'normal' operation
/// When leaving startup mode, this blocks in `wait_for_idle` until bg threads
/// have quiesced before clearing the flag.
pub fn set_startup(&self, value: bool) {
    if !value {
        self.wait_for_idle();
    }
    self.startup.store(value, Ordering::Relaxed)
}
/// return when the bg threads have reached an 'idle' state
pub(crate) fn wait_for_idle(&self) {
    assert!(self.get_startup());
    if self.disk.is_none() {
        // no disk index means no bg flushing, so we are trivially idle
        return;
    }
    // Once the age has advanced twice past its value now, every bin has been
    // scanned at least once since we started waiting, so bg work is idle.
    let target_age = self.current_age().wrapping_add(2);
    loop {
        self.wait_dirty_or_aged
            .wait_timeout(Duration::from_millis(self.age_interval_ms()));
        if self.current_age() == target_age {
            break;
        }
    }
}
/// the current rolling age (wraps at u8::MAX)
pub fn current_age(&self) -> Age {
    self.age.load(Ordering::Acquire)
}
/// Record that one bucket finished flushing at the current age.
/// When `can_advance_age`, also attempt to advance the age if this was the
/// last outstanding bucket and the age interval has elapsed.
pub fn bucket_flushed_at_current_age(&self, can_advance_age: bool) {
    // fetch_add returns the pre-increment value; +1 gives the new count
    let flushed_so_far = self.count_buckets_flushed.fetch_add(1, Ordering::AcqRel) + 1;
    if !can_advance_age {
        return;
    }
    let all_flushed = self.all_buckets_flushed_at_current_age_internal(flushed_so_far);
    self.maybe_advance_age_internal(all_flushed);
}
/// have all buckets been flushed at the current age?
pub fn all_buckets_flushed_at_current_age(&self) -> bool {
    self.all_buckets_flushed_at_current_age_internal(self.count_buckets_flushed())
}
/// have all buckets been flushed at the current age, given a count the
/// caller already loaded (avoids a second atomic read)?
fn all_buckets_flushed_at_current_age_internal(&self, count_buckets_flushed: usize) -> bool {
    count_buckets_flushed >= self.bins
}
/// number of buckets that have reported a flush at the current age
pub fn count_buckets_flushed(&self) -> usize {
    self.count_buckets_flushed.load(Ordering::Acquire)
}
/// if all buckets are flushed at the current age and time has elapsed, then advance age
/// Returns true iff the age was advanced.
pub fn maybe_advance_age(&self) -> bool {
    self.maybe_advance_age_internal(self.all_buckets_flushed_at_current_age())
}
/// if all buckets are flushed at the current age and time has elapsed, then advance age
/// Returns true iff the age was advanced.
fn maybe_advance_age_internal(&self, all_buckets_flushed_at_current_age: bool) -> bool {
    // call has_age_interval_elapsed last since calling it modifies state on success;
    // the short-circuit && guarantees the timer is only consumed when we will advance
    if all_buckets_flushed_at_current_age && self.has_age_interval_elapsed() {
        self.increment_age();
        true
    } else {
        false
    }
}
/// Construct the holder. `config` controls cache aging, drive locations, and
/// whether a disk index is allocated (and with what memory budget).
pub fn new(bins: usize, config: &Option<AccountsIndexConfig>, threads: usize) -> Self {
    const DEFAULT_AGE_TO_STAY_IN_CACHE: Age = 5;
    let ages_to_stay_in_cache = config
        .as_ref()
        .and_then(|config| config.ages_to_stay_in_cache)
        .unwrap_or(DEFAULT_AGE_TO_STAY_IN_CACHE);

    let mut bucket_config = BucketMapConfig::new(bins);
    bucket_config.drives = config.as_ref().and_then(|config| config.drives.clone());

    let index_limit = config
        .as_ref()
        .map(|config| &config.index_limit_mb)
        .unwrap_or(&IndexLimitMb::Unspecified);
    let mem_budget_mb = match index_limit {
        // creator said to use disk idx with a specific limit
        IndexLimitMb::Limit(mb) => Some(*mb),
        // creator said InMemOnly, so no disk index
        IndexLimitMb::InMemOnly => None,
        // whatever started us didn't specify whether to use the disk index
        IndexLimitMb::Unspecified => {
            let started_from_validator = config
                .as_ref()
                .map(|config| config.started_from_validator)
                .unwrap_or_default();
            // Note this env var means the opposite of the default. The default now is disk index is on.
            // The env var is only honored when we were NOT started from a validator
            // (only tests, benches, etc. should be able to turn the disk index off this way).
            let env_disables_disk_index = !started_from_validator
                && std::env::var("SOLANA_TEST_ACCOUNTS_INDEX_MEMORY_LIMIT_MB").is_ok();
            if env_disables_disk_index {
                None
            } else {
                // if validator does not specify disk index limit or specify in mem only, then this is the default
                DEFAULT_DISK_INDEX
            }
        }
    };
    // only allocate disk buckets if mem_budget_mb is Some
    let disk = mem_budget_mb.map(|_| BucketMap::new(bucket_config));
    Self {
        disk,
        ages_to_stay_in_cache,
        count_buckets_flushed: AtomicUsize::default(),
        // age = 0
        age: AtomicU8::default(),
        // future age = age (=0) + ages_to_stay_in_cache
        future_age_to_flush: AtomicU8::new(ages_to_stay_in_cache),
        // effectively age (0) - 1. So, the oldest possible age from 'now'
        future_age_to_flush_cached: AtomicU8::new(0_u8.wrapping_sub(1)),
        stats: BucketMapHolderStats::new(bins),
        wait_dirty_or_aged: Arc::default(),
        next_bucket_to_flush: AtomicUsize::new(0),
        age_timer: AtomicInterval::default(),
        bins,
        startup: AtomicBool::default(),
        mem_budget_mb,
        threads,
    }
}
// get the next bucket to flush, with the idea that the previous bucket
// is perhaps being flushed by another thread already.
pub fn next_bucket_to_flush(&self) -> usize {
    // CAS loop equivalent to `fetch_update`: advance the cursor mod `bins`
    // and hand back the pre-increment value.
    loop {
        let current = self.next_bucket_to_flush.load(Ordering::Acquire);
        let next = (current + 1) % self.bins;
        if self
            .next_bucket_to_flush
            .compare_exchange_weak(current, next, Ordering::AcqRel, Ordering::Acquire)
            .is_ok()
        {
            return current;
        }
    }
}
/// milliseconds per age interval (currently a constant)
/// prepare for this to be dynamic if necessary
/// For example, maybe startup has a shorter age interval.
fn age_interval_ms(&self) -> u64 {
    AGE_MS
}
/// return an amount of ms to sleep, or None to keep working.
/// Projects the current flush rate forward: if at this rate we would finish
/// all bins before the target fraction of the interval, sleep briefly to
/// spread the work out; otherwise keep going.
fn throttling_wait_ms_internal(
    &self,
    interval_ms: u64,
    elapsed_ms: u64,
    bins_flushed: u64,
) -> Option<u64> {
    const TARGET_PERCENT: u64 = 90; // aim to finish in 90% of the allocated time
    const MS_PER_S: u64 = 1_000;
    let budget_ms = (interval_ms * TARGET_PERCENT / 100).saturating_sub(elapsed_ms);
    let bins_left = (self.bins as u64).saturating_sub(bins_flushed);
    // no remaining work, no remaining budget, or no rate data yet:
    // any of these means 'do not wait due to progress'
    if bins_left == 0 || budget_ms == 0 || elapsed_ms == 0 || bins_flushed == 0 {
        return None;
    }
    let bins_per_s = bins_flushed * MS_PER_S / elapsed_ms;
    let projected_bins_in_budget = bins_per_s * budget_ms / MS_PER_S;
    if projected_bins_in_budget > bins_left {
        // wait because we predict we will finish prior to the target
        Some(1)
    } else {
        // do not wait because we predict we will finish after the target
        None
    }
}
/// Check progress this age.
/// Return ms to wait to get closer to the wait target and spread out work over the entire age interval.
/// Goal is to avoid cpu spikes at beginning of age interval.
fn throttling_wait_ms(&self) -> Option<u64> {
    // gather the live inputs, then delegate to the pure calculation
    let interval_ms = self.age_interval_ms();
    let elapsed_ms = self.age_timer.elapsed_ms();
    let bins_flushed = self.count_buckets_flushed() as u64;
    self.throttling_wait_ms_internal(interval_ms, elapsed_ms, bins_flushed)
}
/// true if this thread can sleep
fn should_thread_sleep(&self) -> bool {
    let flushed = self.count_buckets_flushed();
    if flushed >= self.bins {
        // all bins flushed, so this thread can sleep
        return true;
    }
    // sleep if the bins that still need a flush are already covered by the
    // currently-active threads (at least one active thread per remaining bin)
    let active = self.stats.active_threads.load(Ordering::Relaxed);
    flushed.saturating_add(active as usize) >= self.bins
}
// intended to execute in a bg thread
/// Main loop for one background flush thread. Alternates between sleeping
/// (when other threads already cover the remaining bins, or to throttle so
/// flushing is spread across the age interval) and flushing bins round-robin.
/// Returns when any flag in `exit` is set. Only a thread with
/// `can_advance_age` will attempt to advance the age.
pub fn background(
    &self,
    exit: Vec<Arc<AtomicBool>>,
    in_mem: Vec<Arc<InMemAccountsIndex<T>>>,
    can_advance_age: bool,
) {
    let bins = in_mem.len();
    let flush = self.disk.is_some();
    let mut throttling_wait_ms = None;
    loop {
        if !flush {
            // no disk index: nothing to flush; just wake periodically to report stats
            self.wait_dirty_or_aged.wait_timeout(Duration::from_millis(
                self.stats.remaining_until_next_interval(),
            ));
        } else if self.should_thread_sleep() || throttling_wait_ms.is_some() {
            // sleep until the sooner of the next age boundary or the next stats interval
            let mut wait = std::cmp::min(
                self.age_timer
                    .remaining_until_next_interval(self.age_interval_ms()),
                self.stats.remaining_until_next_interval(),
            );
            if !can_advance_age {
                // if this thread cannot advance age, then make sure we don't sleep 0
                wait = wait.max(1);
            }
            if let Some(throttling_wait_ms) = throttling_wait_ms {
                // a throttle request caps the sleep and is accounted separately
                self.stats
                    .bg_throttling_wait_us
                    .fetch_add(throttling_wait_ms * 1000, Ordering::Relaxed);
                wait = std::cmp::min(throttling_wait_ms, wait);
            }
            let mut m = Measure::start("wait");
            self.wait_dirty_or_aged
                .wait_timeout(Duration::from_millis(wait));
            m.stop();
            self.stats
                .bg_waiting_us
                .fetch_add(m.as_us(), Ordering::Relaxed);
            // likely some time has elapsed. May have been waiting for age time interval to elapse.
            if can_advance_age {
                self.maybe_advance_age();
            }
        }
        throttling_wait_ms = None;
        if exit.iter().any(|exit| exit.load(Ordering::Relaxed)) {
            break;
        }
        // count this thread as active while it scans bins
        self.stats.active_threads.fetch_add(1, Ordering::Relaxed);
        for _ in 0..bins {
            if flush {
                let index = self.next_bucket_to_flush();
                in_mem[index].flush(can_advance_age);
            }
            self.stats.report_stats(self);
            if self.all_buckets_flushed_at_current_age() {
                break;
            }
            // pause the scan early if we are ahead of schedule for this age
            throttling_wait_ms = self.throttling_wait_ms();
            if throttling_wait_ms.is_some() {
                break;
            }
        }
        self.stats.active_threads.fetch_sub(1, Ordering::Relaxed);
    }
}
}
#[cfg(test)]
pub mod tests {
    use {super::*, rayon::prelude::*, std::time::Instant};

    /// Hammer `next_bucket_to_flush` from many rayon threads and verify each
    /// bin is handed out exactly the same number of times (round-robin
    /// fairness under contention).
    #[test]
    fn test_next_bucket_to_flush() {
        solana_logger::setup();
        let bins = 4;
        let test = BucketMapHolder::<u64>::new(bins, &Some(AccountsIndexConfig::default()), 1);
        let visited = (0..bins)
            .map(|_| AtomicUsize::default())
            .collect::<Vec<_>>();
        let iterations = bins * 30;
        let threads = bins * 4;
        let expected = threads * iterations / bins;
        (0..threads).into_par_iter().for_each(|_| {
            (0..iterations).for_each(|_| {
                let bin = test.next_bucket_to_flush();
                visited[bin].fetch_add(1, Ordering::Relaxed);
            });
        });
        visited.iter().enumerate().for_each(|(bin, visited)| {
            assert_eq!(visited.load(Ordering::Relaxed), expected, "bin: {bin}")
        });
    }

    /// Verify the three precomputed ages start correctly (current=0,
    /// future=ages_to_stay_in_cache, cached-future=current-1 wrapped to
    /// u8::MAX) and all advance in lockstep on increment_age.
    #[test]
    fn test_ages() {
        solana_logger::setup();
        let bins = 4;
        let test = BucketMapHolder::<u64>::new(bins, &Some(AccountsIndexConfig::default()), 1);
        assert_eq!(0, test.current_age());
        assert_eq!(test.ages_to_stay_in_cache, test.future_age_to_flush(false));
        assert_eq!(u8::MAX, test.future_age_to_flush(true));
        (0..bins).for_each(|_| {
            test.bucket_flushed_at_current_age(false);
        });
        test.increment_age();
        assert_eq!(1, test.current_age());
        assert_eq!(
            test.ages_to_stay_in_cache + 1,
            test.future_age_to_flush(false)
        );
        assert_eq!(0, test.future_age_to_flush(true));
    }

    /// Walk the age past two full u8 wraps (513 increments) to confirm the
    /// wrapping arithmetic and the flushed-count reset on each increment.
    #[test]
    fn test_age_increment() {
        solana_logger::setup();
        let bins = 4;
        let test = BucketMapHolder::<u64>::new(bins, &Some(AccountsIndexConfig::default()), 1);
        for age in 0..513 {
            assert_eq!(test.current_age(), (age % 256) as Age);
            // inc all
            for _ in 0..bins {
                assert!(!test.all_buckets_flushed_at_current_age());
                // cannot call this because based on timing, it may fire: test.bucket_flushed_at_current_age();
            }
            // this would normally happen once time went off and all buckets had been flushed at the previous age
            test.count_buckets_flushed
                .fetch_add(bins, Ordering::Release);
            test.increment_age();
        }
    }

    /// Spot-check the throttling projection math at several (elapsed, flushed)
    /// points around the 90% finish target.
    #[test]
    fn test_throttle() {
        solana_logger::setup();
        let bins = 128;
        let test = BucketMapHolder::<u64>::new(bins, &Some(AccountsIndexConfig::default()), 1);
        let bins = test.bins as u64;
        let interval_ms = test.age_interval_ms();
        // 90% of time elapsed, all but 1 bins flushed, should not wait since we'll end up right on time
        let elapsed_ms = interval_ms * 89 / 100;
        let bins_flushed = bins - 1;
        let result = test.throttling_wait_ms_internal(interval_ms, elapsed_ms, bins_flushed);
        assert_eq!(result, None);
        // 10% of time, all bins but 1, should wait
        let elapsed_ms = interval_ms / 10;
        let bins_flushed = bins - 1;
        let result = test.throttling_wait_ms_internal(interval_ms, elapsed_ms, bins_flushed);
        assert_eq!(result, Some(1));
        // 5% of time, 8% of bins, should wait. target is 90%. These #s roughly work
        let elapsed_ms = interval_ms * 5 / 100;
        let bins_flushed = bins * 8 / 100;
        let result = test.throttling_wait_ms_internal(interval_ms, elapsed_ms, bins_flushed);
        assert_eq!(result, Some(1));
        // 11% of time, 12% of bins, should NOT wait. target is 90%. These #s roughly work
        let elapsed_ms = interval_ms * 11 / 100;
        let bins_flushed = bins * 12 / 100;
        let result = test.throttling_wait_ms_internal(interval_ms, elapsed_ms, bins_flushed);
        assert_eq!(result, None);
    }

    /// An explicit mem limit (even 0) allocates the disk index.
    #[test]
    fn test_disk_index_enabled() {
        let bins = 1;
        let config = AccountsIndexConfig {
            index_limit_mb: IndexLimitMb::Limit(0),
            ..AccountsIndexConfig::default()
        };
        let test = BucketMapHolder::<u64>::new(bins, &Some(config), 1);
        assert!(test.is_disk_index_enabled());
    }

    /// Verify the age can be driven forward by time + flush notifications
    /// from multiple threads. Deliberately generous with wall-clock budget.
    #[test]
    fn test_age_time() {
        solana_logger::setup();
        let bins = 1;
        let test = BucketMapHolder::<u64>::new(bins, &Some(AccountsIndexConfig::default()), 1);
        let threads = 2;
        let time = AGE_MS * 8 / 3;
        let expected = (time / AGE_MS) as Age;
        let now = Instant::now();
        test.bucket_flushed_at_current_age(true); // done with age 0
        (0..threads).into_par_iter().for_each(|_| {
            // This test used to be more strict with time, but in a parallel, multi test environment,
            // sometimes threads starve and this test intermittently fails. So, give it more time than it should require.
            // This may be aggravated by the strategy of only allowing thread 0 to advance the age.
            while now.elapsed().as_millis() < (time as u128) * 100 {
                if test.maybe_advance_age() {
                    test.bucket_flushed_at_current_age(true);
                }
                if test.current_age() >= expected {
                    break;
                }
            }
        });
        assert!(
            test.current_age() >= expected,
            "{}, {}",
            test.current_age(),
            expected
        );
    }

    /// End-to-end single age step: flush all bins, sleep past the interval,
    /// then maybe_advance_age must move to age 1 with the count reset.
    #[test]
    fn test_age_broad() {
        solana_logger::setup();
        let bins = 4;
        let test = BucketMapHolder::<u64>::new(bins, &Some(AccountsIndexConfig::default()), 1);
        assert_eq!(test.current_age(), 0);
        for _ in 0..bins {
            assert!(!test.all_buckets_flushed_at_current_age());
            test.bucket_flushed_at_current_age(true);
        }
        std::thread::sleep(std::time::Duration::from_millis(AGE_MS * 2));
        test.maybe_advance_age();
        assert_eq!(test.current_age(), 1);
        assert!(!test.all_buckets_flushed_at_current_age());
    }
}