diff --git a/Cargo.lock b/Cargo.lock
index 9777cc8b57..e4faaab967 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -4782,6 +4782,7 @@ dependencies = [
  "solana-version",
  "solana-vote-program",
  "static_assertions",
+ "sys-info",
  "systemstat",
  "tempfile",
  "thiserror",
@@ -6294,6 +6295,16 @@ dependencies = [
  "unicode-xid 0.2.2",
 ]
 
+[[package]]
+name = "sys-info"
+version = "0.9.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0b3a0d0aba8bf96a0e1ddfdc352fc53b3df7f39318c71854910c3c4b024ae52c"
+dependencies = [
+ "cc",
+ "libc",
+]
+
 [[package]]
 name = "sysctl"
 version = "0.4.3"
diff --git a/core/Cargo.toml b/core/Cargo.toml
index 090b9795b6..d6d4ce3689 100644
--- a/core/Cargo.toml
+++ b/core/Cargo.toml
@@ -58,6 +58,7 @@ solana-vote-program = { path = "../programs/vote", version = "=1.9.0" }
 tempfile = "3.2.0"
 thiserror = "1.0"
 solana-rayon-threadlimit = { path = "../rayon-threadlimit", version = "=1.9.0" }
+sys-info = "0.9.1"
 tokio = { version = "1", features = ["full"] }
 trees = "0.4.2"
 
diff --git a/core/src/system_monitor_service.rs b/core/src/system_monitor_service.rs
index d310aba208..07bea88a58 100644
--- a/core/src/system_monitor_service.rs
+++ b/core/src/system_monitor_service.rs
@@ -1,3 +1,4 @@
+use solana_sdk::timing::AtomicInterval;
 use std::{
     collections::HashMap,
     io::BufRead,
@@ -6,13 +7,15 @@ use std::{
         Arc,
     },
     thread::{self, sleep, Builder, JoinHandle},
-    time::{Duration, Instant},
+    time::Duration,
 };
 
 #[cfg(target_os = "linux")]
 use std::{fs::File, io::BufReader, path::Path};
 
-const SAMPLE_INTERVAL: Duration = Duration::from_secs(60);
+const MS_PER_S: u64 = 1_000;
+const SAMPLE_INTERVAL_UDP_MS: u64 = 60 * MS_PER_S;
+const SAMPLE_INTERVAL_MEM_MS: u64 = MS_PER_S;
 const SLEEP_INTERVAL: Duration = Duration::from_millis(500);
 
 #[cfg(target_os = "linux")]
@@ -173,21 +176,76 @@ impl SystemMonitorService {
         );
     }
 
+    /// Returns `numerator / denom` expressed as a percentage, or `0.0` when `denom` is
+    /// zero so that a zero total (e.g. `swap_total == 0`) never yields NaN or infinity.
+    fn calc_percent(numerator: u64, denom: u64) -> f32 {
+        if denom == 0 {
+            0.0
+        } else {
+            (numerator as f32 / denom as f32) * 100.0
+        }
+    }
+
+    /// Submits a `memory-stats` datapoint describing system memory and swap usage.
+    ///
+    /// Absolute sizes (`total`, `swap_total`, `used_bytes`) are reported in bytes; the
+    /// remaining fields are percentages. Nothing is reported if the memory query fails.
+    fn report_mem_stats() {
+        if let Ok(info) = sys_info::mem_info() {
+            // `sys_info::mem_info()` reports sizes in KB; scale absolute values to bytes.
+            const KB: u64 = 1_024;
+            datapoint_info!(
+                "memory-stats",
+                ("total", info.total * KB, i64),
+                ("swap_total", info.swap_total * KB, i64),
+                (
+                    "free_percent",
+                    Self::calc_percent(info.free, info.total),
+                    f64
+                ),
+                ("used_bytes", info.total.saturating_sub(info.avail) * KB, i64),
+                (
+                    "avail_percent",
+                    Self::calc_percent(info.avail, info.total),
+                    f64
+                ),
+                (
+                    "buffers_percent",
+                    Self::calc_percent(info.buffers, info.total),
+                    f64
+                ),
+                (
+                    "cached_percent",
+                    Self::calc_percent(info.cached, info.total),
+                    f64
+                ),
+                (
+                    "swap_free_percent",
+                    Self::calc_percent(info.swap_free, info.swap_total),
+                    f64
+                ),
+            )
+        }
+    }
+
     pub fn run(exit: Arc<AtomicBool>) {
         let mut udp_stats = None;
 
-        let mut now = Instant::now();
+        let udp_timer = AtomicInterval::default();
+        let mem_timer = AtomicInterval::default();
         loop {
             if exit.load(Ordering::Relaxed) {
                 break;
             }
 
-            if now.elapsed() >= SAMPLE_INTERVAL {
-                now = Instant::now();
-
+            if udp_timer.should_update(SAMPLE_INTERVAL_UDP_MS) {
                 SystemMonitorService::process_udp_stats(&mut udp_stats);
             }
 
+            if mem_timer.should_update(SAMPLE_INTERVAL_MEM_MS) {
+                SystemMonitorService::report_mem_stats();
+            }
+
             sleep(SLEEP_INTERVAL);
         }
     }
diff --git a/runtime/src/bucket_map_holder_stats.rs b/runtime/src/bucket_map_holder_stats.rs
index 1526d8e595..335b24be19 100644
--- a/runtime/src/bucket_map_holder_stats.rs
+++ b/runtime/src/bucket_map_holder_stats.rs
@@ -125,7 +125,7 @@ impl BucketMapHolderStats {
         }
     }
 
-    fn calc_percent(&self, ms: u64, elapsed_ms: u64) -> f32 {
+    fn calc_percent(ms: u64, elapsed_ms: u64) -> f32 {
         if elapsed_ms == 0 {
             0.0
         } else {
@@ -183,7 +183,7 @@ impl BucketMapHolderStats {
                 ("count", self.count.load(Ordering::Relaxed), i64),
                 (
                     "bg_waiting_percent",
-                    self.calc_percent(
+                    Self::calc_percent(
                         self.bg_waiting_us.swap(0, Ordering::Relaxed) / US_PER_MS,
                         thread_time_elapsed_ms
                     ),
@@ -191,7 +191,7 @@ impl BucketMapHolderStats {
                 ),
                 (
                     "bg_throttling_wait_percent",
-                    self.calc_percent(
+                    Self::calc_percent(
                         self.bg_throttling_wait_us.swap(0, Ordering::Relaxed) / US_PER_MS,
                         thread_time_elapsed_ms
                     ),
diff --git a/sdk/src/timing.rs b/sdk/src/timing.rs
index c1d2002fa0..c56fe59393 100644
--- a/sdk/src/timing.rs
+++ b/sdk/src/timing.rs
@@ -28,6 +28,7 @@ pub fn duration_as_s(d: &Duration) -> f32 {
     d.as_secs() as f32 + (d.subsec_nanos() as f32 / 1_000_000_000.0)
 }
 
+/// return timestamp as ms
 pub fn timestamp() -> u64 {
     let now = SystemTime::now()
         .duration_since(UNIX_EPOCH)
@@ -66,14 +67,18 @@ pub struct AtomicInterval {
 }
 
 impl AtomicInterval {
-    pub fn should_update(&self, interval_time: u64) -> bool {
-        self.should_update_ext(interval_time, true)
+    /// true if 'interval_time_ms' has elapsed since last time we returned true as long as it has been 'interval_time_ms' since this struct was created
+    pub fn should_update(&self, interval_time_ms: u64) -> bool {
+        self.should_update_ext(interval_time_ms, true)
     }
 
-    pub fn should_update_ext(&self, interval_time: u64, skip_first: bool) -> bool {
+    /// a primary use case is periodic metric reporting, potentially from different threads
+    /// true if 'interval_time_ms' has elapsed since last time we returned true
+    /// except, if skip_first=false, false until 'interval_time_ms' has elapsed since this struct was created
+    pub fn should_update_ext(&self, interval_time_ms: u64, skip_first: bool) -> bool {
         let now = timestamp();
         let last = self.last_update.load(Ordering::Relaxed);
-        now.saturating_sub(last) > interval_time
+        now.saturating_sub(last) > interval_time_ms
             && self
                 .last_update
                 .compare_exchange(last, now, Ordering::Relaxed, Ordering::Relaxed)