Add CPU metrics (#25802)
Add CPU utilization metrics: the number of vCPUs, the clock frequency, the average load over different time intervals, and the total number of threads.
This commit is contained in:
parent
d4e4871c47
commit
ba04063956
|
@ -12,6 +12,7 @@ use {
|
||||||
thread::{self, sleep, Builder, JoinHandle},
|
thread::{self, sleep, Builder, JoinHandle},
|
||||||
time::Duration,
|
time::Duration,
|
||||||
},
|
},
|
||||||
|
sys_info::{Error, LoadAvg},
|
||||||
};
|
};
|
||||||
|
|
||||||
const MS_PER_S: u64 = 1_000;
|
const MS_PER_S: u64 = 1_000;
|
||||||
|
@ -20,6 +21,7 @@ const MS_PER_H: u64 = MS_PER_M * 60;
|
||||||
const SAMPLE_INTERVAL_UDP_MS: u64 = 2 * MS_PER_S;
|
const SAMPLE_INTERVAL_UDP_MS: u64 = 2 * MS_PER_S;
|
||||||
const SAMPLE_INTERVAL_OS_NETWORK_LIMITS_MS: u64 = MS_PER_H;
|
const SAMPLE_INTERVAL_OS_NETWORK_LIMITS_MS: u64 = MS_PER_H;
|
||||||
const SAMPLE_INTERVAL_MEM_MS: u64 = MS_PER_S;
|
const SAMPLE_INTERVAL_MEM_MS: u64 = MS_PER_S;
|
||||||
|
const SAMPLE_INTERVAL_CPU_MS: u64 = MS_PER_S;
|
||||||
const SLEEP_INTERVAL: Duration = Duration::from_millis(500);
|
const SLEEP_INTERVAL: Duration = Duration::from_millis(500);
|
||||||
|
|
||||||
#[cfg(target_os = "linux")]
|
#[cfg(target_os = "linux")]
|
||||||
|
@ -41,6 +43,13 @@ struct UdpStats {
|
||||||
ignored_multi: usize,
|
ignored_multi: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
struct CpuInfo {
|
||||||
|
cpu_num: u32,
|
||||||
|
cpu_freq_mhz: u64,
|
||||||
|
load_avg: LoadAvg,
|
||||||
|
num_threads: u64,
|
||||||
|
}
|
||||||
|
|
||||||
impl UdpStats {
|
impl UdpStats {
|
||||||
fn from_map(udp_stats: &HashMap<String, usize>) -> Self {
|
fn from_map(udp_stats: &HashMap<String, usize>) -> Self {
|
||||||
Self {
|
Self {
|
||||||
|
@ -121,12 +130,18 @@ impl SystemMonitorService {
|
||||||
exit: Arc<AtomicBool>,
|
exit: Arc<AtomicBool>,
|
||||||
report_os_memory_stats: bool,
|
report_os_memory_stats: bool,
|
||||||
report_os_network_stats: bool,
|
report_os_network_stats: bool,
|
||||||
|
report_os_cpu_stats: bool,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
info!("Starting SystemMonitorService");
|
info!("Starting SystemMonitorService");
|
||||||
let thread_hdl = Builder::new()
|
let thread_hdl = Builder::new()
|
||||||
.name("system-monitor".to_string())
|
.name("system-monitor".to_string())
|
||||||
.spawn(move || {
|
.spawn(move || {
|
||||||
Self::run(exit, report_os_memory_stats, report_os_network_stats);
|
Self::run(
|
||||||
|
exit,
|
||||||
|
report_os_memory_stats,
|
||||||
|
report_os_network_stats,
|
||||||
|
report_os_cpu_stats,
|
||||||
|
);
|
||||||
})
|
})
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
|
@ -335,11 +350,45 @@ impl SystemMonitorService {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn run(exit: Arc<AtomicBool>, report_os_memory_stats: bool, report_os_network_stats: bool) {
|
fn cpu_info() -> Result<CpuInfo, Error> {
|
||||||
|
let cpu_num = sys_info::cpu_num()?;
|
||||||
|
let cpu_freq_mhz = sys_info::cpu_speed()?;
|
||||||
|
let load_avg = sys_info::loadavg()?;
|
||||||
|
let num_threads = sys_info::proc_total()?;
|
||||||
|
|
||||||
|
Ok(CpuInfo {
|
||||||
|
cpu_num,
|
||||||
|
cpu_freq_mhz,
|
||||||
|
load_avg,
|
||||||
|
num_threads,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn report_cpu_stats() {
|
||||||
|
if let Ok(info) = Self::cpu_info() {
|
||||||
|
datapoint_info!(
|
||||||
|
"cpu-stats",
|
||||||
|
("cpu_num", info.cpu_num as i64, i64),
|
||||||
|
("cpu0_freq_mhz", info.cpu_freq_mhz as i64, i64),
|
||||||
|
("average_load_one_minute", info.load_avg.one, f64),
|
||||||
|
("average_load_five_minutes", info.load_avg.five, f64),
|
||||||
|
("average_load_fifteen_minutes", info.load_avg.fifteen, f64),
|
||||||
|
("total_num_threads", info.num_threads as i64, i64),
|
||||||
|
)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn run(
|
||||||
|
exit: Arc<AtomicBool>,
|
||||||
|
report_os_memory_stats: bool,
|
||||||
|
report_os_network_stats: bool,
|
||||||
|
report_os_cpu_stats: bool,
|
||||||
|
) {
|
||||||
let mut udp_stats = None;
|
let mut udp_stats = None;
|
||||||
let network_limits_timer = AtomicInterval::default();
|
let network_limits_timer = AtomicInterval::default();
|
||||||
let udp_timer = AtomicInterval::default();
|
let udp_timer = AtomicInterval::default();
|
||||||
let mem_timer = AtomicInterval::default();
|
let mem_timer = AtomicInterval::default();
|
||||||
|
let cpu_timer = AtomicInterval::default();
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
if exit.load(Ordering::Relaxed) {
|
if exit.load(Ordering::Relaxed) {
|
||||||
|
@ -356,6 +405,9 @@ impl SystemMonitorService {
|
||||||
if report_os_memory_stats && mem_timer.should_update(SAMPLE_INTERVAL_MEM_MS) {
|
if report_os_memory_stats && mem_timer.should_update(SAMPLE_INTERVAL_MEM_MS) {
|
||||||
Self::report_mem_stats();
|
Self::report_mem_stats();
|
||||||
}
|
}
|
||||||
|
if report_os_cpu_stats && cpu_timer.should_update(SAMPLE_INTERVAL_CPU_MS) {
|
||||||
|
Self::report_cpu_stats();
|
||||||
|
}
|
||||||
sleep(SLEEP_INTERVAL);
|
sleep(SLEEP_INTERVAL);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -159,6 +159,7 @@ pub struct ValidatorConfig {
|
||||||
pub no_poh_speed_test: bool,
|
pub no_poh_speed_test: bool,
|
||||||
pub no_os_memory_stats_reporting: bool,
|
pub no_os_memory_stats_reporting: bool,
|
||||||
pub no_os_network_stats_reporting: bool,
|
pub no_os_network_stats_reporting: bool,
|
||||||
|
pub no_os_cpu_stats_reporting: bool,
|
||||||
pub poh_pinned_cpu_core: usize,
|
pub poh_pinned_cpu_core: usize,
|
||||||
pub poh_hashes_per_batch: u64,
|
pub poh_hashes_per_batch: u64,
|
||||||
pub account_indexes: AccountSecondaryIndexes,
|
pub account_indexes: AccountSecondaryIndexes,
|
||||||
|
@ -219,6 +220,7 @@ impl Default for ValidatorConfig {
|
||||||
no_poh_speed_test: true,
|
no_poh_speed_test: true,
|
||||||
no_os_memory_stats_reporting: true,
|
no_os_memory_stats_reporting: true,
|
||||||
no_os_network_stats_reporting: true,
|
no_os_network_stats_reporting: true,
|
||||||
|
no_os_cpu_stats_reporting: true,
|
||||||
poh_pinned_cpu_core: poh_service::DEFAULT_PINNED_CPU_CORE,
|
poh_pinned_cpu_core: poh_service::DEFAULT_PINNED_CPU_CORE,
|
||||||
poh_hashes_per_batch: poh_service::DEFAULT_HASHES_PER_BATCH,
|
poh_hashes_per_batch: poh_service::DEFAULT_HASHES_PER_BATCH,
|
||||||
account_indexes: AccountSecondaryIndexes::default(),
|
account_indexes: AccountSecondaryIndexes::default(),
|
||||||
|
@ -497,6 +499,7 @@ impl Validator {
|
||||||
Arc::clone(&exit),
|
Arc::clone(&exit),
|
||||||
!config.no_os_memory_stats_reporting,
|
!config.no_os_memory_stats_reporting,
|
||||||
!config.no_os_network_stats_reporting,
|
!config.no_os_network_stats_reporting,
|
||||||
|
!config.no_os_cpu_stats_reporting,
|
||||||
));
|
));
|
||||||
|
|
||||||
let (poh_timing_point_sender, poh_timing_point_receiver) = unbounded();
|
let (poh_timing_point_sender, poh_timing_point_receiver) = unbounded();
|
||||||
|
|
|
@ -2182,6 +2182,7 @@ fn main() {
|
||||||
Arc::clone(&exit_signal),
|
Arc::clone(&exit_signal),
|
||||||
!no_os_memory_stats_reporting,
|
!no_os_memory_stats_reporting,
|
||||||
false,
|
false,
|
||||||
|
false,
|
||||||
);
|
);
|
||||||
|
|
||||||
accounts_index_config.index_limit_mb = if let Some(limit) =
|
accounts_index_config.index_limit_mb = if let Some(limit) =
|
||||||
|
|
|
@ -47,6 +47,7 @@ pub fn safe_clone_config(config: &ValidatorConfig) -> ValidatorConfig {
|
||||||
no_poh_speed_test: config.no_poh_speed_test,
|
no_poh_speed_test: config.no_poh_speed_test,
|
||||||
no_os_memory_stats_reporting: config.no_os_memory_stats_reporting,
|
no_os_memory_stats_reporting: config.no_os_memory_stats_reporting,
|
||||||
no_os_network_stats_reporting: config.no_os_network_stats_reporting,
|
no_os_network_stats_reporting: config.no_os_network_stats_reporting,
|
||||||
|
no_os_cpu_stats_reporting: config.no_os_cpu_stats_reporting,
|
||||||
poh_pinned_cpu_core: config.poh_pinned_cpu_core,
|
poh_pinned_cpu_core: config.poh_pinned_cpu_core,
|
||||||
account_indexes: config.account_indexes.clone(),
|
account_indexes: config.account_indexes.clone(),
|
||||||
accounts_db_caching_enabled: config.accounts_db_caching_enabled,
|
accounts_db_caching_enabled: config.accounts_db_caching_enabled,
|
||||||
|
|
|
@ -965,6 +965,11 @@ pub fn main() {
|
||||||
.long("no-os-network-stats-reporting")
|
.long("no-os-network-stats-reporting")
|
||||||
.help("Disable reporting of OS network statistics.")
|
.help("Disable reporting of OS network statistics.")
|
||||||
)
|
)
|
||||||
|
.arg(
|
||||||
|
Arg::with_name("no_os_cpu_stats_reporting")
|
||||||
|
.long("no-os-cpu-stats-reporting")
|
||||||
|
.help("Disable reporting of OS CPU statistics.")
|
||||||
|
)
|
||||||
.arg(
|
.arg(
|
||||||
Arg::with_name("accounts-hash-interval-slots")
|
Arg::with_name("accounts-hash-interval-slots")
|
||||||
.long("accounts-hash-interval-slots")
|
.long("accounts-hash-interval-slots")
|
||||||
|
@ -2529,6 +2534,7 @@ pub fn main() {
|
||||||
no_poh_speed_test: matches.is_present("no_poh_speed_test"),
|
no_poh_speed_test: matches.is_present("no_poh_speed_test"),
|
||||||
no_os_memory_stats_reporting: matches.is_present("no_os_memory_stats_reporting"),
|
no_os_memory_stats_reporting: matches.is_present("no_os_memory_stats_reporting"),
|
||||||
no_os_network_stats_reporting: matches.is_present("no_os_network_stats_reporting"),
|
no_os_network_stats_reporting: matches.is_present("no_os_network_stats_reporting"),
|
||||||
|
no_os_cpu_stats_reporting: matches.is_present("no_os_cpu_stats_reporting"),
|
||||||
poh_pinned_cpu_core: value_of(&matches, "poh_pinned_cpu_core")
|
poh_pinned_cpu_core: value_of(&matches, "poh_pinned_cpu_core")
|
||||||
.unwrap_or(poh_service::DEFAULT_PINNED_CPU_CORE),
|
.unwrap_or(poh_service::DEFAULT_PINNED_CPU_CORE),
|
||||||
poh_hashes_per_batch: value_of(&matches, "poh_hashes_per_batch")
|
poh_hashes_per_batch: value_of(&matches, "poh_hashes_per_batch")
|
||||||
|
|
Loading…
Reference in New Issue