Add CPU metrics (#25802)

Add some CPU utilization metrics: number of vCPUs, clock frequency, average load over one-, five-, and fifteen-minute intervals, and total number of threads.
This commit is contained in:
Brennan Watt 2022-06-07 11:34:25 -07:00 committed by GitHub
parent d4e4871c47
commit ba04063956
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 65 additions and 2 deletions

View File

@ -12,6 +12,7 @@ use {
thread::{self, sleep, Builder, JoinHandle},
time::Duration,
},
sys_info::{Error, LoadAvg},
};
const MS_PER_S: u64 = 1_000;
@ -20,6 +21,7 @@ const MS_PER_H: u64 = MS_PER_M * 60;
// Sampling interval for UDP stats, in milliseconds (2 seconds).
const SAMPLE_INTERVAL_UDP_MS: u64 = 2 * MS_PER_S;
// Sampling interval for the OS network limits check, in milliseconds
// (MS_PER_H, presumably one hour -- MS_PER_M is defined outside this view).
const SAMPLE_INTERVAL_OS_NETWORK_LIMITS_MS: u64 = MS_PER_H;
// Sampling interval for memory stats, in milliseconds (1 second).
const SAMPLE_INTERVAL_MEM_MS: u64 = MS_PER_S;
// Sampling interval for CPU stats, in milliseconds (1 second).
const SAMPLE_INTERVAL_CPU_MS: u64 = MS_PER_S;
// Pause at the end of every monitor loop iteration in `run`; the sampling
// timers above are only checked once per iteration.
const SLEEP_INTERVAL: Duration = Duration::from_millis(500);
#[cfg(target_os = "linux")]
@ -41,6 +43,13 @@ struct UdpStats {
ignored_multi: usize,
}
/// Snapshot of host CPU statistics, filled in by `cpu_info` and emitted by
/// `report_cpu_stats` under the `cpu-stats` datapoint.
struct CpuInfo {
    /// Value returned by `sys_info::cpu_num`; reported as `cpu_num`.
    cpu_num: u32,
    /// Value returned by `sys_info::cpu_speed`; reported as `cpu0_freq_mhz`.
    cpu_freq_mhz: u64,
    /// Load averages returned by `sys_info::loadavg`; its `one`, `five` and
    /// `fifteen` fields are reported as separate metrics.
    load_avg: LoadAvg,
    /// Value returned by `sys_info::proc_total`; reported as
    /// `total_num_threads`.
    num_threads: u64,
}
impl UdpStats {
fn from_map(udp_stats: &HashMap<String, usize>) -> Self {
Self {
@ -121,12 +130,18 @@ impl SystemMonitorService {
exit: Arc<AtomicBool>,
report_os_memory_stats: bool,
report_os_network_stats: bool,
report_os_cpu_stats: bool,
) -> Self {
info!("Starting SystemMonitorService");
let thread_hdl = Builder::new()
.name("system-monitor".to_string())
.spawn(move || {
Self::run(exit, report_os_memory_stats, report_os_network_stats);
Self::run(
exit,
report_os_memory_stats,
report_os_network_stats,
report_os_cpu_stats,
);
})
.unwrap();
@ -335,11 +350,45 @@ impl SystemMonitorService {
}
}
pub fn run(exit: Arc<AtomicBool>, report_os_memory_stats: bool, report_os_network_stats: bool) {
/// Gathers the current CPU statistics from the `sys_info` crate.
///
/// The queries run in this order: `cpu_num`, `cpu_speed`, `loadavg`,
/// `proc_total`.
///
/// # Errors
///
/// Returns the first `sys_info::Error` produced by any of the queries; later
/// queries are not attempted once one fails.
fn cpu_info() -> Result<CpuInfo, Error> {
    // Field initializers are evaluated top to bottom, so the query order (and
    // therefore which error surfaces first) is exactly the order written here.
    Ok(CpuInfo {
        cpu_num: sys_info::cpu_num()?,
        cpu_freq_mhz: sys_info::cpu_speed()?,
        load_avg: sys_info::loadavg()?,
        num_threads: sys_info::proc_total()?,
    })
}
/// Emits one `cpu-stats` datapoint built from `Self::cpu_info()`.
///
/// Reporting is best effort: if the CPU information cannot be collected,
/// nothing is emitted and the error is discarded.
fn report_cpu_stats() {
    let stats = match Self::cpu_info() {
        Ok(stats) => stats,
        // No data available this round; skip the datapoint silently.
        Err(_) => return,
    };
    let load = &stats.load_avg;

    datapoint_info!(
        "cpu-stats",
        ("cpu_num", stats.cpu_num as i64, i64),
        ("cpu0_freq_mhz", stats.cpu_freq_mhz as i64, i64),
        ("average_load_one_minute", load.one, f64),
        ("average_load_five_minutes", load.five, f64),
        ("average_load_fifteen_minutes", load.fifteen, f64),
        ("total_num_threads", stats.num_threads as i64, i64),
    );
}
pub fn run(
exit: Arc<AtomicBool>,
report_os_memory_stats: bool,
report_os_network_stats: bool,
report_os_cpu_stats: bool,
) {
let mut udp_stats = None;
let network_limits_timer = AtomicInterval::default();
let udp_timer = AtomicInterval::default();
let mem_timer = AtomicInterval::default();
let cpu_timer = AtomicInterval::default();
loop {
if exit.load(Ordering::Relaxed) {
@ -356,6 +405,9 @@ impl SystemMonitorService {
if report_os_memory_stats && mem_timer.should_update(SAMPLE_INTERVAL_MEM_MS) {
Self::report_mem_stats();
}
if report_os_cpu_stats && cpu_timer.should_update(SAMPLE_INTERVAL_CPU_MS) {
Self::report_cpu_stats();
}
sleep(SLEEP_INTERVAL);
}
}

View File

@ -159,6 +159,7 @@ pub struct ValidatorConfig {
pub no_poh_speed_test: bool,
pub no_os_memory_stats_reporting: bool,
pub no_os_network_stats_reporting: bool,
pub no_os_cpu_stats_reporting: bool,
pub poh_pinned_cpu_core: usize,
pub poh_hashes_per_batch: u64,
pub account_indexes: AccountSecondaryIndexes,
@ -219,6 +220,7 @@ impl Default for ValidatorConfig {
no_poh_speed_test: true,
no_os_memory_stats_reporting: true,
no_os_network_stats_reporting: true,
no_os_cpu_stats_reporting: true,
poh_pinned_cpu_core: poh_service::DEFAULT_PINNED_CPU_CORE,
poh_hashes_per_batch: poh_service::DEFAULT_HASHES_PER_BATCH,
account_indexes: AccountSecondaryIndexes::default(),
@ -497,6 +499,7 @@ impl Validator {
Arc::clone(&exit),
!config.no_os_memory_stats_reporting,
!config.no_os_network_stats_reporting,
!config.no_os_cpu_stats_reporting,
));
let (poh_timing_point_sender, poh_timing_point_receiver) = unbounded();

View File

@ -2182,6 +2182,7 @@ fn main() {
Arc::clone(&exit_signal),
!no_os_memory_stats_reporting,
false,
false,
);
accounts_index_config.index_limit_mb = if let Some(limit) =

View File

@ -47,6 +47,7 @@ pub fn safe_clone_config(config: &ValidatorConfig) -> ValidatorConfig {
no_poh_speed_test: config.no_poh_speed_test,
no_os_memory_stats_reporting: config.no_os_memory_stats_reporting,
no_os_network_stats_reporting: config.no_os_network_stats_reporting,
no_os_cpu_stats_reporting: config.no_os_cpu_stats_reporting,
poh_pinned_cpu_core: config.poh_pinned_cpu_core,
account_indexes: config.account_indexes.clone(),
accounts_db_caching_enabled: config.accounts_db_caching_enabled,

View File

@ -965,6 +965,11 @@ pub fn main() {
.long("no-os-network-stats-reporting")
.help("Disable reporting of OS network statistics.")
)
.arg(
Arg::with_name("no_os_cpu_stats_reporting")
.long("no-os-cpu-stats-reporting")
.help("Disable reporting of OS CPU statistics.")
)
.arg(
Arg::with_name("accounts-hash-interval-slots")
.long("accounts-hash-interval-slots")
@ -2529,6 +2534,7 @@ pub fn main() {
no_poh_speed_test: matches.is_present("no_poh_speed_test"),
no_os_memory_stats_reporting: matches.is_present("no_os_memory_stats_reporting"),
no_os_network_stats_reporting: matches.is_present("no_os_network_stats_reporting"),
no_os_cpu_stats_reporting: matches.is_present("no_os_cpu_stats_reporting"),
poh_pinned_cpu_core: value_of(&matches, "poh_pinned_cpu_core")
.unwrap_or(poh_service::DEFAULT_PINNED_CPU_CORE),
poh_hashes_per_batch: value_of(&matches, "poh_hashes_per_batch")