From e1ba5a2a634337d5d2a56ec747becf16d287c28a Mon Sep 17 00:00:00 2001 From: Haoran Yi Date: Wed, 23 Nov 2022 13:58:33 -0600 Subject: [PATCH] add monitoring for open file descriptors stat --- Cargo.lock | 1 + core/Cargo.toml | 1 + core/src/system_monitor_service.rs | 43 ++++++++++++++++++++++++++ core/src/validator.rs | 3 ++ ledger-tool/src/main.rs | 1 + local-cluster/src/validator_configs.rs | 1 + programs/sbf/Cargo.lock | 1 + validator/src/cli.rs | 5 +++ validator/src/main.rs | 1 + 9 files changed, 57 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 55838e117b..0bd2719eca 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5202,6 +5202,7 @@ dependencies = [ "matches", "min-max-heap", "num_enum", + "procfs", "rand 0.7.3", "rand_chacha 0.2.2", "raptorq", diff --git a/core/Cargo.toml b/core/Cargo.toml index 8829d4b950..5c42d5f615 100644 --- a/core/Cargo.toml +++ b/core/Cargo.toml @@ -31,6 +31,7 @@ log = "0.4.17" lru = "0.7.7" min-max-heap = "1.3.0" num_enum = "0.5.7" +procfs = "0.14.1" rand = "0.7.0" rand_chacha = "0.2.2" rayon = "1.5.3" diff --git a/core/src/system_monitor_service.rs b/core/src/system_monitor_service.rs index 8806b62bc8..9911597dce 100644 --- a/core/src/system_monitor_service.rs +++ b/core/src/system_monitor_service.rs @@ -7,6 +7,7 @@ use num_enum::{IntoPrimitive, TryFromPrimitive}; #[cfg(target_os = "linux")] use std::{fs::File, io::BufReader}; use { + procfs::process::{LimitValue, Process}, solana_sdk::timing::AtomicInterval, std::{ collections::HashMap, @@ -29,6 +30,7 @@ const SAMPLE_INTERVAL_OS_NETWORK_LIMITS_MS: u64 = MS_PER_H; const SAMPLE_INTERVAL_MEM_MS: u64 = 5 * MS_PER_S; const SAMPLE_INTERVAL_CPU_MS: u64 = 10 * MS_PER_S; const SAMPLE_INTERVAL_DISK_MS: u64 = 5 * MS_PER_S; +const SAMPLE_INTERVAL_OPEN_FD_MS: u64 = 30 * MS_PER_S; const SLEEP_INTERVAL: Duration = Duration::from_millis(500); #[cfg(target_os = "linux")] @@ -392,6 +394,7 @@ impl SystemMonitorService { report_os_network_stats: bool, report_os_cpu_stats: bool, 
report_os_disk_stats: bool,
+        report_os_open_fd_stats: bool,
     ) -> Self {
         info!("Starting SystemMonitorService");
         let thread_hdl = Builder::new()
@@ -403,6 +406,7 @@ impl SystemMonitorService {
                     report_os_network_stats,
                     report_os_cpu_stats,
                     report_os_disk_stats,
+                    report_os_open_fd_stats,
                 );
             })
             .unwrap();
@@ -832,6 +836,40 @@ impl SystemMonitorService {
         Self::report_cpuid_values();
     }
 
+    /// Returns `(open_fds, soft_limit, hard_limit)` for this process as read through procfs,
+    /// with an unlimited limit mapped to `usize::MAX`. Returns `None` if any lookup fails,
+    /// so a bad sample is skipped rather than panicking the monitoring thread.
+    fn get_open_fd_stats() -> Option<(usize, usize, usize)> {
+        let proc = Process::myself().ok()?;
+        let curr_num_open_fd = proc.fd_count().ok()?;
+        let max_open_fd_limit = proc.limits().ok()?.max_open_files;
+
+        let limit_to_count = |limit: LimitValue| match limit {
+            LimitValue::Unlimited => usize::MAX,
+            LimitValue::Value(x) => x as usize,
+        };
+
+        Some((
+            curr_num_open_fd,
+            limit_to_count(max_open_fd_limit.soft_limit),
+            limit_to_count(max_open_fd_limit.hard_limit),
+        ))
+    }
+
+    /// Submits the "open-fd-stats" datapoint; does nothing when the stats are unavailable.
+    fn report_open_fd_stats() {
+        if let Some((curr_num_open_fd, max_open_fd_soft_limit, max_open_fd_hard_limit)) =
+            Self::get_open_fd_stats()
+        {
+            datapoint_info!(
+                "open-fd-stats",
+                ("number_open_files", curr_num_open_fd, i64),
+                ("max_open_files_hard_limit", max_open_fd_hard_limit, i64),
+                ("max_open_files_soft_limit", max_open_fd_soft_limit, i64),
+            );
+        }
+    }
+
     #[cfg(target_os = "linux")]
     fn process_disk_stats(disk_stats: &mut Option) {
         match read_disk_stats() {
@@ -973,6 +1011,7 @@ impl SystemMonitorService {
         report_os_network_stats: bool,
         report_os_cpu_stats: bool,
         report_os_disk_stats: bool,
+        report_os_open_fd_stats: bool,
     ) {
         let mut udp_stats = None;
         let mut disk_stats = None;
@@ -981,6 +1020,7 @@ impl SystemMonitorService {
         let mem_timer = AtomicInterval::default();
         let cpu_timer = AtomicInterval::default();
         let disk_timer = AtomicInterval::default();
+        let open_fd_timer = AtomicInterval::default();
 
         loop {
             if exit.load(Ordering::Relaxed) {
@@ -1003,6 +1043,9 @@ impl
SystemMonitorService { if report_os_disk_stats && disk_timer.should_update(SAMPLE_INTERVAL_DISK_MS) { Self::process_disk_stats(&mut disk_stats); } + if report_os_open_fd_stats && open_fd_timer.should_update(SAMPLE_INTERVAL_OPEN_FD_MS) { + Self::report_open_fd_stats(); + } sleep(SLEEP_INTERVAL); } } diff --git a/core/src/validator.rs b/core/src/validator.rs index 36a5cf334f..60f7447234 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -156,6 +156,7 @@ pub struct ValidatorConfig { pub no_os_network_stats_reporting: bool, pub no_os_cpu_stats_reporting: bool, pub no_os_disk_stats_reporting: bool, + pub no_os_open_fd_stats_reporting: bool, pub poh_pinned_cpu_core: usize, pub poh_hashes_per_batch: u64, pub process_ledger_before_services: bool, @@ -218,6 +219,7 @@ impl Default for ValidatorConfig { no_os_network_stats_reporting: true, no_os_cpu_stats_reporting: true, no_os_disk_stats_reporting: true, + no_os_open_fd_stats_reporting: true, poh_pinned_cpu_core: poh_service::DEFAULT_PINNED_CPU_CORE, poh_hashes_per_batch: poh_service::DEFAULT_HASHES_PER_BATCH, process_ledger_before_services: false, @@ -500,6 +502,7 @@ impl Validator { !config.no_os_network_stats_reporting, !config.no_os_cpu_stats_reporting, !config.no_os_disk_stats_reporting, + !config.no_os_open_fd_stats_reporting, )); let (poh_timing_point_sender, poh_timing_point_receiver) = unbounded(); diff --git a/ledger-tool/src/main.rs b/ledger-tool/src/main.rs index 0da06d7734..c2638ceb0c 100644 --- a/ledger-tool/src/main.rs +++ b/ledger-tool/src/main.rs @@ -2711,6 +2711,7 @@ fn main() { false, false, false, + false, ); accounts_index_config.index_limit_mb = if let Some(limit) = diff --git a/local-cluster/src/validator_configs.rs b/local-cluster/src/validator_configs.rs index 88a6977a48..eda2952f1f 100644 --- a/local-cluster/src/validator_configs.rs +++ b/local-cluster/src/validator_configs.rs @@ -46,6 +46,7 @@ pub fn safe_clone_config(config: &ValidatorConfig) -> ValidatorConfig { 
no_os_network_stats_reporting: config.no_os_network_stats_reporting, no_os_cpu_stats_reporting: config.no_os_cpu_stats_reporting, no_os_disk_stats_reporting: config.no_os_disk_stats_reporting, + no_os_open_fd_stats_reporting: config.no_os_open_fd_stats_reporting, poh_pinned_cpu_core: config.poh_pinned_cpu_core, account_indexes: config.account_indexes.clone(), accounts_db_caching_enabled: config.accounts_db_caching_enabled, diff --git a/programs/sbf/Cargo.lock b/programs/sbf/Cargo.lock index cf3968139f..9e3a973f07 100644 --- a/programs/sbf/Cargo.lock +++ b/programs/sbf/Cargo.lock @@ -4491,6 +4491,7 @@ dependencies = [ "lru", "min-max-heap", "num_enum", + "procfs", "rand 0.7.3", "rand_chacha 0.2.2", "rayon", diff --git a/validator/src/cli.rs b/validator/src/cli.rs index af919e5374..3d050ab92a 100644 --- a/validator/src/cli.rs +++ b/validator/src/cli.rs @@ -513,6 +513,11 @@ pub fn app<'a>(version: &'a str, default_args: &'a DefaultArgs) -> App<'a, 'a> { .long("no-os-disk-stats-reporting") .help("Disable reporting of OS disk statistics.") ) + .arg( + Arg::with_name("no_os_open_fd_stats_reporting") + .long("no-os-open-fd-stats-reporting") + .help("Disable reporting of open file descriptors statistics for current process.") + ) .arg( Arg::with_name("accounts-hash-interval-slots") .long("accounts-hash-interval-slots") diff --git a/validator/src/main.rs b/validator/src/main.rs index b1f07fdb81..be3d77ef47 100644 --- a/validator/src/main.rs +++ b/validator/src/main.rs @@ -1141,6 +1141,7 @@ pub fn main() { no_os_network_stats_reporting: matches.is_present("no_os_network_stats_reporting"), no_os_cpu_stats_reporting: matches.is_present("no_os_cpu_stats_reporting"), no_os_disk_stats_reporting: matches.is_present("no_os_disk_stats_reporting"), + no_os_open_fd_stats_reporting: matches.is_present("no_os_open_fd_stats_reporting"), poh_pinned_cpu_core: value_of(&matches, "poh_pinned_cpu_core") .unwrap_or(poh_service::DEFAULT_PINNED_CPU_CORE), poh_hashes_per_batch: value_of(&matches, 
"poh_hashes_per_batch")