Add monitoring for open file descriptor stats

This commit is contained in:
Haoran Yi 2022-11-23 13:58:33 -06:00 committed by HaoranYi
parent 703511c3e8
commit e1ba5a2a63
9 changed files with 57 additions and 0 deletions

1
Cargo.lock generated
View File

@ -5202,6 +5202,7 @@ dependencies = [
"matches",
"min-max-heap",
"num_enum",
"procfs",
"rand 0.7.3",
"rand_chacha 0.2.2",
"raptorq",

View File

@ -31,6 +31,7 @@ log = "0.4.17"
lru = "0.7.7"
min-max-heap = "1.3.0"
num_enum = "0.5.7"
procfs = "0.14.1"
rand = "0.7.0"
rand_chacha = "0.2.2"
rayon = "1.5.3"

View File

@ -7,6 +7,7 @@ use num_enum::{IntoPrimitive, TryFromPrimitive};
#[cfg(target_os = "linux")]
use std::{fs::File, io::BufReader};
use {
procfs::process::{LimitValue, Process},
solana_sdk::timing::AtomicInterval,
std::{
collections::HashMap,
@ -29,6 +30,7 @@ const SAMPLE_INTERVAL_OS_NETWORK_LIMITS_MS: u64 = MS_PER_H;
const SAMPLE_INTERVAL_MEM_MS: u64 = 5 * MS_PER_S;
const SAMPLE_INTERVAL_CPU_MS: u64 = 10 * MS_PER_S;
const SAMPLE_INTERVAL_DISK_MS: u64 = 5 * MS_PER_S;
const SAMPLE_INTERVAL_OPEN_FD_MS: u64 = 30 * MS_PER_S;
const SLEEP_INTERVAL: Duration = Duration::from_millis(500);
#[cfg(target_os = "linux")]
@ -392,6 +394,7 @@ impl SystemMonitorService {
report_os_network_stats: bool,
report_os_cpu_stats: bool,
report_os_disk_stats: bool,
report_os_open_fd_stats: bool,
) -> Self {
info!("Starting SystemMonitorService");
let thread_hdl = Builder::new()
@ -403,6 +406,7 @@ impl SystemMonitorService {
report_os_network_stats,
report_os_cpu_stats,
report_os_disk_stats,
report_os_open_fd_stats,
);
})
.unwrap();
@ -832,6 +836,40 @@ impl SystemMonitorService {
Self::report_cpuid_values();
}
fn get_open_fd_stats() -> Option<(usize, usize, usize)> {
let proc = Process::myself().ok()?;
let curr_num_open_fd = proc.fd_count().unwrap();
let max_open_fd_limit = proc.limits().unwrap().max_open_files;
let max_open_fd_soft_limit = match max_open_fd_limit.soft_limit {
LimitValue::Unlimited => usize::MAX,
LimitValue::Value(x) => x as usize,
};
let max_open_fd_hard_limit = match max_open_fd_limit.hard_limit {
LimitValue::Unlimited => usize::MAX,
LimitValue::Value(x) => x as usize,
};
Some((
curr_num_open_fd,
max_open_fd_soft_limit,
max_open_fd_hard_limit,
))
}
/// Emits an `open-fd-stats` datapoint with the current open-fd count and the
/// soft/hard open-file limits of this process.
///
/// Does nothing when `get_open_fd_stats` yields `None`.
fn report_open_fd_stats() {
    // Guard clause: nothing to report if the stats could not be collected.
    let (open_fds, soft_limit, hard_limit) = match Self::get_open_fd_stats() {
        Some(stats) => stats,
        None => return,
    };

    datapoint_info!(
        "open-fd-stats",
        ("number_open_files", open_fds, i64),
        ("max_open_files_hard_limit", hard_limit, i64),
        ("max_open_files_soft_limit", soft_limit, i64),
    );
}
#[cfg(target_os = "linux")]
fn process_disk_stats(disk_stats: &mut Option<DiskStats>) {
match read_disk_stats() {
@ -973,6 +1011,7 @@ impl SystemMonitorService {
report_os_network_stats: bool,
report_os_cpu_stats: bool,
report_os_disk_stats: bool,
report_os_open_fd_stats: bool,
) {
let mut udp_stats = None;
let mut disk_stats = None;
@ -981,6 +1020,7 @@ impl SystemMonitorService {
let mem_timer = AtomicInterval::default();
let cpu_timer = AtomicInterval::default();
let disk_timer = AtomicInterval::default();
let open_fd_timer = AtomicInterval::default();
loop {
if exit.load(Ordering::Relaxed) {
@ -1003,6 +1043,9 @@ impl SystemMonitorService {
if report_os_disk_stats && disk_timer.should_update(SAMPLE_INTERVAL_DISK_MS) {
Self::process_disk_stats(&mut disk_stats);
}
if report_os_open_fd_stats && open_fd_timer.should_update(SAMPLE_INTERVAL_OPEN_FD_MS) {
Self::report_open_fd_stats();
}
sleep(SLEEP_INTERVAL);
}
}

View File

@ -156,6 +156,7 @@ pub struct ValidatorConfig {
pub no_os_network_stats_reporting: bool,
pub no_os_cpu_stats_reporting: bool,
pub no_os_disk_stats_reporting: bool,
pub no_os_open_fd_stats_reporting: bool,
pub poh_pinned_cpu_core: usize,
pub poh_hashes_per_batch: u64,
pub process_ledger_before_services: bool,
@ -218,6 +219,7 @@ impl Default for ValidatorConfig {
no_os_network_stats_reporting: true,
no_os_cpu_stats_reporting: true,
no_os_disk_stats_reporting: true,
no_os_open_fd_stats_reporting: true,
poh_pinned_cpu_core: poh_service::DEFAULT_PINNED_CPU_CORE,
poh_hashes_per_batch: poh_service::DEFAULT_HASHES_PER_BATCH,
process_ledger_before_services: false,
@ -500,6 +502,7 @@ impl Validator {
!config.no_os_network_stats_reporting,
!config.no_os_cpu_stats_reporting,
!config.no_os_disk_stats_reporting,
!config.no_os_open_fd_stats_reporting,
));
let (poh_timing_point_sender, poh_timing_point_receiver) = unbounded();

View File

@ -2711,6 +2711,7 @@ fn main() {
false,
false,
false,
false,
);
accounts_index_config.index_limit_mb = if let Some(limit) =

View File

@ -46,6 +46,7 @@ pub fn safe_clone_config(config: &ValidatorConfig) -> ValidatorConfig {
no_os_network_stats_reporting: config.no_os_network_stats_reporting,
no_os_cpu_stats_reporting: config.no_os_cpu_stats_reporting,
no_os_disk_stats_reporting: config.no_os_disk_stats_reporting,
no_os_open_fd_stats_reporting: config.no_os_open_fd_stats_reporting,
poh_pinned_cpu_core: config.poh_pinned_cpu_core,
account_indexes: config.account_indexes.clone(),
accounts_db_caching_enabled: config.accounts_db_caching_enabled,

View File

@ -4491,6 +4491,7 @@ dependencies = [
"lru",
"min-max-heap",
"num_enum",
"procfs",
"rand 0.7.3",
"rand_chacha 0.2.2",
"rayon",

View File

@ -513,6 +513,11 @@ pub fn app<'a>(version: &'a str, default_args: &'a DefaultArgs) -> App<'a, 'a> {
.long("no-os-disk-stats-reporting")
.help("Disable reporting of OS disk statistics.")
)
.arg(
Arg::with_name("no_os_open_fd_stats_reporting")
.long("no-os-open-fd-stats-reporting")
.help("Disable reporting of open file descriptors statistics for current process.")
)
.arg(
Arg::with_name("accounts-hash-interval-slots")
.long("accounts-hash-interval-slots")

View File

@ -1141,6 +1141,7 @@ pub fn main() {
no_os_network_stats_reporting: matches.is_present("no_os_network_stats_reporting"),
no_os_cpu_stats_reporting: matches.is_present("no_os_cpu_stats_reporting"),
no_os_disk_stats_reporting: matches.is_present("no_os_disk_stats_reporting"),
no_os_open_fd_stats_reporting: matches.is_present("no_os_open_fd_stats_reporting"),
poh_pinned_cpu_core: value_of(&matches, "poh_pinned_cpu_core")
.unwrap_or(poh_service::DEFAULT_PINNED_CPU_CORE),
poh_hashes_per_batch: value_of(&matches, "poh_hashes_per_batch")