Add monitoring for open file descriptor stats
This commit is contained in:
parent
703511c3e8
commit
e1ba5a2a63
|
@ -5202,6 +5202,7 @@ dependencies = [
|
||||||
"matches",
|
"matches",
|
||||||
"min-max-heap",
|
"min-max-heap",
|
||||||
"num_enum",
|
"num_enum",
|
||||||
|
"procfs",
|
||||||
"rand 0.7.3",
|
"rand 0.7.3",
|
||||||
"rand_chacha 0.2.2",
|
"rand_chacha 0.2.2",
|
||||||
"raptorq",
|
"raptorq",
|
||||||
|
|
|
@ -31,6 +31,7 @@ log = "0.4.17"
|
||||||
lru = "0.7.7"
|
lru = "0.7.7"
|
||||||
min-max-heap = "1.3.0"
|
min-max-heap = "1.3.0"
|
||||||
num_enum = "0.5.7"
|
num_enum = "0.5.7"
|
||||||
|
procfs = "0.14.1"
|
||||||
rand = "0.7.0"
|
rand = "0.7.0"
|
||||||
rand_chacha = "0.2.2"
|
rand_chacha = "0.2.2"
|
||||||
rayon = "1.5.3"
|
rayon = "1.5.3"
|
||||||
|
|
|
@ -7,6 +7,7 @@ use num_enum::{IntoPrimitive, TryFromPrimitive};
|
||||||
#[cfg(target_os = "linux")]
|
#[cfg(target_os = "linux")]
|
||||||
use std::{fs::File, io::BufReader};
|
use std::{fs::File, io::BufReader};
|
||||||
use {
|
use {
|
||||||
|
procfs::process::{LimitValue, Process},
|
||||||
solana_sdk::timing::AtomicInterval,
|
solana_sdk::timing::AtomicInterval,
|
||||||
std::{
|
std::{
|
||||||
collections::HashMap,
|
collections::HashMap,
|
||||||
|
@ -29,6 +30,7 @@ const SAMPLE_INTERVAL_OS_NETWORK_LIMITS_MS: u64 = MS_PER_H;
|
||||||
const SAMPLE_INTERVAL_MEM_MS: u64 = 5 * MS_PER_S;
|
const SAMPLE_INTERVAL_MEM_MS: u64 = 5 * MS_PER_S;
|
||||||
const SAMPLE_INTERVAL_CPU_MS: u64 = 10 * MS_PER_S;
|
const SAMPLE_INTERVAL_CPU_MS: u64 = 10 * MS_PER_S;
|
||||||
const SAMPLE_INTERVAL_DISK_MS: u64 = 5 * MS_PER_S;
|
const SAMPLE_INTERVAL_DISK_MS: u64 = 5 * MS_PER_S;
|
||||||
|
const SAMPLE_INTERVAL_OPEN_FD_MS: u64 = 30 * MS_PER_S;
|
||||||
const SLEEP_INTERVAL: Duration = Duration::from_millis(500);
|
const SLEEP_INTERVAL: Duration = Duration::from_millis(500);
|
||||||
|
|
||||||
#[cfg(target_os = "linux")]
|
#[cfg(target_os = "linux")]
|
||||||
|
@ -392,6 +394,7 @@ impl SystemMonitorService {
|
||||||
report_os_network_stats: bool,
|
report_os_network_stats: bool,
|
||||||
report_os_cpu_stats: bool,
|
report_os_cpu_stats: bool,
|
||||||
report_os_disk_stats: bool,
|
report_os_disk_stats: bool,
|
||||||
|
report_os_open_fd_stats: bool,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
info!("Starting SystemMonitorService");
|
info!("Starting SystemMonitorService");
|
||||||
let thread_hdl = Builder::new()
|
let thread_hdl = Builder::new()
|
||||||
|
@ -403,6 +406,7 @@ impl SystemMonitorService {
|
||||||
report_os_network_stats,
|
report_os_network_stats,
|
||||||
report_os_cpu_stats,
|
report_os_cpu_stats,
|
||||||
report_os_disk_stats,
|
report_os_disk_stats,
|
||||||
|
report_os_open_fd_stats,
|
||||||
);
|
);
|
||||||
})
|
})
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
@ -832,6 +836,40 @@ impl SystemMonitorService {
|
||||||
Self::report_cpuid_values();
|
Self::report_cpuid_values();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn get_open_fd_stats() -> Option<(usize, usize, usize)> {
|
||||||
|
let proc = Process::myself().ok()?;
|
||||||
|
let curr_num_open_fd = proc.fd_count().unwrap();
|
||||||
|
let max_open_fd_limit = proc.limits().unwrap().max_open_files;
|
||||||
|
|
||||||
|
let max_open_fd_soft_limit = match max_open_fd_limit.soft_limit {
|
||||||
|
LimitValue::Unlimited => usize::MAX,
|
||||||
|
LimitValue::Value(x) => x as usize,
|
||||||
|
};
|
||||||
|
let max_open_fd_hard_limit = match max_open_fd_limit.hard_limit {
|
||||||
|
LimitValue::Unlimited => usize::MAX,
|
||||||
|
LimitValue::Value(x) => x as usize,
|
||||||
|
};
|
||||||
|
|
||||||
|
Some((
|
||||||
|
curr_num_open_fd,
|
||||||
|
max_open_fd_soft_limit,
|
||||||
|
max_open_fd_hard_limit,
|
||||||
|
))
|
||||||
|
}
|
||||||
|
|
||||||
|
fn report_open_fd_stats() {
|
||||||
|
if let Some((curr_num_open_fd, max_open_fd_soft_limit, max_open_fd_hard_limit)) =
|
||||||
|
Self::get_open_fd_stats()
|
||||||
|
{
|
||||||
|
datapoint_info!(
|
||||||
|
"open-fd-stats",
|
||||||
|
("number_open_files", curr_num_open_fd, i64),
|
||||||
|
("max_open_files_hard_limit", max_open_fd_hard_limit, i64),
|
||||||
|
("max_open_files_soft_limit", max_open_fd_soft_limit, i64),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(target_os = "linux")]
|
#[cfg(target_os = "linux")]
|
||||||
fn process_disk_stats(disk_stats: &mut Option<DiskStats>) {
|
fn process_disk_stats(disk_stats: &mut Option<DiskStats>) {
|
||||||
match read_disk_stats() {
|
match read_disk_stats() {
|
||||||
|
@ -973,6 +1011,7 @@ impl SystemMonitorService {
|
||||||
report_os_network_stats: bool,
|
report_os_network_stats: bool,
|
||||||
report_os_cpu_stats: bool,
|
report_os_cpu_stats: bool,
|
||||||
report_os_disk_stats: bool,
|
report_os_disk_stats: bool,
|
||||||
|
report_os_open_fd_stats: bool,
|
||||||
) {
|
) {
|
||||||
let mut udp_stats = None;
|
let mut udp_stats = None;
|
||||||
let mut disk_stats = None;
|
let mut disk_stats = None;
|
||||||
|
@ -981,6 +1020,7 @@ impl SystemMonitorService {
|
||||||
let mem_timer = AtomicInterval::default();
|
let mem_timer = AtomicInterval::default();
|
||||||
let cpu_timer = AtomicInterval::default();
|
let cpu_timer = AtomicInterval::default();
|
||||||
let disk_timer = AtomicInterval::default();
|
let disk_timer = AtomicInterval::default();
|
||||||
|
let open_fd_timer = AtomicInterval::default();
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
if exit.load(Ordering::Relaxed) {
|
if exit.load(Ordering::Relaxed) {
|
||||||
|
@ -1003,6 +1043,9 @@ impl SystemMonitorService {
|
||||||
if report_os_disk_stats && disk_timer.should_update(SAMPLE_INTERVAL_DISK_MS) {
|
if report_os_disk_stats && disk_timer.should_update(SAMPLE_INTERVAL_DISK_MS) {
|
||||||
Self::process_disk_stats(&mut disk_stats);
|
Self::process_disk_stats(&mut disk_stats);
|
||||||
}
|
}
|
||||||
|
if report_os_open_fd_stats && open_fd_timer.should_update(SAMPLE_INTERVAL_OPEN_FD_MS) {
|
||||||
|
Self::report_open_fd_stats();
|
||||||
|
}
|
||||||
sleep(SLEEP_INTERVAL);
|
sleep(SLEEP_INTERVAL);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -156,6 +156,7 @@ pub struct ValidatorConfig {
|
||||||
pub no_os_network_stats_reporting: bool,
|
pub no_os_network_stats_reporting: bool,
|
||||||
pub no_os_cpu_stats_reporting: bool,
|
pub no_os_cpu_stats_reporting: bool,
|
||||||
pub no_os_disk_stats_reporting: bool,
|
pub no_os_disk_stats_reporting: bool,
|
||||||
|
pub no_os_open_fd_stats_reporting: bool,
|
||||||
pub poh_pinned_cpu_core: usize,
|
pub poh_pinned_cpu_core: usize,
|
||||||
pub poh_hashes_per_batch: u64,
|
pub poh_hashes_per_batch: u64,
|
||||||
pub process_ledger_before_services: bool,
|
pub process_ledger_before_services: bool,
|
||||||
|
@ -218,6 +219,7 @@ impl Default for ValidatorConfig {
|
||||||
no_os_network_stats_reporting: true,
|
no_os_network_stats_reporting: true,
|
||||||
no_os_cpu_stats_reporting: true,
|
no_os_cpu_stats_reporting: true,
|
||||||
no_os_disk_stats_reporting: true,
|
no_os_disk_stats_reporting: true,
|
||||||
|
no_os_open_fd_stats_reporting: true,
|
||||||
poh_pinned_cpu_core: poh_service::DEFAULT_PINNED_CPU_CORE,
|
poh_pinned_cpu_core: poh_service::DEFAULT_PINNED_CPU_CORE,
|
||||||
poh_hashes_per_batch: poh_service::DEFAULT_HASHES_PER_BATCH,
|
poh_hashes_per_batch: poh_service::DEFAULT_HASHES_PER_BATCH,
|
||||||
process_ledger_before_services: false,
|
process_ledger_before_services: false,
|
||||||
|
@ -500,6 +502,7 @@ impl Validator {
|
||||||
!config.no_os_network_stats_reporting,
|
!config.no_os_network_stats_reporting,
|
||||||
!config.no_os_cpu_stats_reporting,
|
!config.no_os_cpu_stats_reporting,
|
||||||
!config.no_os_disk_stats_reporting,
|
!config.no_os_disk_stats_reporting,
|
||||||
|
!config.no_os_open_fd_stats_reporting,
|
||||||
));
|
));
|
||||||
|
|
||||||
let (poh_timing_point_sender, poh_timing_point_receiver) = unbounded();
|
let (poh_timing_point_sender, poh_timing_point_receiver) = unbounded();
|
||||||
|
|
|
@ -2711,6 +2711,7 @@ fn main() {
|
||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
false,
|
false,
|
||||||
|
false,
|
||||||
);
|
);
|
||||||
|
|
||||||
accounts_index_config.index_limit_mb = if let Some(limit) =
|
accounts_index_config.index_limit_mb = if let Some(limit) =
|
||||||
|
|
|
@ -46,6 +46,7 @@ pub fn safe_clone_config(config: &ValidatorConfig) -> ValidatorConfig {
|
||||||
no_os_network_stats_reporting: config.no_os_network_stats_reporting,
|
no_os_network_stats_reporting: config.no_os_network_stats_reporting,
|
||||||
no_os_cpu_stats_reporting: config.no_os_cpu_stats_reporting,
|
no_os_cpu_stats_reporting: config.no_os_cpu_stats_reporting,
|
||||||
no_os_disk_stats_reporting: config.no_os_disk_stats_reporting,
|
no_os_disk_stats_reporting: config.no_os_disk_stats_reporting,
|
||||||
|
no_os_open_fd_stats_reporting: config.no_os_open_fd_stats_reporting,
|
||||||
poh_pinned_cpu_core: config.poh_pinned_cpu_core,
|
poh_pinned_cpu_core: config.poh_pinned_cpu_core,
|
||||||
account_indexes: config.account_indexes.clone(),
|
account_indexes: config.account_indexes.clone(),
|
||||||
accounts_db_caching_enabled: config.accounts_db_caching_enabled,
|
accounts_db_caching_enabled: config.accounts_db_caching_enabled,
|
||||||
|
|
|
@ -4491,6 +4491,7 @@ dependencies = [
|
||||||
"lru",
|
"lru",
|
||||||
"min-max-heap",
|
"min-max-heap",
|
||||||
"num_enum",
|
"num_enum",
|
||||||
|
"procfs",
|
||||||
"rand 0.7.3",
|
"rand 0.7.3",
|
||||||
"rand_chacha 0.2.2",
|
"rand_chacha 0.2.2",
|
||||||
"rayon",
|
"rayon",
|
||||||
|
|
|
@ -513,6 +513,11 @@ pub fn app<'a>(version: &'a str, default_args: &'a DefaultArgs) -> App<'a, 'a> {
|
||||||
.long("no-os-disk-stats-reporting")
|
.long("no-os-disk-stats-reporting")
|
||||||
.help("Disable reporting of OS disk statistics.")
|
.help("Disable reporting of OS disk statistics.")
|
||||||
)
|
)
|
||||||
|
.arg(
|
||||||
|
Arg::with_name("no_os_open_fd_stats_reporting")
|
||||||
|
.long("no-os-open-fd-stats-reporting")
|
||||||
|
.help("Disable reporting of open file descriptors statistics for current process.")
|
||||||
|
)
|
||||||
.arg(
|
.arg(
|
||||||
Arg::with_name("accounts-hash-interval-slots")
|
Arg::with_name("accounts-hash-interval-slots")
|
||||||
.long("accounts-hash-interval-slots")
|
.long("accounts-hash-interval-slots")
|
||||||
|
|
|
@ -1141,6 +1141,7 @@ pub fn main() {
|
||||||
no_os_network_stats_reporting: matches.is_present("no_os_network_stats_reporting"),
|
no_os_network_stats_reporting: matches.is_present("no_os_network_stats_reporting"),
|
||||||
no_os_cpu_stats_reporting: matches.is_present("no_os_cpu_stats_reporting"),
|
no_os_cpu_stats_reporting: matches.is_present("no_os_cpu_stats_reporting"),
|
||||||
no_os_disk_stats_reporting: matches.is_present("no_os_disk_stats_reporting"),
|
no_os_disk_stats_reporting: matches.is_present("no_os_disk_stats_reporting"),
|
||||||
|
no_os_open_fd_stats_reporting: matches.is_present("no_os_open_fd_stats_reporting"),
|
||||||
poh_pinned_cpu_core: value_of(&matches, "poh_pinned_cpu_core")
|
poh_pinned_cpu_core: value_of(&matches, "poh_pinned_cpu_core")
|
||||||
.unwrap_or(poh_service::DEFAULT_PINNED_CPU_CORE),
|
.unwrap_or(poh_service::DEFAULT_PINNED_CPU_CORE),
|
||||||
poh_hashes_per_batch: value_of(&matches, "poh_hashes_per_batch")
|
poh_hashes_per_batch: value_of(&matches, "poh_hashes_per_batch")
|
||||||
|
|
Loading…
Reference in New Issue