regularly report network limits (#22563)
This commit is contained in:
parent
cca3dbc76d
commit
e7777281d6
|
@ -4751,6 +4751,7 @@ dependencies = [
|
||||||
"solana-vote-program",
|
"solana-vote-program",
|
||||||
"static_assertions",
|
"static_assertions",
|
||||||
"sys-info",
|
"sys-info",
|
||||||
|
"sysctl",
|
||||||
"systemstat",
|
"systemstat",
|
||||||
"tempfile",
|
"tempfile",
|
||||||
"thiserror",
|
"thiserror",
|
||||||
|
@ -6022,7 +6023,6 @@ dependencies = [
|
||||||
"solana-version",
|
"solana-version",
|
||||||
"solana-vote-program",
|
"solana-vote-program",
|
||||||
"symlink",
|
"symlink",
|
||||||
"sysctl",
|
|
||||||
"tikv-jemallocator",
|
"tikv-jemallocator",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
|
@ -80,6 +80,9 @@ solana-version = { path = "../version", version = "=1.10.0" }
|
||||||
static_assertions = "1.1.0"
|
static_assertions = "1.1.0"
|
||||||
systemstat = "0.1.10"
|
systemstat = "0.1.10"
|
||||||
|
|
||||||
|
[target."cfg(unix)".dependencies]
|
||||||
|
sysctl = "0.4.3"
|
||||||
|
|
||||||
[build-dependencies]
|
[build-dependencies]
|
||||||
rustc_version = "0.4"
|
rustc_version = "0.4"
|
||||||
|
|
||||||
|
|
|
@ -15,7 +15,10 @@ use {
|
||||||
};
|
};
|
||||||
|
|
||||||
const MS_PER_S: u64 = 1_000;
|
const MS_PER_S: u64 = 1_000;
|
||||||
|
const MS_PER_M: u64 = MS_PER_S * 60;
|
||||||
|
const MS_PER_H: u64 = MS_PER_M * 60;
|
||||||
const SAMPLE_INTERVAL_UDP_MS: u64 = 2 * MS_PER_S;
|
const SAMPLE_INTERVAL_UDP_MS: u64 = 2 * MS_PER_S;
|
||||||
|
const SAMPLE_INTERVAL_OS_NETWORK_LIMITS_MS: u64 = MS_PER_H;
|
||||||
const SAMPLE_INTERVAL_MEM_MS: u64 = MS_PER_S;
|
const SAMPLE_INTERVAL_MEM_MS: u64 = MS_PER_S;
|
||||||
const SLEEP_INTERVAL: Duration = Duration::from_millis(500);
|
const SLEEP_INTERVAL: Duration = Duration::from_millis(500);
|
||||||
|
|
||||||
|
@ -38,6 +41,30 @@ struct UdpStats {
|
||||||
ignored_multi: usize,
|
ignored_multi: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl UdpStats {
|
||||||
|
fn from_map(udp_stats: &HashMap<String, usize>) -> Self {
|
||||||
|
Self {
|
||||||
|
in_datagrams: *udp_stats.get("InDatagrams").unwrap_or(&0),
|
||||||
|
no_ports: *udp_stats.get("NoPorts").unwrap_or(&0),
|
||||||
|
in_errors: *udp_stats.get("InErrors").unwrap_or(&0),
|
||||||
|
out_datagrams: *udp_stats.get("OutDatagrams").unwrap_or(&0),
|
||||||
|
rcvbuf_errors: *udp_stats.get("RcvbufErrors").unwrap_or(&0),
|
||||||
|
sndbuf_errors: *udp_stats.get("SndbufErrors").unwrap_or(&0),
|
||||||
|
in_csum_errors: *udp_stats.get("InCsumErrors").unwrap_or(&0),
|
||||||
|
ignored_multi: *udp_stats.get("IgnoredMulti").unwrap_or(&0),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn platform_id() -> String {
|
||||||
|
format!(
|
||||||
|
"{}/{}/{}",
|
||||||
|
std::env::consts::FAMILY,
|
||||||
|
std::env::consts::OS,
|
||||||
|
std::env::consts::ARCH
|
||||||
|
)
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(target_os = "linux")]
|
#[cfg(target_os = "linux")]
|
||||||
fn read_udp_stats(file_path: impl AsRef<Path>) -> Result<UdpStats, String> {
|
fn read_udp_stats(file_path: impl AsRef<Path>) -> Result<UdpStats, String> {
|
||||||
let file = File::open(file_path).map_err(|e| e.to_string())?;
|
let file = File::open(file_path).map_err(|e| e.to_string())?;
|
||||||
|
@ -73,16 +100,7 @@ fn parse_udp_stats(reader: &mut impl BufRead) -> Result<UdpStats, String> {
|
||||||
.map(|(label, val)| (label.to_string(), val.parse::<usize>().unwrap()))
|
.map(|(label, val)| (label.to_string(), val.parse::<usize>().unwrap()))
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
let stats = UdpStats {
|
let stats = UdpStats::from_map(&udp_stats);
|
||||||
in_datagrams: *udp_stats.get("InDatagrams").unwrap_or(&0),
|
|
||||||
no_ports: *udp_stats.get("NoPorts").unwrap_or(&0),
|
|
||||||
in_errors: *udp_stats.get("InErrors").unwrap_or(&0),
|
|
||||||
out_datagrams: *udp_stats.get("OutDatagrams").unwrap_or(&0),
|
|
||||||
rcvbuf_errors: *udp_stats.get("RcvbufErrors").unwrap_or(&0),
|
|
||||||
sndbuf_errors: *udp_stats.get("SndbufErrors").unwrap_or(&0),
|
|
||||||
in_csum_errors: *udp_stats.get("InCsumErrors").unwrap_or(&0),
|
|
||||||
ignored_multi: *udp_stats.get("IgnoredMulti").unwrap_or(&0),
|
|
||||||
};
|
|
||||||
|
|
||||||
Ok(stats)
|
Ok(stats)
|
||||||
}
|
}
|
||||||
|
@ -111,12 +129,98 @@ impl SystemMonitorService {
|
||||||
Self { thread_hdl }
|
Self { thread_hdl }
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[cfg_attr(not(target_os = "linux"), allow(dead_code))]
|
||||||
|
fn linux_get_recommended_network_limits() -> HashMap<&'static str, i64> {
|
||||||
|
// Reference: https://medium.com/@CameronSparr/increase-os-udp-buffers-to-improve-performance-51d167bb1360
|
||||||
|
let mut recommended_limits: HashMap<&str, i64> = HashMap::default();
|
||||||
|
recommended_limits.insert("net.core.rmem_max", 134217728);
|
||||||
|
recommended_limits.insert("net.core.rmem_default", 134217728);
|
||||||
|
recommended_limits.insert("net.core.wmem_max", 134217728);
|
||||||
|
recommended_limits.insert("net.core.wmem_default", 134217728);
|
||||||
|
recommended_limits.insert("vm.max_map_count", 1000000);
|
||||||
|
|
||||||
|
// Additionally collect the following limits
|
||||||
|
recommended_limits.insert("net.core.optmem_max", 0);
|
||||||
|
recommended_limits.insert("net.core.netdev_max_backlog", 0);
|
||||||
|
|
||||||
|
recommended_limits
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
fn linux_get_current_network_limits(
|
||||||
|
recommended_limits: &HashMap<&'static str, i64>,
|
||||||
|
) -> HashMap<&'static str, i64> {
|
||||||
|
use sysctl::Sysctl;
|
||||||
|
|
||||||
|
fn sysctl_read(name: &str) -> Result<String, sysctl::SysctlError> {
|
||||||
|
let ctl = sysctl::Ctl::new(name)?;
|
||||||
|
let val = ctl.value_string()?;
|
||||||
|
Ok(val)
|
||||||
|
}
|
||||||
|
|
||||||
|
let mut current_limits: HashMap<&str, i64> = HashMap::default();
|
||||||
|
for (key, _) in recommended_limits.iter() {
|
||||||
|
let current_val = match sysctl_read(key) {
|
||||||
|
Ok(val) => val.parse::<i64>().unwrap(),
|
||||||
|
Err(e) => {
|
||||||
|
error!("Failed to query value for {}: {}", key, e);
|
||||||
|
-1
|
||||||
|
}
|
||||||
|
};
|
||||||
|
current_limits.insert(key, current_val);
|
||||||
|
}
|
||||||
|
current_limits
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg_attr(not(target_os = "linux"), allow(dead_code))]
|
||||||
|
fn linux_report_network_limits(
|
||||||
|
current_limits: &HashMap<&str, i64>,
|
||||||
|
recommended_limits: &HashMap<&'static str, i64>,
|
||||||
|
) -> bool {
|
||||||
|
let mut check_failed = false;
|
||||||
|
for (key, recommended_val) in recommended_limits.iter() {
|
||||||
|
let current_val = *current_limits.get(key).unwrap_or(&-1);
|
||||||
|
if current_val < *recommended_val {
|
||||||
|
datapoint_warn!("os-config", (key, current_val, i64));
|
||||||
|
warn!(
|
||||||
|
" {}: recommended={} current={}, too small",
|
||||||
|
key, recommended_val, current_val
|
||||||
|
);
|
||||||
|
check_failed = true;
|
||||||
|
} else {
|
||||||
|
datapoint_info!("os-config", (key, current_val, i64));
|
||||||
|
info!(
|
||||||
|
" {}: recommended={} current={}",
|
||||||
|
key, recommended_val, current_val
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if check_failed {
|
||||||
|
datapoint_warn!("os-config", ("network_limit_test_failed", 1, i64));
|
||||||
|
}
|
||||||
|
!check_failed
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(not(target_os = "linux"))]
|
||||||
|
pub fn check_os_network_limits() -> bool {
|
||||||
|
datapoint_info!("os-config", ("platform", platform_id(), String));
|
||||||
|
true
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(target_os = "linux")]
|
||||||
|
pub fn check_os_network_limits() -> bool {
|
||||||
|
datapoint_info!("os-config", ("platform", platform_id(), String));
|
||||||
|
let recommended_limits = Self::linux_get_recommended_network_limits();
|
||||||
|
let current_limits = Self::linux_get_current_network_limits(&recommended_limits);
|
||||||
|
Self::linux_report_network_limits(&current_limits, &recommended_limits)
|
||||||
|
}
|
||||||
|
|
||||||
#[cfg(target_os = "linux")]
|
#[cfg(target_os = "linux")]
|
||||||
fn process_udp_stats(udp_stats: &mut Option<UdpStats>) {
|
fn process_udp_stats(udp_stats: &mut Option<UdpStats>) {
|
||||||
match read_udp_stats(PROC_NET_SNMP_PATH) {
|
match read_udp_stats(PROC_NET_SNMP_PATH) {
|
||||||
Ok(new_stats) => {
|
Ok(new_stats) => {
|
||||||
if let Some(old_stats) = udp_stats {
|
if let Some(old_stats) = udp_stats {
|
||||||
SystemMonitorService::report_udp_stats(old_stats, &new_stats);
|
Self::report_udp_stats(old_stats, &new_stats);
|
||||||
}
|
}
|
||||||
*udp_stats = Some(new_stats);
|
*udp_stats = Some(new_stats);
|
||||||
}
|
}
|
||||||
|
@ -229,22 +333,25 @@ impl SystemMonitorService {
|
||||||
|
|
||||||
pub fn run(exit: Arc<AtomicBool>, report_os_network_stats: bool) {
|
pub fn run(exit: Arc<AtomicBool>, report_os_network_stats: bool) {
|
||||||
let mut udp_stats = None;
|
let mut udp_stats = None;
|
||||||
|
let network_limits_timer = AtomicInterval::default();
|
||||||
let udp_timer = AtomicInterval::default();
|
let udp_timer = AtomicInterval::default();
|
||||||
let mem_timer = AtomicInterval::default();
|
let mem_timer = AtomicInterval::default();
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
if exit.load(Ordering::Relaxed) {
|
if exit.load(Ordering::Relaxed) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
if report_os_network_stats {
|
||||||
if report_os_network_stats && udp_timer.should_update(SAMPLE_INTERVAL_UDP_MS) {
|
if network_limits_timer.should_update(SAMPLE_INTERVAL_OS_NETWORK_LIMITS_MS) {
|
||||||
SystemMonitorService::process_udp_stats(&mut udp_stats);
|
Self::check_os_network_limits();
|
||||||
|
}
|
||||||
|
if udp_timer.should_update(SAMPLE_INTERVAL_UDP_MS) {
|
||||||
|
Self::process_udp_stats(&mut udp_stats);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if mem_timer.should_update(SAMPLE_INTERVAL_MEM_MS) {
|
if mem_timer.should_update(SAMPLE_INTERVAL_MEM_MS) {
|
||||||
SystemMonitorService::report_mem_stats();
|
Self::report_mem_stats();
|
||||||
}
|
}
|
||||||
|
|
||||||
sleep(SLEEP_INTERVAL);
|
sleep(SLEEP_INTERVAL);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -60,7 +60,6 @@ jemallocator = {package = "tikv-jemallocator", version = "0.4.1", features = ["u
|
||||||
[target."cfg(unix)".dependencies]
|
[target."cfg(unix)".dependencies]
|
||||||
libc = "0.2.112"
|
libc = "0.2.112"
|
||||||
signal-hook = "0.3.13"
|
signal-hook = "0.3.13"
|
||||||
sysctl = "0.4.3"
|
|
||||||
|
|
||||||
[package.metadata.docs.rs]
|
[package.metadata.docs.rs]
|
||||||
targets = ["x86_64-unknown-linux-gnu"]
|
targets = ["x86_64-unknown-linux-gnu"]
|
||||||
|
|
|
@ -23,6 +23,7 @@ use {
|
||||||
},
|
},
|
||||||
solana_core::{
|
solana_core::{
|
||||||
ledger_cleanup_service::{DEFAULT_MAX_LEDGER_SHREDS, DEFAULT_MIN_MAX_LEDGER_SHREDS},
|
ledger_cleanup_service::{DEFAULT_MAX_LEDGER_SHREDS, DEFAULT_MIN_MAX_LEDGER_SHREDS},
|
||||||
|
system_monitor_service::SystemMonitorService,
|
||||||
tower_storage,
|
tower_storage,
|
||||||
tpu::DEFAULT_TPU_COALESCE_MS,
|
tpu::DEFAULT_TPU_COALESCE_MS,
|
||||||
validator::{is_snapshot_config_valid, Validator, ValidatorConfig, ValidatorStartProgress},
|
validator::{is_snapshot_config_valid, Validator, ValidatorConfig, ValidatorStartProgress},
|
||||||
|
@ -32,7 +33,6 @@ use {
|
||||||
contact_info::ContactInfo,
|
contact_info::ContactInfo,
|
||||||
},
|
},
|
||||||
solana_ledger::blockstore_db::BlockstoreRecoveryMode,
|
solana_ledger::blockstore_db::BlockstoreRecoveryMode,
|
||||||
solana_metrics::datapoint_info,
|
|
||||||
solana_perf::recycler::enable_recycler_warming,
|
solana_perf::recycler::enable_recycler_warming,
|
||||||
solana_poh::poh_service,
|
solana_poh::poh_service,
|
||||||
solana_replica_lib::accountsdb_repl_server::AccountsDbReplServiceConfig,
|
solana_replica_lib::accountsdb_repl_server::AccountsDbReplServiceConfig,
|
||||||
|
@ -411,85 +411,6 @@ fn get_cluster_shred_version(entrypoints: &[SocketAddr]) -> Option<u16> {
|
||||||
None
|
None
|
||||||
}
|
}
|
||||||
|
|
||||||
fn platform_id() -> String {
|
|
||||||
format!(
|
|
||||||
"{}/{}/{}",
|
|
||||||
std::env::consts::FAMILY,
|
|
||||||
std::env::consts::OS,
|
|
||||||
std::env::consts::ARCH
|
|
||||||
)
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(target_os = "linux")]
|
|
||||||
fn check_os_network_limits() {
|
|
||||||
use {solana_metrics::datapoint_warn, std::collections::HashMap, sysctl::Sysctl};
|
|
||||||
|
|
||||||
fn sysctl_read(name: &str) -> Result<String, sysctl::SysctlError> {
|
|
||||||
let ctl = sysctl::Ctl::new(name)?;
|
|
||||||
let val = ctl.value_string()?;
|
|
||||||
Ok(val)
|
|
||||||
}
|
|
||||||
let mut check_failed = false;
|
|
||||||
|
|
||||||
info!("Testing OS network limits:");
|
|
||||||
|
|
||||||
// Reference: https://medium.com/@CameronSparr/increase-os-udp-buffers-to-improve-performance-51d167bb1360
|
|
||||||
let mut recommended_limits: HashMap<&str, i64> = HashMap::default();
|
|
||||||
recommended_limits.insert("net.core.rmem_max", 134217728);
|
|
||||||
recommended_limits.insert("net.core.rmem_default", 134217728);
|
|
||||||
recommended_limits.insert("net.core.wmem_max", 134217728);
|
|
||||||
recommended_limits.insert("net.core.wmem_default", 134217728);
|
|
||||||
recommended_limits.insert("vm.max_map_count", 1000000);
|
|
||||||
|
|
||||||
// Additionally collect the following limits
|
|
||||||
recommended_limits.insert("net.core.optmem_max", 0);
|
|
||||||
recommended_limits.insert("net.core.netdev_max_backlog", 0);
|
|
||||||
|
|
||||||
let mut current_limits: HashMap<&str, i64> = HashMap::default();
|
|
||||||
for (key, _) in recommended_limits.iter() {
|
|
||||||
let current_val = match sysctl_read(key) {
|
|
||||||
Ok(val) => val.parse::<i64>().unwrap(),
|
|
||||||
Err(e) => {
|
|
||||||
error!("Failed to query value for {}: {}", key, e);
|
|
||||||
check_failed = true;
|
|
||||||
-1
|
|
||||||
}
|
|
||||||
};
|
|
||||||
current_limits.insert(key, current_val);
|
|
||||||
}
|
|
||||||
|
|
||||||
for (key, recommended_val) in recommended_limits.iter() {
|
|
||||||
let current_val = *current_limits.get(key).unwrap();
|
|
||||||
if current_val < *recommended_val {
|
|
||||||
datapoint_warn!("os-config", (key, current_val, i64));
|
|
||||||
warn!(
|
|
||||||
" {}: recommended={} current={}, too small",
|
|
||||||
key, recommended_val, current_val
|
|
||||||
);
|
|
||||||
check_failed = true;
|
|
||||||
} else {
|
|
||||||
datapoint_info!("os-config", (key, current_val, i64));
|
|
||||||
info!(
|
|
||||||
" {}: recommended={} current={}",
|
|
||||||
key, recommended_val, current_val
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
datapoint_info!("os-config", ("platform", platform_id(), String));
|
|
||||||
|
|
||||||
if check_failed {
|
|
||||||
datapoint_warn!("os-config", ("network_limit_test_failed", 1, i64));
|
|
||||||
warn!("OS network limit test failed. solana-sys-tuner may be used to configure OS network limits. Bypass check with --no-os-network-limits-test.");
|
|
||||||
} else {
|
|
||||||
info!("OS network limits test passed.");
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
#[cfg(not(target_os = "linux"))]
|
|
||||||
fn check_os_network_limits() {
|
|
||||||
datapoint_info!("os-config", ("platform", platform_id(), String));
|
|
||||||
}
|
|
||||||
|
|
||||||
pub fn main() {
|
pub fn main() {
|
||||||
let default_dynamic_port_range =
|
let default_dynamic_port_range =
|
||||||
&format!("{}-{}", VALIDATOR_PORT_RANGE.0, VALIDATOR_PORT_RANGE.1);
|
&format!("{}-{}", VALIDATOR_PORT_RANGE.0, VALIDATOR_PORT_RANGE.1);
|
||||||
|
@ -2516,7 +2437,11 @@ pub fn main() {
|
||||||
});
|
});
|
||||||
|
|
||||||
if !matches.is_present("no_os_network_limits_test") {
|
if !matches.is_present("no_os_network_limits_test") {
|
||||||
check_os_network_limits();
|
if SystemMonitorService::check_os_network_limits() {
|
||||||
|
info!("OS network limits test passed.");
|
||||||
|
} else {
|
||||||
|
eprintln!("OS network limit test failed. solana-sys-tuner may be used to configure OS network limits. Bypass check with --no-os-network-limits-test.");
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut ledger_lock = ledger_lockfile(&ledger_path);
|
let mut ledger_lock = ledger_lockfile(&ledger_path);
|
||||||
|
|
Loading…
Reference in New Issue