From abfb9961359e5c544a6dc143c9d7705164298c11 Mon Sep 17 00:00:00 2001 From: behzad nouri Date: Mon, 19 Sep 2022 18:16:04 +0000 Subject: [PATCH] tracks number of staked/stale/dead nodes in turbine cluster-nodes (#27915) --- core/src/broadcast_stage.rs | 9 +------ core/src/cluster_nodes.rs | 47 +++++++++++++++++++++--------------- core/src/retransmit_stage.rs | 5 ++-- 3 files changed, 31 insertions(+), 30 deletions(-) diff --git a/core/src/broadcast_stage.rs b/core/src/broadcast_stage.rs index 68729600d..61f34d388 100644 --- a/core/src/broadcast_stage.rs +++ b/core/src/broadcast_stage.rs @@ -381,14 +381,7 @@ fn update_peer_stats( last_datapoint_submit: &AtomicInterval, ) { if last_datapoint_submit.should_update(1000) { - let now = timestamp(); - let num_live_peers = cluster_nodes.num_peers_live(now); - let broadcast_len = cluster_nodes.num_peers() + 1; - datapoint_info!( - "cluster_info-num_nodes", - ("live_count", num_live_peers, i64), - ("broadcast_count", broadcast_len, i64) - ); + cluster_nodes.submit_metrics("cluster_nodes_broadcast", timestamp()); } } diff --git a/core/src/cluster_nodes.rs b/core/src/cluster_nodes.rs index 22fcc882c..1e1283e81 100644 --- a/core/src/cluster_nodes.rs +++ b/core/src/cluster_nodes.rs @@ -87,25 +87,34 @@ impl Node { } impl ClusterNodes { - pub(crate) fn num_peers(&self) -> usize { - self.nodes.len().saturating_sub(1) - } - - // A peer is considered live if they generated their contact info recently. - pub(crate) fn num_peers_live(&self, now: u64) -> usize { - self.nodes - .iter() - .filter(|node| node.pubkey() != self.pubkey) - .filter_map(|node| node.contact_info()) - .filter(|node| { - let elapsed = if node.wallclock < now { - now - node.wallclock - } else { - node.wallclock - now - }; - elapsed < CRDS_GOSSIP_PULL_CRDS_TIMEOUT_MS - }) - .count() + pub(crate) fn submit_metrics(&self, name: &'static str, now: u64) { + let mut num_nodes_dead = 0; + let mut num_nodes_staked = 0; + let mut num_nodes_stale = 0; + for node in &self.nodes { + if node.stake != 0u64 { + num_nodes_staked += 1; + } + match node.contact_info() { + None => { + num_nodes_dead += 1; + } + Some(node) => { + let age = now.saturating_sub(node.wallclock); + if age > CRDS_GOSSIP_PULL_CRDS_TIMEOUT_MS { + num_nodes_stale += 1; + } + } + } + } + num_nodes_stale += num_nodes_dead; + datapoint_info!( + name, + ("num_nodes", self.nodes.len(), i64), + ("num_nodes_dead", num_nodes_dead, i64), + ("num_nodes_staked", num_nodes_staked, i64), + ("num_nodes_stale", num_nodes_stale, i64), + ); } } diff --git a/core/src/retransmit_stage.rs b/core/src/retransmit_stage.rs index 408f68a0e..b0ebaad70 100644 --- a/core/src/retransmit_stage.rs +++ b/core/src/retransmit_stage.rs @@ -89,10 +89,9 @@ impl RetransmitStats { if self.since.elapsed() < SUBMIT_CADENCE { return; } - let num_peers = cluster_nodes_cache + cluster_nodes_cache .get(root_bank.slot(), root_bank, working_bank, cluster_info) - .num_peers(); - datapoint_info!("retransmit-num_nodes", ("count", num_peers, i64)); + .submit_metrics("cluster_nodes_retransmit", timestamp()); datapoint_info!( "retransmit-stage", ("total_time", self.total_time, i64),