From 032bee13abd12ab2882d475c78a4b3d9e2d5f7bd Mon Sep 17 00:00:00 2001 From: Greg Cusack Date: Wed, 29 Jun 2022 11:55:41 -0600 Subject: [PATCH] Add Gossip Loop metrics (#26195) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * add three gossip metrics measuring gossip loop times * add 5 metrics * rm space * rm space * Update SECURITY.md - fix nav link - add bounty split policy for duplicate reports * Add transaction index in slot to geyser plugin TransactionInfo (#25688) * Define shuffle to prep using same shuffle for multiple slices * Determine transaction indexes and plumb to execute_batch * Pair transaction_index with transaction in TransactionStatusService * Add new ReplicaTransactionInfoVersion * Plumb transaction_indexes through BankingStage * Prepare BankingStage to receive transaction indexes from PohRecorder * Determine transaction indexes in PohRecorder; add field to WorkingBank * Add PohRecorder::record unit test * Only pass starting_transaction_index around PohRecorder * Add helper structs to simplify test DashMap * Pass entry and starting-index into process_entries_with_callback together * Add tx-index checks to test_rebatch_transactions * Revert shuffle definition and use zip/unzip * Only zip/unzip if randomize * Add confirm_slot_entries test * Review nits * Add type alias to make sender docs more clear * Update SECURITY.md finish filling out the table.... * rpc: fix possible deadlock in rpc (#26051) * Add StatusCache::root_slot_deltas() and use it (#26170) * Remove InMemAccountsIndex::map() and use map_internal directly (#26189) * [quic]Decrement total_streams correctly (#26158) * remove comment * alphabetical metrics. no abbreviations * remove trailing white space * cargo fmt to update code format/readability Co-authored-by: Trent Nelson Co-authored-by: Tyera Eulberg Co-authored-by: Boqin Qin(秦 伯钦) Co-authored-by: Brooks Prumo Co-authored-by: Miles Obare --- gossip/src/cluster_info.rs | 11 +++++++++ gossip/src/cluster_info_metrics.rs | 36 ++++++++++++++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/gossip/src/cluster_info.rs b/gossip/src/cluster_info.rs index f18b23af4c..9bae459501 100644 --- a/gossip/src/cluster_info.rs +++ b/gossip/src/cluster_info.rs @@ -1556,6 +1556,7 @@ impl ClusterInfo { sender: &PacketBatchSender, generate_pull_requests: bool, ) -> Result<(), GossipError> { + let _st = ScopedTimer::from(&self.stats.gossip_transmit_loop_time); let reqs = self.generate_new_gossip_requests( thread_pool, gossip_validators, @@ -1573,6 +1574,9 @@ impl ClusterInfo { .add_relaxed(packet_batch.len() as u64); sender.send(packet_batch)?; } + self.stats + .gossip_transmit_loop_iterations_since_last_report + .add_relaxed(1); Ok(()) } @@ -2435,6 +2439,9 @@ impl ClusterInfo { stakes, response_sender, ); + self.stats + .process_gossip_packets_iterations_since_last_report + .add_relaxed(1); Ok(()) } @@ -2490,6 +2497,7 @@ impl ClusterInfo { last_print: &mut Instant, should_check_duplicate_instance: bool, ) -> Result<(), GossipError> { + let _st = ScopedTimer::from(&self.stats.gossip_listen_loop_time); const RECV_TIMEOUT: Duration = Duration::from_secs(1); const SUBMIT_GOSSIP_STATS_INTERVAL: Duration = Duration::from_secs(2); let mut packets = VecDeque::from(receiver.recv_timeout(RECV_TIMEOUT)?); @@ -2528,6 +2536,9 @@ impl ClusterInfo { submit_gossip_stats(&self.stats, &self.gossip, &stakes); *last_print = Instant::now(); } + self.stats + .gossip_listen_loop_iterations_since_last_report + .add_relaxed(1); Ok(()) } diff --git a/gossip/src/cluster_info_metrics.rs b/gossip/src/cluster_info_metrics.rs index 27bd8b98b2..81e63a0163 100644 --- a/gossip/src/cluster_info_metrics.rs +++ b/gossip/src/cluster_info_metrics.rs @@ -103,6 +103,8 @@ pub struct GossipStats { pub(crate) get_epoch_duration_no_working_bank: Counter, pub(crate) get_votes: Counter, pub(crate) get_votes_count: Counter, + pub(crate) gossip_listen_loop_iterations_since_last_report: Counter, + pub(crate) gossip_listen_loop_time: Counter, pub(crate) gossip_packets_dropped_count: Counter, pub(crate) gossip_ping_msg_verify_fail: Counter, pub(crate) gossip_pong_msg_verify_fail: Counter, @@ -113,6 +115,8 @@ pub struct GossipStats { pub(crate) gossip_pull_request_verify_fail: Counter, pub(crate) gossip_pull_response_verify_fail: Counter, pub(crate) gossip_push_msg_verify_fail: Counter, + pub(crate) gossip_transmit_loop_iterations_since_last_report: Counter, + pub(crate) gossip_transmit_loop_time: Counter, pub(crate) handle_batch_ping_messages_time: Counter, pub(crate) handle_batch_pong_messages_time: Counter, pub(crate) handle_batch_prune_messages_time: Counter, @@ -137,6 +141,7 @@ pub struct GossipStats { pub(crate) packets_sent_pull_requests_count: Counter, pub(crate) packets_sent_pull_responses_count: Counter, pub(crate) packets_sent_push_messages_count: Counter, + pub(crate) process_gossip_packets_iterations_since_last_report: Counter, pub(crate) process_gossip_packets_time: Counter, pub(crate) process_prune: Counter, pub(crate) process_pull_requests: Counter, @@ -385,6 +390,37 @@ pub(crate) fn submit_gossip_stats( stats.gossip_pull_request_dropped_requests.clear(), i64 ), + ( + "gossip_transmit_loop_time", + stats.gossip_transmit_loop_time.clear(), + i64 + ), + ( + "gossip_transmit_loop_iterations_since_last_report", + stats + .gossip_transmit_loop_iterations_since_last_report + .clear(), + i64 + ), + ( + "gossip_listen_loop_time", + stats.gossip_listen_loop_time.clear(), + i64 + ), + ( + "gossip_listen_loop_iterations_since_last_report", + stats + .gossip_listen_loop_iterations_since_last_report + .clear(), + i64 + ), + ( + "process_gossip_packets_iterations_since_last_report", + stats + .process_gossip_packets_iterations_since_last_report + .clear(), + i64 + ), ); datapoint_info!( "cluster_info_stats4",