diff --git a/core/src/cluster_info.rs b/core/src/cluster_info.rs index 7607a42d12..add160c10c 100644 --- a/core/src/cluster_info.rs +++ b/core/src/cluster_info.rs @@ -32,9 +32,7 @@ use rand::SeedableRng; use rand::{thread_rng, Rng}; use rand_chacha::ChaChaRng; use rayon::prelude::*; -use solana_metrics::{ - datapoint_debug, inc_new_counter_debug, inc_new_counter_error, inc_new_counter_warn, -}; +use solana_metrics::{datapoint_debug, inc_new_counter_debug, inc_new_counter_error}; use solana_netutil::{ bind_in_range, bind_to, find_available_port_in_range, multi_bind_in_range, PortRange, }; @@ -719,9 +717,10 @@ impl ClusterInfo { last_err?; inc_new_counter_debug!("cluster_info-broadcast-max_idx", blobs_len); - if broadcast_table_len != 0 { - inc_new_counter_warn!("broadcast_service-num_peers", broadcast_table_len + 1); - } + datapoint_info!( + "cluster_info-num_nodes", + ("count", broadcast_table_len + 1, i64) + ); Ok(()) } diff --git a/core/src/replay_stage.rs b/core/src/replay_stage.rs index a96829a5c5..b4e5b5806a 100644 --- a/core/src/replay_stage.rs +++ b/core/src/replay_stage.rs @@ -144,14 +144,25 @@ impl ReplayStage { if let Some((_, bank, lockouts)) = votable.into_iter().last() { subscriptions.notify_subscribers(bank.slot(), &bank_forks); - if let Some(new_leader) = + if let Some(votable_leader) = leader_schedule_cache.slot_leader_at(bank.slot(), Some(&bank)) { Self::log_leader_change( &my_pubkey, bank.slot(), &mut current_leader, - &new_leader, + &votable_leader, + ); + } + + let next_slot = bank.slot() + 1; + if let Some(new_leader) = + leader_schedule_cache.slot_leader_at(next_slot, Some(&bank)) + { + datapoint_info!( + "replay_stage-new_leader", + ("slot", next_slot, i64), + ("leader", new_leader.to_string(), String), ); } @@ -261,7 +272,7 @@ impl ReplayStage { assert!(!poh_recorder.lock().unwrap().has_bank()); - let (reached_leader_tick, grace_ticks, poh_slot, parent_slot) = + let (reached_leader_tick, _grace_ticks, poh_slot, parent_slot) = poh_recorder.lock().unwrap().reached_leader_tick(); if !reached_leader_tick { @@ -303,10 +314,10 @@ impl ReplayStage { return; } - datapoint_warn!( + datapoint_info!( "replay_stage-new_leader", - ("count", poh_slot, i64), - ("grace", grace_ticks, i64) + ("slot", poh_slot, i64), + ("leader", next_leader.to_string(), String), ); let tpu_bank = bank_forks @@ -335,13 +346,16 @@ impl ReplayStage { } } + // Returns the replay result and the number of replayed transactions fn replay_blocktree_into_bank( bank: &Bank, blocktree: &Blocktree, progress: &mut HashMap, - ) -> Result<()> { + ) -> (Result<()>, usize) { + let mut tx_count = 0; let result = Self::load_blocktree_entries(bank, blocktree, progress).and_then(|(entries, num)| { + tx_count += entries.iter().map(|e| e.transactions.len()).sum::(); Self::replay_entries_into_bank(bank, entries, progress, num) }); @@ -354,7 +368,7 @@ impl ReplayStage { Self::mark_dead_slot(bank.slot(), blocktree, progress); } - result + (result, tx_count) } fn mark_dead_slot(slot: u64, blocktree: &Blocktree, progress: &mut HashMap) { @@ -499,6 +513,7 @@ impl ReplayStage { slot_full_senders: &[Sender<(u64, Pubkey)>], ) -> bool { let mut did_complete_bank = false; + let mut tx_count = 0; let active_banks = bank_forks.read().unwrap().active_banks(); trace!("active banks {:?}", active_banks); @@ -509,15 +524,16 @@ impl ReplayStage { } let bank = bank_forks.read().unwrap().get(*bank_slot).unwrap().clone(); - if bank.collector_id() != my_pubkey - && Self::is_replay_result_fatal(&Self::replay_blocktree_into_bank( - &bank, &blocktree, progress, - )) - { - trace!("replay_result_fatal slot {}", bank_slot); - // If the bank was corrupted, don't try to run the below logic to check if the - // bank is completed - continue; + if bank.collector_id() != my_pubkey { + let (replay_result, replay_tx_count) = + Self::replay_blocktree_into_bank(&bank, &blocktree, progress); + tx_count += replay_tx_count; + if Self::is_replay_result_fatal(&replay_result) { + trace!("replay_result_fatal slot {}", bank_slot); + // If the bank was corrupted, don't try to run the below logic to check if the + // bank is completed + continue; + } } assert_eq!(*bank_slot, bank.slot()); if bank.tick_height() == bank.max_tick_height() { @@ -532,6 +548,7 @@ impl ReplayStage { ); } } + inc_new_counter_info!("replay_stage-replay_transactions", tx_count); did_complete_bank } @@ -989,7 +1006,8 @@ mod test { progress.insert(bank0.slot(), ForkProgress::new(last_blockhash)); let blob = blob_to_insert(&last_blockhash); blocktree.insert_data_blobs(&[blob]).unwrap(); - let res = ReplayStage::replay_blocktree_into_bank(&bank0, &blocktree, &mut progress); + let (res, _tx_count) = + ReplayStage::replay_blocktree_into_bank(&bank0, &blocktree, &mut progress); // Check that the erroring bank was marked as dead in the progress map assert!(progress diff --git a/core/src/retransmit_stage.rs b/core/src/retransmit_stage.rs index 7334c44eaa..05a40dc33f 100644 --- a/core/src/retransmit_stage.rs +++ b/core/src/retransmit_stage.rs @@ -14,6 +14,7 @@ use rand::SeedableRng; use rand_chacha::ChaChaRng; use solana_metrics::{datapoint_info, inc_new_counter_error}; use solana_runtime::epoch_schedule::EpochSchedule; +use std::cmp; use std::net::UdpSocket; use std::sync::atomic::AtomicBool; use std::sync::mpsc::channel; @@ -39,12 +40,13 @@ fn retransmit( let r_bank = bank_forks.read().unwrap().working_bank(); let bank_epoch = r_bank.get_stakers_epoch(r_bank.slot()); + let mut peers_len = 0; for blob in &blobs { let (my_index, mut peers) = cluster_info.read().unwrap().shuffle_peers_and_index( staking_utils::staked_nodes_at_epoch(&r_bank, bank_epoch).as_ref(), ChaChaRng::from_seed(blob.read().unwrap().seed()), ); - + peers_len = cmp::max(peers_len, peers.len()); peers.remove(my_index); let (neighbors, children) = compute_retransmit_peers(DATA_PLANE_FANOUT, my_index, peers); @@ -58,6 +60,7 @@ fn retransmit( ClusterInfo::retransmit_to(&cluster_info, &children, blob, leader, sock, true)?; } } + datapoint_info!("cluster_info-num_nodes", ("count", peers_len, i64)); Ok(()) } diff --git a/core/src/validator.rs b/core/src/validator.rs index d2e821ef7a..0a4eb24784 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -276,7 +276,7 @@ impl Validator { &exit, ); - datapoint_info!("validator-new"); + datapoint_info!("validator-new", ("id", id.to_string(), String)); Self { id, gossip_service, diff --git a/metrics/scripts/adjust-dashboard-for-channel.py b/metrics/scripts/adjust-dashboard-for-channel.py index 152c67675c..22203066d5 100755 --- a/metrics/scripts/adjust-dashboard-for-channel.py +++ b/metrics/scripts/adjust-dashboard-for-channel.py @@ -55,7 +55,7 @@ if channel == 'local': 'multi': False, 'name': 'hostid', 'options': [], - 'query': 'SELECT DISTINCT(\"host_id\") FROM \"$testnet\".\"autogen\".\"validator-new\" ', + 'query': 'SELECT DISTINCT(\"id\") FROM \"$testnet\".\"autogen\".\"validator-new\" ', 'refresh': 2, 'regex': '', 'sort': 1, @@ -138,7 +138,7 @@ else: 'multi': False, 'name': 'hostid', 'options': [], - 'query': 'SELECT DISTINCT(\"host_id\") FROM \"$testnet\".\"autogen\".\"validator-new\" ', + 'query': 'SELECT DISTINCT(\"id\") FROM \"$testnet\".\"autogen\".\"validator-new\" ', 'refresh': 2, 'regex': '', 'sort': 1, diff --git a/metrics/scripts/grafana-provisioning/dashboards/testnet-monitor.json b/metrics/scripts/grafana-provisioning/dashboards/testnet-monitor.json index 249ea9b556..cd85fba174 100644 --- a/metrics/scripts/grafana-provisioning/dashboards/testnet-monitor.json +++ b/metrics/scripts/grafana-provisioning/dashboards/testnet-monitor.json @@ -103,7 +103,7 @@ "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "last", + "tableColumn": "mode", "targets": [ { "groupBy": [ @@ -123,7 +123,7 @@ "hide": false, "orderByTime": "ASC", "policy": "default", - "query": "SELECT last(\"host_id\") FROM \"$testnet\".\"autogen\".\"replay_stage-new_leader\" WHERE $timeFilter \n", + "query": "SELECT MODE(last) FROM ( SELECT last(\"leader\") FROM \"$testnet\".\"autogen\".\"replay_stage-new_leader\" WHERE $timeFilter GROUP BY host_id )\n", "rawQuery": true, "refId": "A", "resultFormat": "table", @@ -236,7 +236,7 @@ "hide": false, "orderByTime": "ASC", "policy": "default", - "query": "SELECT last(\"count\") FROM \"$testnet\".\"autogen\".\"broadcast_service-num_peers\" WHERE $timeFilter GROUP BY time(1s) \n\n", + "query": "SELECT LAST(median) FROM ( SELECT median(count) FROM \"$testnet\".\"autogen\".\"cluster_info-num_nodes\" WHERE $timeFilter AND count > 0 GROUP BY time(5s) )\n", "rawQuery": true, "refId": "A", "resultFormat": "time_series", @@ -325,7 +325,7 @@ ], "orderByTime": "ASC", "policy": "default", - "query": "SELECT count(\"count\") AS \"total\" FROM \"$testnet\".\"autogen\".\"vote-native\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)", + "query": "SELECT MEDIAN(\"host_count\") AS \"total\" FROM ( SELECT COUNT(\"count\") as host_count FROM \"$testnet\".\"autogen\".\"vote-native\" WHERE $timeFilter GROUP BY time($__interval), host_id ) GROUP BY time($__interval) fill(0)", "rawQuery": true, "refId": "B", "resultFormat": "time_series", @@ -344,43 +344,6 @@ ] ], "tags": [] - }, - { - "groupBy": [ - { - "params": [ - "$__interval" - ], - "type": "time" - }, - { - "params": [ - "null" - ], - "type": "fill" - } - ], - "orderByTime": "ASC", - "policy": "default", - "query": "SELECT count(\"count\") AS \" \" FROM \"$testnet\".\"autogen\".\"validator-vote_sent\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] } ], "thresholds": [], @@ -502,7 +465,7 @@ "hide": false, "orderByTime": "ASC", "policy": "default", - "query": "SELECT ROUND(MEAN(\"sum\")) FROM ( SELECT sum(\"count\") FROM \"$testnet\".\"autogen\".\"banking_stage-record_transactions\" WHERE $timeFilter GROUP BY time(1s) )\n\n", + "query": "SELECT ROUND(MEAN(\"sum\")) FROM ( SELECT MEDIAN(tx_count) AS sum FROM (SELECT SUM(\"count\") AS tx_count FROM \"replay_stage-replay_transactions\" WHERE $timeFilter AND count > 0 GROUP BY time(1s), host_id) GROUP BY time(1s) )\n\n", "rawQuery": true, "refId": "A", "resultFormat": "time_series", @@ -614,7 +577,7 @@ "hide": false, "orderByTime": "ASC", "policy": "default", - "query": "SELECT MAX(\"sum\") FROM ( SELECT sum(\"count\") FROM \"$testnet\".\"autogen\".\"banking_stage-record_transactions\" WHERE $timeFilter GROUP BY time(1s) )\n\n", + "query": "SELECT MAX(\"median_sum\") FROM ( SELECT MEDIAN(tx_count) AS median_sum FROM (SELECT SUM(\"count\") AS tx_count FROM \"bank-process_transactions\" WHERE $timeFilter AND count > 0 GROUP BY time(1s), host_id) GROUP BY time(1s) )\n\n", "rawQuery": true, "refId": "A", "resultFormat": "time_series", @@ -726,7 +689,7 @@ "hide": false, "orderByTime": "ASC", "policy": "default", - "query": "SELECT sum(\"count\") AS \"transactions\" FROM \"$testnet\".\"autogen\".\"banking_stage-record_transactions\" WHERE $timeFilter \n\n", + "query": "SELECT MEDIAN(tx_count) AS transactions FROM (SELECT SUM(\"count\") AS tx_count FROM \"bank-process_transactions\" WHERE $timeFilter GROUP BY host_id) WHERE tx_count > 0\n", "rawQuery": true, "refId": "A", "resultFormat": "time_series", @@ -818,7 +781,7 @@ "lineColor": "rgb(31, 120, 193)", "show": false }, - "tableColumn": "sum", + "tableColumn": "median", "targets": [ { "groupBy": [ @@ -838,7 +801,7 @@ "hide": false, "orderByTime": "ASC", "policy": "default", - "query": "SELECT sum(\"count\") FROM \"$testnet\".\"autogen\".\"vote-native\" WHERE $timeFilter \n", + "query": "SELECT MEDIAN(\"vote_count\") FROM ( SELECT sum(\"count\") as vote_count FROM \"$testnet\".\"autogen\".\"vote-native\" WHERE $timeFilter GROUP BY host_id )\n", "rawQuery": true, "refId": "A", "resultFormat": "table", @@ -1556,6 +1519,44 @@ ], "orderByTime": "ASC", "policy": "default", + "query": "select median(\"tx_count\") as \"transactions\" from (select sum(\"count\") / 2 as \"tx_count\" from \"bank-process_transactions\" where $timeFilter AND count > 0 GROUP BY time(2s), host_id) group by time(2s) fill(0)", + "rawQuery": true, + "refId": "E", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": true, + "orderByTime": "ASC", + "policy": "default", "query": "SELECT sum(\"count\") / 2 AS \"transactions\" FROM \"$testnet\".\"autogen\".\"banking_stage-record_transactions\" WHERE $timeFilter GROUP BY time(2s) FILL(0)\n", "rawQuery": true, "refId": "A", @@ -1575,6 +1576,44 @@ ] ], "tags": [] + }, + { + "groupBy": [ + { + "params": [ + "$__interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "hide": true, + "orderByTime": "ASC", + "policy": "default", + "query": "select median(\"tx_count\") as \"transactions\" from (select sum(\"count\") / 2 as \"tx_count\" from \"replay_stage-replay_transactions\" where $timeFilter AND count > 0 GROUP BY time(2s), host_id) group by time(2s) fill(0)", + "rawQuery": true, + "refId": "D", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "value" + ], + "type": "field" + }, + { + "params": [], + "type": "mean" + } + ] + ], + "tags": [] } ], "thresholds": [], @@ -1655,44 +1694,6 @@ "stack": false, "steppedLine": false, "targets": [ - { - "groupBy": [ - { - "params": [ - "$__interval" - ], - "type": "time" - }, - { - "params": [ - "null" - ], - "type": "fill" - } - ], - "measurement": "cluster_info-vote-count", - "orderByTime": "ASC", - "policy": "autogen", - "query": "SELECT mean(\"total_peers\") as \"total peers\" FROM \"$testnet\".\"autogen\".\"vote_stage-peer_count\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)", - "rawQuery": true, - "refId": "A", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "count" - ], - "type": "field" - }, - { - "params": [], - "type": "sum" - } - ] - ], - "tags": [] - }, { "groupBy": [ { @@ -1711,45 +1712,7 @@ "hide": false, "orderByTime": "ASC", "policy": "default", - "query": "SELECT mean(\"valid_peers\") as \"valid peers\" FROM \"$testnet\".\"autogen\".\"vote_stage-peer_count\" WHERE $timeFilter GROUP BY time($__interval) FILL(0)", - "rawQuery": true, - "refId": "B", - "resultFormat": "time_series", - "select": [ - [ - { - "params": [ - "value" - ], - "type": "field" - }, - { - "params": [], - "type": "mean" - } - ] - ], - "tags": [] - }, - { - "groupBy": [ - { - "params": [ - "$__interval" - ], - "type": "time" - }, - { - "params": [ - "null" - ], - "type": "fill" - } - ], - "hide": false, - "orderByTime": "ASC", - "policy": "default", - "query": "SELECT mean(\"count\") AS \"peers\" FROM \"$testnet\".\"autogen\".\"broadcast_service-num_peers\" WHERE $timeFilter GROUP BY time(1s) FILL(0)", + "query": "SELECT median(\"count\") AS \"total\" FROM \"$testnet\".\"autogen\".\"cluster_info-num_nodes\" WHERE $timeFilter AND count > 0 GROUP BY time(5s)", "rawQuery": true, "refId": "C", "resultFormat": "time_series", @@ -7408,7 +7371,7 @@ "measurement": "cluster_info-vote-count", "orderByTime": "ASC", "policy": "autogen", - "query": "SELECT last(\"count\") FROM \"$testnet\".\"autogen\".\"replay_stage-new_leader\" WHERE host_id =~ /$hostid/ AND $timeFilter GROUP BY time($__interval)", + "query": "SELECT median(\"slot\") FROM \"$testnet\".\"autogen\".\"replay_stage-new_leader\" WHERE $timeFilter GROUP BY time($__interval)", "rawQuery": true, "refId": "A", "resultFormat": "time_series", @@ -7432,7 +7395,7 @@ "thresholds": [], "timeFrom": null, "timeShift": null, - "title": "Leader Change ($hostid)", + "title": "Leader Change", "tooltip": { "shared": true, "sort": 0, @@ -8055,7 +8018,7 @@ "multi": false, "name": "hostid", "options": [], - "query": "SELECT DISTINCT(\"host_id\") FROM \"$testnet\".\"autogen\".\"counter-fullnode-new\" ", + "query": "SELECT DISTINCT(\"id\") FROM \"$testnet\".\"autogen\".\"validator-new\" ", "refresh": 2, "regex": "", "sort": 1, diff --git a/metrics/src/metrics.rs b/metrics/src/metrics.rs index e3631893cd..73ab695dd3 100644 --- a/metrics/src/metrics.rs +++ b/metrics/src/metrics.rs @@ -56,11 +56,6 @@ macro_rules! datapoint { (@point $name:expr) => { $crate::influxdb::Point::new(&$name) }; - ($name:expr) => { - if log_enabled!(log::Level::Debug) { - $crate::submit($crate::datapoint!(@point $name), log::Level::Debug); - } - }; ($name:expr, $($fields:tt)+) => { if log_enabled!(log::Level::Debug) { $crate::submit($crate::datapoint!(@point $name, $($fields)+), log::Level::Debug); @@ -245,7 +240,7 @@ impl MetricsAgent { let extra = influxdb::Point::new("metrics") .add_timestamp(timing::timestamp() as i64) - .add_field("host_id", influxdb::Value::String(HOST_ID.to_string())) + .add_tag("host_id", influxdb::Value::String(HOST_ID.to_string())) .add_field( "points_written", influxdb::Value::Integer(points_written as i64), @@ -342,7 +337,7 @@ impl MetricsAgent { } pub fn submit(&self, mut point: influxdb::Point, level: log::Level) { - point.add_field("host_id", influxdb::Value::String(HOST_ID.to_string())); + point.add_tag("host_id", influxdb::Value::String(HOST_ID.to_string())); if point.timestamp.is_none() { point.timestamp = Some(timing::timestamp() as i64); } @@ -383,7 +378,8 @@ fn get_singleton_agent() -> Arc> { /// Submits a new point from any thread. Note that points are internally queued /// and transmitted periodically in batches. -pub fn submit(point: influxdb::Point, level: log::Level) { +pub fn submit(mut point: influxdb::Point, level: log::Level) { + point.add_tag("host_id", influxdb::Value::String(HOST_ID.to_string())); let agent_mutex = get_singleton_agent(); let agent = agent_mutex.lock().unwrap(); agent.submit(point, level); @@ -435,6 +431,7 @@ pub fn set_panic_hook(program: &'static str) { thread::current().name().unwrap_or("?").to_string(), ), ) + .add_tag("host_id", influxdb::Value::String(HOST_ID.to_string())) // The 'one' field exists to give Kapacitor Alerts a numerical value // to filter on .add_field("one", influxdb::Value::Integer(1)) @@ -452,7 +449,6 @@ pub fn set_panic_hook(program: &'static str) { None => "?".to_string(), }), ) - .add_field("host_id", influxdb::Value::String(HOST_ID.to_string())) .to_owned(), Level::Error, ); @@ -610,7 +606,6 @@ mod test { } }; } - datapoint!("name"); datapoint!("name", ("field name", "test".to_string(), String)); datapoint!("name", ("field name", 12.34_f64, f64)); datapoint!("name", ("field name", true, bool)); diff --git a/runtime/src/bank.rs b/runtime/src/bank.rs index 6489992f4c..ddcd3f1df7 100644 --- a/runtime/src/bank.rs +++ b/runtime/src/bank.rs @@ -962,6 +962,7 @@ impl Bank { usize, ) { debug!("processing transactions: {}", txs.len()); + inc_new_counter_info!("bank-process_transactions", txs.len()); let mut error_counters = ErrorCounters::default(); let mut load_time = Measure::start("accounts_load");