From 7a789e076322ea6b02f507933f94b6aab7baa7a6 Mon Sep 17 00:00:00 2001 From: behzad nouri Date: Fri, 13 Aug 2021 12:12:40 +0000 Subject: [PATCH] filters for recent contact-infos when checking for live stake (#19204) Contact-infos are saved to disk: https://github.com/solana-labs/solana/blob/9dfeee299/gossip/src/cluster_info.rs#L1678-L1683 and restored on validator start-up: https://github.com/solana-labs/solana/blob/9dfeee299/core/src/validator.rs#L450 Staked nodes entries will not expire until an epoch after. So when the validator checks for online stake it is erroneously picking up contact-infos restored from disk, which breaks the entire wait-for-supermajority logic: https://github.com/solana-labs/solana/blob/9dfeee299/core/src/validator.rs#L1515-L1561 This commit adds an extra check for the age of contact-info entries and filters out old ones. --- core/src/validator.rs | 197 +++++++++++++++++++---------------- gossip/src/gossip_service.rs | 2 +- 2 files changed, 107 insertions(+), 92 deletions(-) diff --git a/core/src/validator.rs b/core/src/validator.rs index 555c3d8ed..c328b1cc7 100644 --- a/core/src/validator.rs +++ b/core/src/validator.rs @@ -1,93 +1,98 @@ //! The `validator` module hosts all the validator microservices. -use crate::{ - broadcast_stage::BroadcastStageType, - cache_block_meta_service::{CacheBlockMetaSender, CacheBlockMetaService}, - cluster_info_vote_listener::VoteTracker, - completed_data_sets_service::CompletedDataSetsService, - consensus::{reconcile_blockstore_roots_with_tower, Tower}, - cost_model::CostModel, - rewards_recorder_service::{RewardsRecorderSender, RewardsRecorderService}, - sample_performance_service::SamplePerformanceService, - serve_repair::ServeRepair, - serve_repair_service::ServeRepairService, - sigverify, - snapshot_packager_service::{PendingSnapshotPackage, SnapshotPackagerService}, - tower_storage::TowerStorage, - tpu::{Tpu, DEFAULT_TPU_COALESCE_MS}, - tvu::{Sockets, Tvu, TvuConfig}, -}; -use crossbeam_channel::{bounded, unbounded}; -use rand::{thread_rng, Rng}; -use solana_entry::poh::compute_hash_time_ns; -use solana_gossip::{ - cluster_info::{ - ClusterInfo, Node, DEFAULT_CONTACT_DEBUG_INTERVAL_MILLIS, - DEFAULT_CONTACT_SAVE_INTERVAL_MILLIS, +use { + crate::{ + broadcast_stage::BroadcastStageType, + cache_block_meta_service::{CacheBlockMetaSender, CacheBlockMetaService}, + cluster_info_vote_listener::VoteTracker, + completed_data_sets_service::CompletedDataSetsService, + consensus::{reconcile_blockstore_roots_with_tower, Tower}, + cost_model::CostModel, + rewards_recorder_service::{RewardsRecorderSender, RewardsRecorderService}, + sample_performance_service::SamplePerformanceService, + serve_repair::ServeRepair, + serve_repair_service::ServeRepairService, + sigverify, + snapshot_packager_service::{PendingSnapshotPackage, SnapshotPackagerService}, + tower_storage::TowerStorage, + tpu::{Tpu, DEFAULT_TPU_COALESCE_MS}, + tvu::{Sockets, Tvu, TvuConfig}, }, - contact_info::ContactInfo, - gossip_service::GossipService, -}; -use solana_ledger::{ - bank_forks_utils, - blockstore::{Blockstore, BlockstoreSignals, CompletedSlotsReceiver, PurgeType}, - blockstore_db::BlockstoreRecoveryMode, - blockstore_processor::{self, TransactionStatusSender}, - leader_schedule::FixedSchedule, - leader_schedule_cache::LeaderScheduleCache, -}; -use solana_measure::measure::Measure; -use solana_metrics::datapoint_info; -use solana_poh::{ - poh_recorder::{PohRecorder, GRACE_TICKS_FACTOR, MAX_GRACE_SLOTS}, - poh_service::{self, PohService}, -}; -use solana_rpc::{ - max_slots::MaxSlots, - optimistically_confirmed_bank_tracker::{ - OptimisticallyConfirmedBank, OptimisticallyConfirmedBankTracker, + crossbeam_channel::{bounded, unbounded}, + rand::{thread_rng, Rng}, + solana_entry::poh::compute_hash_time_ns, + solana_gossip::{ + cluster_info::{ + ClusterInfo, Node, DEFAULT_CONTACT_DEBUG_INTERVAL_MILLIS, + DEFAULT_CONTACT_SAVE_INTERVAL_MILLIS, + }, + contact_info::ContactInfo, + crds_gossip_pull::CRDS_GOSSIP_PULL_CRDS_TIMEOUT_MS, + gossip_service::GossipService, + }, + solana_ledger::{ + bank_forks_utils, + blockstore::{Blockstore, BlockstoreSignals, CompletedSlotsReceiver, PurgeType}, + blockstore_db::BlockstoreRecoveryMode, + blockstore_processor::{self, TransactionStatusSender}, + leader_schedule::FixedSchedule, + leader_schedule_cache::LeaderScheduleCache, + }, + solana_measure::measure::Measure, + solana_metrics::datapoint_info, + solana_poh::{ + poh_recorder::{PohRecorder, GRACE_TICKS_FACTOR, MAX_GRACE_SLOTS}, + poh_service::{self, PohService}, + }, + solana_rpc::{ + max_slots::MaxSlots, + optimistically_confirmed_bank_tracker::{ + OptimisticallyConfirmedBank, OptimisticallyConfirmedBankTracker, + }, + rpc::JsonRpcConfig, + rpc_completed_slots_service::RpcCompletedSlotsService, + rpc_pubsub_service::{PubSubConfig, PubSubService}, + rpc_service::JsonRpcService, + rpc_subscriptions::RpcSubscriptions, + transaction_status_service::TransactionStatusService, + }, + solana_runtime::{ + accounts_db::AccountShrinkThreshold, + accounts_index::AccountSecondaryIndexes, + bank::Bank, + bank_forks::BankForks, + commitment::BlockCommitmentCache, + hardened_unpack::{open_genesis_config, MAX_GENESIS_ARCHIVE_UNPACKED_SIZE}, + snapshot_archive_info::SnapshotArchiveInfoGetter, + snapshot_config::SnapshotConfig, + snapshot_utils, + }, + solana_sdk::{ + clock::Slot, + epoch_schedule::MAX_LEADER_SCHEDULE_EPOCH_OFFSET, + exit::Exit, + genesis_config::GenesisConfig, + hash::Hash, + pubkey::Pubkey, + shred_version::compute_shred_version, + signature::{Keypair, Signer}, + timing::timestamp, + }, + solana_streamer::socket::SocketAddrSpace, + solana_vote_program::vote_state::VoteState, + std::{ + collections::{HashMap, HashSet}, + net::SocketAddr, + ops::Deref, + path::{Path, PathBuf}, + sync::{ + atomic::{AtomicBool, AtomicU64, Ordering}, + mpsc::Receiver, + Arc, Mutex, RwLock, + }, + thread::{sleep, Builder, JoinHandle}, + time::{Duration, Instant}, }, - rpc::JsonRpcConfig, - rpc_completed_slots_service::RpcCompletedSlotsService, - rpc_pubsub_service::{PubSubConfig, PubSubService}, - rpc_service::JsonRpcService, - rpc_subscriptions::RpcSubscriptions, - transaction_status_service::TransactionStatusService, -}; -use solana_runtime::{ - accounts_db::AccountShrinkThreshold, - accounts_index::AccountSecondaryIndexes, - bank::Bank, - bank_forks::BankForks, - commitment::BlockCommitmentCache, - hardened_unpack::{open_genesis_config, MAX_GENESIS_ARCHIVE_UNPACKED_SIZE}, - snapshot_archive_info::SnapshotArchiveInfoGetter, - snapshot_config::SnapshotConfig, - snapshot_utils, -}; -use solana_sdk::{ - clock::Slot, - epoch_schedule::MAX_LEADER_SCHEDULE_EPOCH_OFFSET, - exit::Exit, - genesis_config::GenesisConfig, - hash::Hash, - pubkey::Pubkey, - shred_version::compute_shred_version, - signature::{Keypair, Signer}, - timing::timestamp, -}; -use solana_streamer::socket::SocketAddrSpace; -use solana_vote_program::vote_state::VoteState; -use std::{ - collections::HashSet, - net::SocketAddr, - ops::Deref, - path::{Path, PathBuf}, - sync::atomic::{AtomicBool, AtomicU64, Ordering}, - sync::mpsc::Receiver, - sync::{Arc, Mutex, RwLock}, - thread::{sleep, Builder, JoinHandle}, - time::{Duration, Instant}, }; const MAX_COMPLETED_DATA_SETS_IN_CHANNEL: usize = 100_000; @@ -1521,7 +1526,20 @@ fn get_stake_percent_in_gossip(bank: &Bank, cluster_info: &ClusterInfo, log: boo let mut offline_nodes = vec![]; let mut total_activated_stake = 0; - let all_tvu_peers = cluster_info.all_tvu_peers(); + let now = timestamp(); + // Nodes contact infos are saved to disk and restored on validator startup. + // Staked nodes entries will not expire until an epoch after. So it + // is necessary here to filter for recent entries to establish liveness. + let peers: HashMap<_, _> = cluster_info + .all_tvu_peers() + .into_iter() + .filter(|node| { + let age = now.saturating_sub(node.wallclock); + // Contact infos are refreshed twice during this period. + age < CRDS_GOSSIP_PULL_CRDS_TIMEOUT_MS + }) + .map(|node| (node.id, node)) + .collect(); let my_shred_version = cluster_info.my_shred_version(); let my_id = cluster_info.id(); @@ -1537,10 +1555,7 @@ fn get_stake_percent_in_gossip(bank: &Bank, cluster_info: &ClusterInfo, log: boo .map(|vote_state| vote_state.node_pubkey) .unwrap_or_default(); - if let Some(peer) = all_tvu_peers - .iter() - .find(|peer| peer.id == vote_state_node_pubkey) - { + if let Some(peer) = peers.get(&vote_state_node_pubkey) { if peer.shred_version == my_shred_version { trace!( "observed {} in gossip, (activated_stake={})", diff --git a/gossip/src/gossip_service.rs b/gossip/src/gossip_service.rs index 3cb43e865..79dc61c54 100644 --- a/gossip/src/gossip_service.rs +++ b/gossip/src/gossip_service.rs @@ -265,7 +265,7 @@ fn spy( .into_iter() .map(|x| x.0) .collect::>(); - tvu_peers = spy_ref.all_tvu_peers().into_iter().collect::>(); + tvu_peers = spy_ref.all_tvu_peers(); let found_node_by_pubkey = if let Some(pubkey) = find_node_by_pubkey { all_peers.iter().any(|x| x.id == pubkey)