filters for recent contact-infos when checking for live stake (#19204)

Contact-infos are saved to disk:
https://github.com/solana-labs/solana/blob/9dfeee299/gossip/src/cluster_info.rs#L1678-L1683

and restored on validator start-up:
https://github.com/solana-labs/solana/blob/9dfeee299/core/src/validator.rs#L450

Staked-nodes entries do not expire until an epoch later. So when the
validator checks for online stake, it erroneously picks up stale
contact-infos restored from disk, which breaks the
wait-for-supermajority logic:
https://github.com/solana-labs/solana/blob/9dfeee299/core/src/validator.rs#L1515-L1561
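
To make the failure mode concrete, here is a minimal sketch of the kind of check wait-for-supermajority relies on; the function and parameter names are invented for illustration and this is not the actual `get_stake_percent_in_gossip` code:

```rust
use std::collections::{HashMap, HashSet};

use solana_sdk::pubkey::Pubkey;

/// Illustrative only: the percentage of activated stake whose node pubkey is
/// currently visible in gossip. If stale contact-infos restored from disk are
/// counted as "visible", this percentage is inflated and the validator may
/// stop waiting for a supermajority before enough nodes are actually online.
fn stake_percent_visible(
    activated_stake: &HashMap<Pubkey, u64>, // node pubkey -> activated stake
    gossip_peers: &HashSet<Pubkey>,         // node pubkeys observed in gossip
) -> f64 {
    let total: u64 = activated_stake.values().sum();
    if total == 0 {
        return 0.0;
    }
    let online: u64 = activated_stake
        .iter()
        .filter(|(node, _)| gossip_peers.contains(*node))
        .map(|(_, stake)| *stake)
        .sum();
    100.0 * online as f64 / total as f64
}
```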

This commit adds an extra check for the age of contact-info entries and
filters out old ones.
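
A rough sketch of the added filtering, pulled out into a standalone helper (the helper name `recent_tvu_peers` is made up here; the body mirrors the change to core/src/validator.rs shown below):

```rust
use std::collections::HashMap;

use solana_gossip::{
    cluster_info::ClusterInfo, contact_info::ContactInfo,
    crds_gossip_pull::CRDS_GOSSIP_PULL_CRDS_TIMEOUT_MS,
};
use solana_sdk::{pubkey::Pubkey, timing::timestamp};

/// Keeps only TVU peers whose contact-info wallclock is recent enough to
/// indicate liveness. Entries restored from disk on startup are older than
/// the CRDS pull timeout (within which contact-infos are refreshed twice),
/// so they are dropped here.
fn recent_tvu_peers(cluster_info: &ClusterInfo) -> HashMap<Pubkey, ContactInfo> {
    let now = timestamp();
    cluster_info
        .all_tvu_peers()
        .into_iter()
        .filter(|node| now.saturating_sub(node.wallclock) < CRDS_GOSSIP_PULL_CRDS_TIMEOUT_MS)
        .map(|node| (node.id, node))
        .collect()
}
```

With the peers collected into a `HashMap`, looking up a vote account's node pubkey becomes `peers.get(&vote_state_node_pubkey)` instead of a linear scan over `all_tvu_peers`, as the second hunk below shows.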
behzad nouri 2021-08-13 12:12:40 +00:00 committed by GitHub
parent 0e0022d9ff
commit 7a789e0763
2 changed files with 107 additions and 92 deletions

core/src/validator.rs

@@ -1,93 +1,98 @@
//! The `validator` module hosts all the validator microservices.
use crate::{
broadcast_stage::BroadcastStageType,
cache_block_meta_service::{CacheBlockMetaSender, CacheBlockMetaService},
cluster_info_vote_listener::VoteTracker,
completed_data_sets_service::CompletedDataSetsService,
consensus::{reconcile_blockstore_roots_with_tower, Tower},
cost_model::CostModel,
rewards_recorder_service::{RewardsRecorderSender, RewardsRecorderService},
sample_performance_service::SamplePerformanceService,
serve_repair::ServeRepair,
serve_repair_service::ServeRepairService,
sigverify,
snapshot_packager_service::{PendingSnapshotPackage, SnapshotPackagerService},
tower_storage::TowerStorage,
tpu::{Tpu, DEFAULT_TPU_COALESCE_MS},
tvu::{Sockets, Tvu, TvuConfig},
};
use crossbeam_channel::{bounded, unbounded};
use rand::{thread_rng, Rng};
use solana_entry::poh::compute_hash_time_ns;
use solana_gossip::{
cluster_info::{
ClusterInfo, Node, DEFAULT_CONTACT_DEBUG_INTERVAL_MILLIS,
DEFAULT_CONTACT_SAVE_INTERVAL_MILLIS,
use {
crate::{
broadcast_stage::BroadcastStageType,
cache_block_meta_service::{CacheBlockMetaSender, CacheBlockMetaService},
cluster_info_vote_listener::VoteTracker,
completed_data_sets_service::CompletedDataSetsService,
consensus::{reconcile_blockstore_roots_with_tower, Tower},
cost_model::CostModel,
rewards_recorder_service::{RewardsRecorderSender, RewardsRecorderService},
sample_performance_service::SamplePerformanceService,
serve_repair::ServeRepair,
serve_repair_service::ServeRepairService,
sigverify,
snapshot_packager_service::{PendingSnapshotPackage, SnapshotPackagerService},
tower_storage::TowerStorage,
tpu::{Tpu, DEFAULT_TPU_COALESCE_MS},
tvu::{Sockets, Tvu, TvuConfig},
},
contact_info::ContactInfo,
gossip_service::GossipService,
};
use solana_ledger::{
bank_forks_utils,
blockstore::{Blockstore, BlockstoreSignals, CompletedSlotsReceiver, PurgeType},
blockstore_db::BlockstoreRecoveryMode,
blockstore_processor::{self, TransactionStatusSender},
leader_schedule::FixedSchedule,
leader_schedule_cache::LeaderScheduleCache,
};
use solana_measure::measure::Measure;
use solana_metrics::datapoint_info;
use solana_poh::{
poh_recorder::{PohRecorder, GRACE_TICKS_FACTOR, MAX_GRACE_SLOTS},
poh_service::{self, PohService},
};
use solana_rpc::{
max_slots::MaxSlots,
optimistically_confirmed_bank_tracker::{
OptimisticallyConfirmedBank, OptimisticallyConfirmedBankTracker,
crossbeam_channel::{bounded, unbounded},
rand::{thread_rng, Rng},
solana_entry::poh::compute_hash_time_ns,
solana_gossip::{
cluster_info::{
ClusterInfo, Node, DEFAULT_CONTACT_DEBUG_INTERVAL_MILLIS,
DEFAULT_CONTACT_SAVE_INTERVAL_MILLIS,
},
contact_info::ContactInfo,
crds_gossip_pull::CRDS_GOSSIP_PULL_CRDS_TIMEOUT_MS,
gossip_service::GossipService,
},
solana_ledger::{
bank_forks_utils,
blockstore::{Blockstore, BlockstoreSignals, CompletedSlotsReceiver, PurgeType},
blockstore_db::BlockstoreRecoveryMode,
blockstore_processor::{self, TransactionStatusSender},
leader_schedule::FixedSchedule,
leader_schedule_cache::LeaderScheduleCache,
},
solana_measure::measure::Measure,
solana_metrics::datapoint_info,
solana_poh::{
poh_recorder::{PohRecorder, GRACE_TICKS_FACTOR, MAX_GRACE_SLOTS},
poh_service::{self, PohService},
},
solana_rpc::{
max_slots::MaxSlots,
optimistically_confirmed_bank_tracker::{
OptimisticallyConfirmedBank, OptimisticallyConfirmedBankTracker,
},
rpc::JsonRpcConfig,
rpc_completed_slots_service::RpcCompletedSlotsService,
rpc_pubsub_service::{PubSubConfig, PubSubService},
rpc_service::JsonRpcService,
rpc_subscriptions::RpcSubscriptions,
transaction_status_service::TransactionStatusService,
},
solana_runtime::{
accounts_db::AccountShrinkThreshold,
accounts_index::AccountSecondaryIndexes,
bank::Bank,
bank_forks::BankForks,
commitment::BlockCommitmentCache,
hardened_unpack::{open_genesis_config, MAX_GENESIS_ARCHIVE_UNPACKED_SIZE},
snapshot_archive_info::SnapshotArchiveInfoGetter,
snapshot_config::SnapshotConfig,
snapshot_utils,
},
solana_sdk::{
clock::Slot,
epoch_schedule::MAX_LEADER_SCHEDULE_EPOCH_OFFSET,
exit::Exit,
genesis_config::GenesisConfig,
hash::Hash,
pubkey::Pubkey,
shred_version::compute_shred_version,
signature::{Keypair, Signer},
timing::timestamp,
},
solana_streamer::socket::SocketAddrSpace,
solana_vote_program::vote_state::VoteState,
std::{
collections::{HashMap, HashSet},
net::SocketAddr,
ops::Deref,
path::{Path, PathBuf},
sync::{
atomic::{AtomicBool, AtomicU64, Ordering},
mpsc::Receiver,
Arc, Mutex, RwLock,
},
thread::{sleep, Builder, JoinHandle},
time::{Duration, Instant},
},
rpc::JsonRpcConfig,
rpc_completed_slots_service::RpcCompletedSlotsService,
rpc_pubsub_service::{PubSubConfig, PubSubService},
rpc_service::JsonRpcService,
rpc_subscriptions::RpcSubscriptions,
transaction_status_service::TransactionStatusService,
};
use solana_runtime::{
accounts_db::AccountShrinkThreshold,
accounts_index::AccountSecondaryIndexes,
bank::Bank,
bank_forks::BankForks,
commitment::BlockCommitmentCache,
hardened_unpack::{open_genesis_config, MAX_GENESIS_ARCHIVE_UNPACKED_SIZE},
snapshot_archive_info::SnapshotArchiveInfoGetter,
snapshot_config::SnapshotConfig,
snapshot_utils,
};
use solana_sdk::{
clock::Slot,
epoch_schedule::MAX_LEADER_SCHEDULE_EPOCH_OFFSET,
exit::Exit,
genesis_config::GenesisConfig,
hash::Hash,
pubkey::Pubkey,
shred_version::compute_shred_version,
signature::{Keypair, Signer},
timing::timestamp,
};
use solana_streamer::socket::SocketAddrSpace;
use solana_vote_program::vote_state::VoteState;
use std::{
collections::HashSet,
net::SocketAddr,
ops::Deref,
path::{Path, PathBuf},
sync::atomic::{AtomicBool, AtomicU64, Ordering},
sync::mpsc::Receiver,
sync::{Arc, Mutex, RwLock},
thread::{sleep, Builder, JoinHandle},
time::{Duration, Instant},
};
const MAX_COMPLETED_DATA_SETS_IN_CHANNEL: usize = 100_000;
@@ -1521,7 +1526,20 @@ fn get_stake_percent_in_gossip(bank: &Bank, cluster_info: &ClusterInfo, log: boo
let mut offline_nodes = vec![];
let mut total_activated_stake = 0;
let all_tvu_peers = cluster_info.all_tvu_peers();
let now = timestamp();
// Nodes contact infos are saved to disk and restored on validator startup.
// Staked nodes entries will not expire until an epoch after. So it
// is necessary here to filter for recent entries to establish liveness.
let peers: HashMap<_, _> = cluster_info
.all_tvu_peers()
.into_iter()
.filter(|node| {
let age = now.saturating_sub(node.wallclock);
// Contact infos are refreshed twice during this period.
age < CRDS_GOSSIP_PULL_CRDS_TIMEOUT_MS
})
.map(|node| (node.id, node))
.collect();
let my_shred_version = cluster_info.my_shred_version();
let my_id = cluster_info.id();
@@ -1537,10 +1555,7 @@ fn get_stake_percent_in_gossip(bank: &Bank, cluster_info: &ClusterInfo, log: boo
.map(|vote_state| vote_state.node_pubkey)
.unwrap_or_default();
if let Some(peer) = all_tvu_peers
.iter()
.find(|peer| peer.id == vote_state_node_pubkey)
{
if let Some(peer) = peers.get(&vote_state_node_pubkey) {
if peer.shred_version == my_shred_version {
trace!(
"observed {} in gossip, (activated_stake={})",

gossip/src/gossip_service.rs

@@ -265,7 +265,7 @@ fn spy(
.into_iter()
.map(|x| x.0)
.collect::<Vec<_>>();
tvu_peers = spy_ref.all_tvu_peers().into_iter().collect::<Vec<_>>();
tvu_peers = spy_ref.all_tvu_peers();
let found_node_by_pubkey = if let Some(pubkey) = find_node_by_pubkey {
all_peers.iter().any(|x| x.id == pubkey)