decouples shreds sig-verify from tpu vote and transaction packets (#26300)

Shreds have a different workload and traffic pattern than TPU vote and
transaction packets. Some of the recent changes to SigVerifyStage are
not suitable, or at least not optimal, for shreds sig-verify; e.g.
random discard, dedup with false positives, discard excess by
IP address, ...

The SigVerifier trait is meant to abstract out the distinctions between
the two pipelines, but in practice it has led to more verbose and
convoluted code.

This commit discards the SigVerifier implementation for shreds
sig-verify and instead provides a standalone stage for verifying shred
signatures.
behzad nouri 2022-07-07 11:13:13 +00:00 committed by GitHub
parent 9723a33d2f
commit 6f4838719b
3 changed files with 193 additions and 127 deletions
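
As orientation before the diffs, here is a minimal, self-contained sketch of the standalone-stage pattern this commit adopts: a dedicated named thread pulls packet batches off a channel, verifies them, and forwards the survivors downstream. The names spawn_verifier and Batch are illustrative only, and the types are simplified stand-ins (a batch is just Vec<Vec<u8>>, verification is a placeholder); the real stage below uses PacketBatch, leader-schedule lookups, GPU sig-verify, and an additional retransmit channel.

    // A minimal sketch of the standalone-stage pattern; not the actual stage.
    use {
        crossbeam_channel::{unbounded, Receiver, RecvTimeoutError, Sender},
        std::{
            thread::{Builder, JoinHandle},
            time::Duration,
        },
    };

    // Stand-in for PacketBatch: a batch is just a list of raw packets.
    type Batch = Vec<Vec<u8>>;

    fn spawn_verifier(receiver: Receiver<Batch>, verified_sender: Sender<Batch>) -> JoinHandle<()> {
        Builder::new()
            .name("shred-verifier".to_string())
            .spawn(move || loop {
                // Block (with a timeout) for the next batch; exit when the
                // upstream channel disconnects.
                let batch = match receiver.recv_timeout(Duration::from_secs(1)) {
                    Ok(batch) => batch,
                    Err(RecvTimeoutError::Timeout) => continue,
                    Err(RecvTimeoutError::Disconnected) => break,
                };
                // Placeholder "verification": drop empty packets.
                let verified: Batch = batch.into_iter().filter(|p| !p.is_empty()).collect();
                // Exit when the downstream consumer hangs up.
                if verified_sender.send(verified).is_err() {
                    break;
                }
            })
            .unwrap()
    }

    fn main() {
        let (packet_sender, packet_receiver) = unbounded();
        let (verified_sender, verified_receiver) = unbounded();
        let verifier = spawn_verifier(packet_receiver, verified_sender);
        packet_sender.send(vec![vec![1u8, 2, 3], vec![]]).unwrap();
        assert_eq!(verified_receiver.recv().unwrap().len(), 1);
        drop(packet_sender); // disconnect => verifier thread exits
        verifier.join().unwrap();
    }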

core/src/shred_fetch_stage.rs

@@ -28,7 +28,7 @@ impl ShredFetchStage {
     // updates packets received on a channel and sends them on another channel
     fn modify_packets(
         recvr: PacketBatchReceiver,
-        sendr: Sender<Vec<PacketBatch>>,
+        sendr: Sender<PacketBatch>,
         bank_forks: &RwLock<BankForks>,
         shred_version: u16,
         name: &'static str,
@@ -46,7 +46,7 @@ impl ShredFetchStage {
         let mut stats = ShredFetchStats::default();
         let mut packet_hasher = PacketHasher::default();
-        while let Some(mut packet_batch) = recvr.iter().next() {
+        for mut packet_batch in recvr {
             if last_updated.elapsed().as_millis() as u64 > DEFAULT_MS_PER_SLOT {
                 last_updated = Instant::now();
                 packet_hasher.reset();
@@ -79,7 +79,7 @@ impl ShredFetchStage {
                 }
            }
            stats.maybe_submit(name, STATS_SUBMIT_CADENCE);
-            if sendr.send(vec![packet_batch]).is_err() {
+            if sendr.send(packet_batch).is_err() {
                 break;
             }
         }
@@ -88,7 +88,7 @@ impl ShredFetchStage {
     fn packet_modifier(
         sockets: Vec<Arc<UdpSocket>>,
         exit: &Arc<AtomicBool>,
-        sender: Sender<Vec<PacketBatch>>,
+        sender: Sender<PacketBatch>,
         recycler: PacketBatchRecycler,
         bank_forks: Arc<RwLock<BankForks>>,
         shred_version: u16,
@@ -132,7 +132,7 @@ impl ShredFetchStage {
         sockets: Vec<Arc<UdpSocket>>,
         forward_sockets: Vec<Arc<UdpSocket>>,
         repair_socket: Arc<UdpSocket>,
-        sender: Sender<Vec<PacketBatch>>,
+        sender: Sender<PacketBatch>,
         shred_version: u16,
         bank_forks: Arc<RwLock<BankForks>>,
         exit: &Arc<AtomicBool>,

core/src/sigverify_shreds.rs

@@ -1,11 +1,5 @@
-#![allow(clippy::implicit_hasher)]
 use {
-    crate::{
-        sigverify,
-        sigverify_stage::{SigVerifier, SigVerifyServiceError},
-    },
-    crossbeam_channel::Sender,
+    crossbeam_channel::{Receiver, RecvTimeoutError, SendError, Sender},
     solana_ledger::{
         leader_schedule_cache::LeaderScheduleCache, shred, sigverify_shreds::verify_shreds_gpu,
     },
@@ -18,87 +12,115 @@ use {
             atomic::{AtomicBool, Ordering},
             Arc, RwLock,
         },
+        thread::{Builder, JoinHandle},
+        time::{Duration, Instant},
     },
 };
-#[derive(Clone)]
-pub struct ShredSigVerifier {
-    pubkey: Pubkey, // TODO: Hot swap will change pubkey.
+#[allow(clippy::enum_variant_names)]
+enum Error {
+    RecvDisconnected,
+    RecvTimeout,
+    SendError,
+}
+pub(crate) fn spawn_shred_sigverify(
+    // TODO: Hot swap will change pubkey.
+    self_pubkey: Pubkey,
     bank_forks: Arc<RwLock<BankForks>>,
     leader_schedule_cache: Arc<LeaderScheduleCache>,
-    recycler_cache: RecyclerCache,
+    shred_fetch_receiver: Receiver<PacketBatch>,
     retransmit_sender: Sender<Vec</*shred:*/ Vec<u8>>>,
-    packet_sender: Sender<Vec<PacketBatch>>,
+    verified_sender: Sender<Vec<PacketBatch>>,
     turbine_disabled: Arc<AtomicBool>,
+) -> JoinHandle<()> {
+    let recycler_cache = RecyclerCache::warmed();
+    let mut stats = ShredSigVerifyStats::new(Instant::now());
+    Builder::new()
+        .name("shred-verifier".to_string())
+        .spawn(move || loop {
+            match run_shred_sigverify(
+                &self_pubkey,
+                &bank_forks,
+                &leader_schedule_cache,
+                &recycler_cache,
+                &shred_fetch_receiver,
+                &retransmit_sender,
+                &verified_sender,
+                &turbine_disabled,
+                &mut stats,
+            ) {
+                Ok(()) => (),
+                Err(Error::RecvTimeout) => (),
+                Err(Error::RecvDisconnected) => break,
+                Err(Error::SendError) => break,
+            }
+            stats.maybe_submit();
+        })
+        .unwrap()
 }
-impl ShredSigVerifier {
-    pub fn new(
-        pubkey: Pubkey,
-        bank_forks: Arc<RwLock<BankForks>>,
-        leader_schedule_cache: Arc<LeaderScheduleCache>,
-        retransmit_sender: Sender<Vec</*shred:*/ Vec<u8>>>,
-        packet_sender: Sender<Vec<PacketBatch>>,
-        turbine_disabled: Arc<AtomicBool>,
-    ) -> Self {
-        sigverify::init();
-        Self {
-            pubkey,
-            bank_forks,
-            leader_schedule_cache,
-            recycler_cache: RecyclerCache::warmed(),
-            retransmit_sender,
-            packet_sender,
-            turbine_disabled,
-        }
-    }
-}
-impl SigVerifier for ShredSigVerifier {
-    type SendType = Vec<PacketBatch>;
-    fn send_packets(
-        &mut self,
-        packet_batches: Vec<PacketBatch>,
-    ) -> Result<(), SigVerifyServiceError<Self::SendType>> {
-        if self.turbine_disabled.load(Ordering::Relaxed) {
-            return Ok(());
-        }
-        // Exclude repair packets from retransmit.
-        // TODO: return the error here!
-        let _ = self.retransmit_sender.send(
-            packet_batches
-                .iter()
-                .flat_map(PacketBatch::iter)
-                .filter(|packet| !packet.meta.discard() && !packet.meta.repair())
-                .filter_map(shred::layout::get_shred)
-                .map(<[u8]>::to_vec)
-                .collect(),
-        );
-        self.packet_sender.send(packet_batches)?;
-        Ok(())
-    }
-    fn verify_batches(
-        &self,
-        mut batches: Vec<PacketBatch>,
-        _valid_packets: usize,
-    ) -> Vec<PacketBatch> {
-        let working_bank = self.bank_forks.read().unwrap().working_bank();
-        let leader_slots: HashMap<Slot, [u8; 32]> = get_slot_leaders(
-            &self.pubkey,
-            &mut batches,
-            &self.leader_schedule_cache,
-            &working_bank,
-        )
-        .into_iter()
-        .filter_map(|(slot, pubkey)| Some((slot, pubkey?.to_bytes())))
-        .chain(std::iter::once((Slot::MAX, [0u8; 32])))
+fn run_shred_sigverify(
+    self_pubkey: &Pubkey,
+    bank_forks: &RwLock<BankForks>,
+    leader_schedule_cache: &LeaderScheduleCache,
+    recycler_cache: &RecyclerCache,
+    shred_fetch_receiver: &Receiver<PacketBatch>,
+    retransmit_sender: &Sender<Vec</*shred:*/ Vec<u8>>>,
+    verified_sender: &Sender<Vec<PacketBatch>>,
+    turbine_disabled: &AtomicBool,
+    stats: &mut ShredSigVerifyStats,
+) -> Result<(), Error> {
+    const RECV_TIMEOUT: Duration = Duration::from_secs(1);
+    let packets = shred_fetch_receiver.recv_timeout(RECV_TIMEOUT)?;
+    let mut packets: Vec<_> = std::iter::once(packets)
+        .chain(shred_fetch_receiver.try_iter())
         .collect();
-        let r = verify_shreds_gpu(&batches, &leader_slots, &self.recycler_cache);
-        solana_perf::sigverify::mark_disabled(&mut batches, &r);
-        batches
+    let now = Instant::now();
+    stats.num_iters += 1;
+    stats.num_packets += packets.iter().map(PacketBatch::len).sum::<usize>();
+    stats.num_discards_pre += count_discards(&packets);
+    verify_packets(
+        self_pubkey,
+        bank_forks,
+        leader_schedule_cache,
+        recycler_cache,
+        &mut packets,
+    );
+    stats.num_discards_post += count_discards(&packets);
+    // Exclude repair packets from retransmit.
+    let shreds: Vec<_> = packets
+        .iter()
+        .flat_map(PacketBatch::iter)
+        .filter(|packet| !packet.meta.discard() && !packet.meta.repair())
+        .filter_map(shred::layout::get_shred)
+        .map(<[u8]>::to_vec)
+        .collect();
+    stats.num_retransmit_shreds += shreds.len();
+    if !turbine_disabled.load(Ordering::Relaxed) {
+        retransmit_sender.send(shreds)?;
+        verified_sender.send(packets)?;
     }
+    stats.elapsed_micros += now.elapsed().as_micros() as u64;
+    Ok(())
+}
+fn verify_packets(
+    self_pubkey: &Pubkey,
+    bank_forks: &RwLock<BankForks>,
+    leader_schedule_cache: &LeaderScheduleCache,
+    recycler_cache: &RecyclerCache,
+    packets: &mut [PacketBatch],
+) {
+    let working_bank = bank_forks.read().unwrap().working_bank();
+    let leader_slots: HashMap<Slot, [u8; 32]> =
+        get_slot_leaders(self_pubkey, packets, leader_schedule_cache, &working_bank)
+            .into_iter()
+            .filter_map(|(slot, pubkey)| Some((slot, pubkey?.to_bytes())))
+            .chain(std::iter::once((Slot::MAX, [0u8; 32])))
+            .collect();
+    let out = verify_shreds_gpu(packets, &leader_slots, recycler_cache);
+    solana_perf::sigverify::mark_disabled(packets, &out);
 }
 // Returns pubkey of leaders for shred slots refrenced in the packets.
@@ -139,11 +161,75 @@ fn get_slot_leaders(
     leaders
 }
+fn count_discards(packets: &[PacketBatch]) -> usize {
+    packets
+        .iter()
+        .flat_map(PacketBatch::iter)
+        .filter(|packet| packet.meta.discard())
+        .count()
+}
+impl From<RecvTimeoutError> for Error {
+    fn from(err: RecvTimeoutError) -> Self {
+        match err {
+            RecvTimeoutError::Timeout => Self::RecvTimeout,
+            RecvTimeoutError::Disconnected => Self::RecvDisconnected,
+        }
+    }
+}
+impl<T> From<SendError<T>> for Error {
+    fn from(_: SendError<T>) -> Self {
+        Self::SendError
+    }
+}
+struct ShredSigVerifyStats {
+    since: Instant,
+    num_iters: usize,
+    num_packets: usize,
+    num_discards_pre: usize,
+    num_discards_post: usize,
+    num_retransmit_shreds: usize,
+    elapsed_micros: u64,
+}
+impl ShredSigVerifyStats {
+    const METRICS_SUBMIT_CADENCE: Duration = Duration::from_secs(2);
+    fn new(now: Instant) -> Self {
+        Self {
+            since: now,
+            num_iters: 0usize,
+            num_packets: 0usize,
+            num_discards_pre: 0usize,
+            num_discards_post: 0usize,
+            num_retransmit_shreds: 0usize,
+            elapsed_micros: 0u64,
+        }
+    }
+    fn maybe_submit(&mut self) {
+        if self.since.elapsed() <= Self::METRICS_SUBMIT_CADENCE {
+            return;
+        }
+        datapoint_info!(
+            "shred_sigverify",
+            ("num_iters", self.num_iters, i64),
+            ("num_packets", self.num_packets, i64),
+            ("num_discards_pre", self.num_discards_pre, i64),
+            ("num_discards_post", self.num_discards_post, i64),
+            ("num_retransmit_shreds", self.num_retransmit_shreds, i64),
+            ("elapsed_micros", self.elapsed_micros, i64),
+        );
+        *self = Self::new(Instant::now());
+    }
+}
 #[cfg(test)]
-pub mod tests {
+mod tests {
     use {
         super::*,
-        crossbeam_channel::unbounded,
         solana_ledger::{
             genesis_utils::create_genesis_config_with_leader,
             shred::{Shred, ShredFlags},
@@ -160,18 +246,8 @@ pub mod tests {
         let bank = Bank::new_for_tests(
             &create_genesis_config_with_leader(100, &leader_pubkey, 10).genesis_config,
         );
-        let cache = Arc::new(LeaderScheduleCache::new_from_bank(&bank));
-        let bf = Arc::new(RwLock::new(BankForks::new(bank)));
-        let (sender, receiver) = unbounded();
-        let (retransmit_sender, _retransmit_receiver) = unbounded();
-        let mut verifier = ShredSigVerifier::new(
-            Pubkey::new_unique(),
-            bf,
-            cache,
-            retransmit_sender,
-            sender,
-            Arc::<AtomicBool>::default(), // turbine_disabled
-        );
+        let leader_schedule_cache = LeaderScheduleCache::new_from_bank(&bank);
+        let bank_forks = RwLock::new(BankForks::new(bank));
         let batch_size = 2;
         let mut batch = PacketBatch::with_capacity(batch_size);
         batch.resize(batch_size, Packet::default());
@@ -206,20 +282,14 @@ pub mod tests {
         batches[0][1].buffer_mut()[..shred.payload().len()].copy_from_slice(shred.payload());
         batches[0][1].meta.size = shred.payload().len();
-        let num_packets = solana_perf::sigverify::count_packets_in_batches(&batches);
-        let rv = verifier.verify_batches(batches, num_packets);
-        assert!(!rv[0][0].meta.discard());
-        assert!(rv[0][1].meta.discard());
-        verifier.send_packets(rv.clone()).unwrap();
-        let received_packets = receiver.recv().unwrap();
-        assert_eq!(received_packets.len(), rv.len());
-        for (received_packet_batch, original_packet_batch) in received_packets.iter().zip(rv.iter())
-        {
-            assert_eq!(
-                received_packet_batch.iter().collect::<Vec<_>>(),
-                original_packet_batch.iter().collect::<Vec<_>>()
-            );
-        }
+        verify_packets(
+            &Pubkey::new_unique(), // self_pubkey
+            &bank_forks,
+            &leader_schedule_cache,
+            &RecyclerCache::warmed(),
+            &mut batches,
+        );
+        assert!(!batches[0][0].meta.discard());
+        assert!(batches[0][1].meta.discard());
     }
 }
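
One detail of run_shred_sigverify above worth calling out: it blocks (with a timeout) only for the first PacketBatch, then drains whatever else is already queued via try_iter, so an entire backlog is verified in a single pass. A standalone sketch of that draining pattern, with plain integers standing in for packet batches:

    use {
        crossbeam_channel::{unbounded, RecvTimeoutError},
        std::time::Duration,
    };

    fn main() -> Result<(), RecvTimeoutError> {
        let (sender, receiver) = unbounded();
        for x in 0..5u64 {
            sender.send(x).unwrap();
        }
        // Block (with a timeout) for the first message only...
        let first = receiver.recv_timeout(Duration::from_secs(1))?;
        // ...then drain whatever else is already queued without blocking.
        let batch: Vec<u64> = std::iter::once(first).chain(receiver.try_iter()).collect();
        assert_eq!(batch, vec![0, 1, 2, 3, 4]);
        Ok(())
    }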

core/src/tvu.rs

@@ -20,8 +20,7 @@ use {
         retransmit_stage::RetransmitStage,
         rewards_recorder_service::RewardsRecorderSender,
         shred_fetch_stage::ShredFetchStage,
-        sigverify_shreds::ShredSigVerifier,
-        sigverify_stage::SigVerifyStage,
+        sigverify_shreds,
         tower_storage::TowerStorage,
         validator::ProcessBlockStore,
         voting_service::VotingService,
@@ -56,13 +55,13 @@ use {
         collections::HashSet,
         net::UdpSocket,
         sync::{atomic::AtomicBool, Arc, RwLock},
-        thread,
+        thread::{self, JoinHandle},
     },
 };
 pub struct Tvu {
     fetch_stage: ShredFetchStage,
-    sigverify_stage: SigVerifyStage,
+    shred_sigverify: JoinHandle<()>,
     retransmit_stage: RetransmitStage,
     window_service: WindowService,
     cluster_slots_service: ClusterSlotsService,
@@ -163,17 +162,14 @@ impl Tvu {
         let (verified_sender, verified_receiver) = unbounded();
         let (retransmit_sender, retransmit_receiver) = unbounded();
-        let sigverify_stage = SigVerifyStage::new(
+        let shred_sigverify = sigverify_shreds::spawn_shred_sigverify(
+            cluster_info.id(),
+            bank_forks.clone(),
+            leader_schedule_cache.clone(),
             fetch_receiver,
-            ShredSigVerifier::new(
-                cluster_info.id(),
-                bank_forks.clone(),
-                leader_schedule_cache.clone(),
-                retransmit_sender.clone(),
-                verified_sender,
-                turbine_disabled,
-            ),
-            "shred-verifier",
+            retransmit_sender.clone(),
+            verified_sender,
+            turbine_disabled,
         );
         let retransmit_stage = RetransmitStage::new(
@@ -319,7 +315,7 @@ impl Tvu {
         Tvu {
             fetch_stage,
-            sigverify_stage,
+            shred_sigverify,
             retransmit_stage,
             window_service,
             cluster_slots_service,
@@ -338,7 +334,7 @@ impl Tvu {
         self.window_service.join()?;
         self.cluster_slots_service.join()?;
         self.fetch_stage.join()?;
-        self.sigverify_stage.join()?;
+        self.shred_sigverify.join()?;
         if self.ledger_cleanup_service.is_some() {
             self.ledger_cleanup_service.unwrap().join()?;
         }