Fix Retransmit slamming the leader with its own blobs (#3938)
This commit is contained in:
parent
69e67d06a7
commit
43f7cd8149
|
@ -646,6 +646,7 @@ impl ClusterInfo {
|
||||||
obj: &Arc<RwLock<Self>>,
|
obj: &Arc<RwLock<Self>>,
|
||||||
peers: &[ContactInfo],
|
peers: &[ContactInfo],
|
||||||
blob: &SharedBlob,
|
blob: &SharedBlob,
|
||||||
|
slot_leader_id: Option<Pubkey>,
|
||||||
s: &UdpSocket,
|
s: &UdpSocket,
|
||||||
forwarded: bool,
|
forwarded: bool,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
|
@ -661,6 +662,7 @@ impl ClusterInfo {
|
||||||
trace!("retransmit orders {}", orders.len());
|
trace!("retransmit orders {}", orders.len());
|
||||||
let errs: Vec<_> = orders
|
let errs: Vec<_> = orders
|
||||||
.par_iter()
|
.par_iter()
|
||||||
|
.filter(|v| v.id != slot_leader_id.unwrap_or_default())
|
||||||
.map(|v| {
|
.map(|v| {
|
||||||
debug!(
|
debug!(
|
||||||
"{}: retransmit blob {} to {} {}",
|
"{}: retransmit blob {} to {} {}",
|
||||||
|
@ -686,19 +688,6 @@ impl ClusterInfo {
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
/// retransmit messages from the leader to layer 1 nodes
|
|
||||||
/// # Remarks
|
|
||||||
/// We need to avoid having obj locked while doing any io, such as the `send_to`
|
|
||||||
pub fn retransmit(
|
|
||||||
obj: &Arc<RwLock<Self>>,
|
|
||||||
blob: &SharedBlob,
|
|
||||||
s: &UdpSocket,
|
|
||||||
forwarded: bool,
|
|
||||||
) -> Result<()> {
|
|
||||||
let peers = obj.read().unwrap().retransmit_peers();
|
|
||||||
ClusterInfo::retransmit_to(obj, &peers, blob, s, forwarded)
|
|
||||||
}
|
|
||||||
|
|
||||||
fn send_orders(
|
fn send_orders(
|
||||||
id: &Pubkey,
|
id: &Pubkey,
|
||||||
s: &UdpSocket,
|
s: &UdpSocket,
|
||||||
|
|
|
@ -235,6 +235,7 @@ impl Replicator {
|
||||||
|
|
||||||
let window_service = WindowService::new(
|
let window_service = WindowService::new(
|
||||||
None, //TODO: need a way to validate blobs... https://github.com/solana-labs/solana/issues/3924
|
None, //TODO: need a way to validate blobs... https://github.com/solana-labs/solana/issues/3924
|
||||||
|
None, //TODO: see above ^
|
||||||
blocktree.clone(),
|
blocktree.clone(),
|
||||||
cluster_info.clone(),
|
cluster_info.clone(),
|
||||||
blob_fetch_receiver,
|
blob_fetch_receiver,
|
||||||
|
|
|
@ -5,6 +5,7 @@ use crate::blocktree::Blocktree;
|
||||||
use crate::cluster_info::{
|
use crate::cluster_info::{
|
||||||
compute_retransmit_peers, ClusterInfo, GROW_LAYER_CAPACITY, NEIGHBORHOOD_SIZE,
|
compute_retransmit_peers, ClusterInfo, GROW_LAYER_CAPACITY, NEIGHBORHOOD_SIZE,
|
||||||
};
|
};
|
||||||
|
use crate::leader_schedule_cache::LeaderScheduleCache;
|
||||||
use crate::result::{Error, Result};
|
use crate::result::{Error, Result};
|
||||||
use crate::service::Service;
|
use crate::service::Service;
|
||||||
use crate::staking_utils;
|
use crate::staking_utils;
|
||||||
|
@ -22,19 +23,20 @@ use std::time::Duration;
|
||||||
|
|
||||||
fn retransmit(
|
fn retransmit(
|
||||||
bank_forks: &Arc<RwLock<BankForks>>,
|
bank_forks: &Arc<RwLock<BankForks>>,
|
||||||
|
leader_schedule_cache: &Arc<LeaderScheduleCache>,
|
||||||
cluster_info: &Arc<RwLock<ClusterInfo>>,
|
cluster_info: &Arc<RwLock<ClusterInfo>>,
|
||||||
r: &BlobReceiver,
|
r: &BlobReceiver,
|
||||||
sock: &UdpSocket,
|
sock: &UdpSocket,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let timer = Duration::new(1, 0);
|
let timer = Duration::new(1, 0);
|
||||||
let mut dq = r.recv_timeout(timer)?;
|
let mut blobs = r.recv_timeout(timer)?;
|
||||||
while let Ok(mut nq) = r.try_recv() {
|
while let Ok(mut nq) = r.try_recv() {
|
||||||
dq.append(&mut nq);
|
blobs.append(&mut nq);
|
||||||
}
|
}
|
||||||
|
|
||||||
submit(
|
submit(
|
||||||
influxdb::Point::new("retransmit-stage")
|
influxdb::Point::new("retransmit-stage")
|
||||||
.add_field("count", influxdb::Value::Integer(dq.len() as i64))
|
.add_field("count", influxdb::Value::Integer(blobs.len() as i64))
|
||||||
.to_owned(),
|
.to_owned(),
|
||||||
);
|
);
|
||||||
let r_bank = bank_forks.read().unwrap().working_bank();
|
let r_bank = bank_forks.read().unwrap().working_bank();
|
||||||
|
@ -46,12 +48,14 @@ fn retransmit(
|
||||||
NEIGHBORHOOD_SIZE,
|
NEIGHBORHOOD_SIZE,
|
||||||
GROW_LAYER_CAPACITY,
|
GROW_LAYER_CAPACITY,
|
||||||
);
|
);
|
||||||
for b in &dq {
|
for blob in &blobs {
|
||||||
if b.read().unwrap().meta.forward {
|
let leader = leader_schedule_cache
|
||||||
ClusterInfo::retransmit_to(&cluster_info, &neighbors, b, sock, true)?;
|
.slot_leader_at_else_compute(blob.read().unwrap().slot(), r_bank.as_ref());
|
||||||
ClusterInfo::retransmit_to(&cluster_info, &children, b, sock, false)?;
|
if blob.read().unwrap().meta.forward {
|
||||||
|
ClusterInfo::retransmit_to(&cluster_info, &neighbors, blob, leader, sock, true)?;
|
||||||
|
ClusterInfo::retransmit_to(&cluster_info, &children, blob, leader, sock, false)?;
|
||||||
} else {
|
} else {
|
||||||
ClusterInfo::retransmit_to(&cluster_info, &children, b, sock, true)?;
|
ClusterInfo::retransmit_to(&cluster_info, &children, blob, leader, sock, true)?;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(())
|
||||||
|
@ -68,16 +72,24 @@ fn retransmit(
|
||||||
fn retransmitter(
|
fn retransmitter(
|
||||||
sock: Arc<UdpSocket>,
|
sock: Arc<UdpSocket>,
|
||||||
bank_forks: Arc<RwLock<BankForks>>,
|
bank_forks: Arc<RwLock<BankForks>>,
|
||||||
|
leader_schedule_cache: &Arc<LeaderScheduleCache>,
|
||||||
cluster_info: Arc<RwLock<ClusterInfo>>,
|
cluster_info: Arc<RwLock<ClusterInfo>>,
|
||||||
r: BlobReceiver,
|
r: BlobReceiver,
|
||||||
) -> JoinHandle<()> {
|
) -> JoinHandle<()> {
|
||||||
let bank_forks = bank_forks.clone();
|
let bank_forks = bank_forks.clone();
|
||||||
|
let leader_schedule_cache = leader_schedule_cache.clone();
|
||||||
Builder::new()
|
Builder::new()
|
||||||
.name("solana-retransmitter".to_string())
|
.name("solana-retransmitter".to_string())
|
||||||
.spawn(move || {
|
.spawn(move || {
|
||||||
trace!("retransmitter started");
|
trace!("retransmitter started");
|
||||||
loop {
|
loop {
|
||||||
if let Err(e) = retransmit(&bank_forks, &cluster_info, &r, &sock) {
|
if let Err(e) = retransmit(
|
||||||
|
&bank_forks,
|
||||||
|
&leader_schedule_cache,
|
||||||
|
&cluster_info,
|
||||||
|
&r,
|
||||||
|
&sock,
|
||||||
|
) {
|
||||||
match e {
|
match e {
|
||||||
Error::RecvTimeoutError(RecvTimeoutError::Disconnected) => break,
|
Error::RecvTimeoutError(RecvTimeoutError::Disconnected) => break,
|
||||||
Error::RecvTimeoutError(RecvTimeoutError::Timeout) => (),
|
Error::RecvTimeoutError(RecvTimeoutError::Timeout) => (),
|
||||||
|
@ -101,6 +113,7 @@ impl RetransmitStage {
|
||||||
#[allow(clippy::new_ret_no_self)]
|
#[allow(clippy::new_ret_no_self)]
|
||||||
pub fn new(
|
pub fn new(
|
||||||
bank_forks: Arc<RwLock<BankForks>>,
|
bank_forks: Arc<RwLock<BankForks>>,
|
||||||
|
leader_schedule_cache: &Arc<LeaderScheduleCache>,
|
||||||
blocktree: Arc<Blocktree>,
|
blocktree: Arc<Blocktree>,
|
||||||
cluster_info: &Arc<RwLock<ClusterInfo>>,
|
cluster_info: &Arc<RwLock<ClusterInfo>>,
|
||||||
retransmit_socket: Arc<UdpSocket>,
|
retransmit_socket: Arc<UdpSocket>,
|
||||||
|
@ -113,11 +126,13 @@ impl RetransmitStage {
|
||||||
let t_retransmit = retransmitter(
|
let t_retransmit = retransmitter(
|
||||||
retransmit_socket,
|
retransmit_socket,
|
||||||
bank_forks.clone(),
|
bank_forks.clone(),
|
||||||
|
leader_schedule_cache,
|
||||||
cluster_info.clone(),
|
cluster_info.clone(),
|
||||||
retransmit_receiver,
|
retransmit_receiver,
|
||||||
);
|
);
|
||||||
let window_service = WindowService::new(
|
let window_service = WindowService::new(
|
||||||
Some(bank_forks),
|
Some(bank_forks),
|
||||||
|
Some(leader_schedule_cache.clone()),
|
||||||
blocktree,
|
blocktree,
|
||||||
cluster_info.clone(),
|
cluster_info.clone(),
|
||||||
fetch_stage_receiver,
|
fetch_stage_receiver,
|
||||||
|
|
|
@ -103,6 +103,7 @@ impl Tvu {
|
||||||
//then sent to the window, which does the erasure coding reconstruction
|
//then sent to the window, which does the erasure coding reconstruction
|
||||||
let retransmit_stage = RetransmitStage::new(
|
let retransmit_stage = RetransmitStage::new(
|
||||||
bank_forks.clone(),
|
bank_forks.clone(),
|
||||||
|
leader_schedule_cache,
|
||||||
blocktree.clone(),
|
blocktree.clone(),
|
||||||
&cluster_info,
|
&cluster_info,
|
||||||
Arc::new(retransmit_socket),
|
Arc::new(retransmit_socket),
|
||||||
|
|
|
@ -4,6 +4,7 @@
|
||||||
use crate::bank_forks::BankForks;
|
use crate::bank_forks::BankForks;
|
||||||
use crate::blocktree::Blocktree;
|
use crate::blocktree::Blocktree;
|
||||||
use crate::cluster_info::ClusterInfo;
|
use crate::cluster_info::ClusterInfo;
|
||||||
|
use crate::leader_schedule_cache::LeaderScheduleCache;
|
||||||
use crate::leader_schedule_utils::slot_leader_at;
|
use crate::leader_schedule_utils::slot_leader_at;
|
||||||
use crate::packet::{Blob, SharedBlob, BLOB_HEADER_SIZE};
|
use crate::packet::{Blob, SharedBlob, BLOB_HEADER_SIZE};
|
||||||
use crate::repair_service::{RepairService, RepairSlotRange};
|
use crate::repair_service::{RepairService, RepairSlotRange};
|
||||||
|
@ -71,8 +72,19 @@ fn process_blobs(blobs: &[SharedBlob], blocktree: &Arc<Blocktree>) -> Result<()>
|
||||||
|
|
||||||
/// drop blobs that are from myself or not from the correct leader for the
|
/// drop blobs that are from myself or not from the correct leader for the
|
||||||
/// blob's slot
|
/// blob's slot
|
||||||
fn should_retransmit_and_persist(blob: &Blob, bank: Option<&Arc<Bank>>, my_id: &Pubkey) -> bool {
|
fn should_retransmit_and_persist(
|
||||||
let slot_leader_id = bank.and_then(|bank| slot_leader_at(blob.slot(), &bank));
|
blob: &Blob,
|
||||||
|
bank: Option<&Arc<Bank>>,
|
||||||
|
leader_schedule_cache: Option<&Arc<LeaderScheduleCache>>,
|
||||||
|
my_id: &Pubkey,
|
||||||
|
) -> bool {
|
||||||
|
let slot_leader_id = match bank {
|
||||||
|
None => leader_schedule_cache.and_then(|cache| cache.slot_leader_at(blob.slot())),
|
||||||
|
Some(bank) => match leader_schedule_cache {
|
||||||
|
None => slot_leader_at(blob.slot(), &bank),
|
||||||
|
Some(cache) => cache.slot_leader_at_else_compute(blob.slot(), bank),
|
||||||
|
},
|
||||||
|
};
|
||||||
|
|
||||||
if blob.id() == *my_id {
|
if blob.id() == *my_id {
|
||||||
inc_new_counter_info!("streamer-recv_window-circular_transmission", 1);
|
inc_new_counter_info!("streamer-recv_window-circular_transmission", 1);
|
||||||
|
@ -90,6 +102,7 @@ fn should_retransmit_and_persist(blob: &Blob, bank: Option<&Arc<Bank>>, my_id: &
|
||||||
|
|
||||||
fn recv_window(
|
fn recv_window(
|
||||||
bank_forks: Option<&Arc<RwLock<BankForks>>>,
|
bank_forks: Option<&Arc<RwLock<BankForks>>>,
|
||||||
|
leader_schedule_cache: Option<&Arc<LeaderScheduleCache>>,
|
||||||
blocktree: &Arc<Blocktree>,
|
blocktree: &Arc<Blocktree>,
|
||||||
my_id: &Pubkey,
|
my_id: &Pubkey,
|
||||||
r: &BlobReceiver,
|
r: &BlobReceiver,
|
||||||
|
@ -110,6 +123,7 @@ fn recv_window(
|
||||||
bank_forks
|
bank_forks
|
||||||
.map(|bank_forks| bank_forks.read().unwrap().working_bank())
|
.map(|bank_forks| bank_forks.read().unwrap().working_bank())
|
||||||
.as_ref(),
|
.as_ref(),
|
||||||
|
leader_schedule_cache,
|
||||||
my_id,
|
my_id,
|
||||||
)
|
)
|
||||||
});
|
});
|
||||||
|
@ -154,6 +168,7 @@ pub struct WindowService {
|
||||||
impl WindowService {
|
impl WindowService {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
bank_forks: Option<Arc<RwLock<BankForks>>>,
|
bank_forks: Option<Arc<RwLock<BankForks>>>,
|
||||||
|
leader_schedule_cache: Option<Arc<LeaderScheduleCache>>,
|
||||||
blocktree: Arc<Blocktree>,
|
blocktree: Arc<Blocktree>,
|
||||||
cluster_info: Arc<RwLock<ClusterInfo>>,
|
cluster_info: Arc<RwLock<ClusterInfo>>,
|
||||||
r: BlobReceiver,
|
r: BlobReceiver,
|
||||||
|
@ -171,6 +186,7 @@ impl WindowService {
|
||||||
);
|
);
|
||||||
let exit = exit.clone();
|
let exit = exit.clone();
|
||||||
let bank_forks = bank_forks.clone();
|
let bank_forks = bank_forks.clone();
|
||||||
|
let leader_schedule_cache = leader_schedule_cache.clone();
|
||||||
let t_window = Builder::new()
|
let t_window = Builder::new()
|
||||||
.name("solana-window".to_string())
|
.name("solana-window".to_string())
|
||||||
.spawn(move || {
|
.spawn(move || {
|
||||||
|
@ -181,9 +197,14 @@ impl WindowService {
|
||||||
if exit.load(Ordering::Relaxed) {
|
if exit.load(Ordering::Relaxed) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
if let Err(e) =
|
if let Err(e) = recv_window(
|
||||||
recv_window(bank_forks.as_ref(), &blocktree, &id, &r, &retransmit)
|
bank_forks.as_ref(),
|
||||||
{
|
leader_schedule_cache.as_ref(),
|
||||||
|
&blocktree,
|
||||||
|
&id,
|
||||||
|
&r,
|
||||||
|
&retransmit,
|
||||||
|
) {
|
||||||
match e {
|
match e {
|
||||||
Error::RecvTimeoutError(RecvTimeoutError::Disconnected) => break,
|
Error::RecvTimeoutError(RecvTimeoutError::Disconnected) => break,
|
||||||
Error::RecvTimeoutError(RecvTimeoutError::Timeout) => (),
|
Error::RecvTimeoutError(RecvTimeoutError::Timeout) => (),
|
||||||
|
@ -263,23 +284,27 @@ mod test {
|
||||||
let bank = Arc::new(Bank::new(
|
let bank = Arc::new(Bank::new(
|
||||||
&GenesisBlock::new_with_leader(100, &leader_id, 10).0,
|
&GenesisBlock::new_with_leader(100, &leader_id, 10).0,
|
||||||
));
|
));
|
||||||
|
let cache = Arc::new(LeaderScheduleCache::new_from_bank(&bank));
|
||||||
|
|
||||||
let mut blob = Blob::default();
|
let mut blob = Blob::default();
|
||||||
blob.set_id(&leader_id);
|
blob.set_id(&leader_id);
|
||||||
|
|
||||||
// without a Bank and blobs not from me, blob continues
|
// without a Bank and blobs not from me, blob continues
|
||||||
assert_eq!(should_retransmit_and_persist(&blob, None, &me_id), true);
|
assert_eq!(
|
||||||
|
should_retransmit_and_persist(&blob, None, None, &me_id),
|
||||||
|
true
|
||||||
|
);
|
||||||
|
|
||||||
// with a Bank for slot 0, blob continues
|
// with a Bank for slot 0, blob continues
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
should_retransmit_and_persist(&blob, Some(&bank), &me_id),
|
should_retransmit_and_persist(&blob, Some(&bank), Some(&cache), &me_id),
|
||||||
true
|
true
|
||||||
);
|
);
|
||||||
|
|
||||||
// set the blob to have come from the wrong leader
|
// set the blob to have come from the wrong leader
|
||||||
blob.set_id(&Pubkey::new_rand());
|
blob.set_id(&Pubkey::new_rand());
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
should_retransmit_and_persist(&blob, Some(&bank), &me_id),
|
should_retransmit_and_persist(&blob, Some(&bank), Some(&cache), &me_id),
|
||||||
false
|
false
|
||||||
);
|
);
|
||||||
|
|
||||||
|
@ -287,13 +312,16 @@ mod test {
|
||||||
// TODO: persistr in blocktree that we didn't know who the leader was at the time?
|
// TODO: persistr in blocktree that we didn't know who the leader was at the time?
|
||||||
blob.set_slot(100);
|
blob.set_slot(100);
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
should_retransmit_and_persist(&blob, Some(&bank), &me_id),
|
should_retransmit_and_persist(&blob, Some(&bank), Some(&cache), &me_id),
|
||||||
true
|
true
|
||||||
);
|
);
|
||||||
|
|
||||||
// if the blob came back from me, it doesn't continue, whether or not I have a bank
|
// if the blob came back from me, it doesn't continue, whether or not I have a bank
|
||||||
blob.set_id(&me_id);
|
blob.set_id(&me_id);
|
||||||
assert_eq!(should_retransmit_and_persist(&blob, None, &me_id), false);
|
assert_eq!(
|
||||||
|
should_retransmit_and_persist(&blob, None, None, &me_id),
|
||||||
|
false
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
@ -315,11 +343,13 @@ mod test {
|
||||||
let blocktree = Arc::new(
|
let blocktree = Arc::new(
|
||||||
Blocktree::open(&blocktree_path).expect("Expected to be able to open database ledger"),
|
Blocktree::open(&blocktree_path).expect("Expected to be able to open database ledger"),
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let bank = Bank::new(&GenesisBlock::new_with_leader(100, &me_id, 10).0);
|
||||||
|
let leader_schedule_cache = Arc::new(LeaderScheduleCache::new_from_bank(&bank));
|
||||||
|
let bank_forks = Some(Arc::new(RwLock::new(BankForks::new(0, bank))));
|
||||||
let t_window = WindowService::new(
|
let t_window = WindowService::new(
|
||||||
Some(Arc::new(RwLock::new(BankForks::new(
|
bank_forks,
|
||||||
0,
|
Some(leader_schedule_cache),
|
||||||
Bank::new(&GenesisBlock::new_with_leader(100, &me_id, 10).0),
|
|
||||||
)))),
|
|
||||||
blocktree,
|
blocktree,
|
||||||
subs,
|
subs,
|
||||||
r_reader,
|
r_reader,
|
||||||
|
@ -391,11 +421,12 @@ mod test {
|
||||||
let blocktree = Arc::new(
|
let blocktree = Arc::new(
|
||||||
Blocktree::open(&blocktree_path).expect("Expected to be able to open database ledger"),
|
Blocktree::open(&blocktree_path).expect("Expected to be able to open database ledger"),
|
||||||
);
|
);
|
||||||
|
let bank = Bank::new(&GenesisBlock::new_with_leader(100, &me_id, 10).0);
|
||||||
|
let leader_schedule_cache = Arc::new(LeaderScheduleCache::new_from_bank(&bank));
|
||||||
|
let bank_forks = Some(Arc::new(RwLock::new(BankForks::new(0, bank))));
|
||||||
let t_window = WindowService::new(
|
let t_window = WindowService::new(
|
||||||
Some(Arc::new(RwLock::new(BankForks::new(
|
bank_forks,
|
||||||
0,
|
Some(leader_schedule_cache),
|
||||||
Bank::new(&GenesisBlock::new_with_leader(100, &me_id, 10).0),
|
|
||||||
)))),
|
|
||||||
blocktree,
|
blocktree,
|
||||||
subs.clone(),
|
subs.clone(),
|
||||||
r_reader,
|
r_reader,
|
||||||
|
|
|
@ -176,7 +176,8 @@ pub fn cluster_info_retransmit() -> result::Result<()> {
|
||||||
assert!(done);
|
assert!(done);
|
||||||
let b = SharedBlob::default();
|
let b = SharedBlob::default();
|
||||||
b.write().unwrap().meta.size = 10;
|
b.write().unwrap().meta.size = 10;
|
||||||
ClusterInfo::retransmit(&c1, &b, &tn1, false)?;
|
let peers = c1.read().unwrap().retransmit_peers();
|
||||||
|
ClusterInfo::retransmit_to(&c1, &peers, &b, None, &tn1, false)?;
|
||||||
let res: Vec<_> = [tn1, tn2, tn3]
|
let res: Vec<_> = [tn1, tn2, tn3]
|
||||||
.into_par_iter()
|
.into_par_iter()
|
||||||
.map(|s| {
|
.map(|s| {
|
||||||
|
|
Loading…
Reference in New Issue