adds fallback logic if retransmit multicast fails (#17714)

In the retransmit stage, each packet is sent to a different set of peers,
chosen from the children/neighbors derived from packet.meta.seed:
https://github.com/solana-labs/solana/blob/708bbcb00/core/src/retransmit_stage.rs#L421-L457
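
For a sense of how the seed drives peer selection, here is a minimal sketch;
the helper name select_retransmit_peers, the flat fanout split, and the
unweighted shuffle are simplifying assumptions, not the actual
retransmit_stage logic (which weights peers by stake and builds a turbine
tree):

use rand::seq::SliceRandom;
use rand::SeedableRng;
use rand_chacha::ChaChaRng;
use std::net::SocketAddr;

// Illustrative only: the same seed always yields the same neighbors/children
// split for a given peer list, so every node retransmits each packet to a
// deterministic, packet-specific set of peers.
fn select_retransmit_peers(
    seed: [u8; 32], // packet.meta.seed
    peers: &[SocketAddr],
    fanout: usize,
) -> (Vec<SocketAddr>, Vec<SocketAddr>) {
    let mut rng = ChaChaRng::from_seed(seed);
    let mut shuffled = peers.to_vec();
    shuffled.shuffle(&mut rng);
    // First `fanout` entries act as neighbors, the next `fanout` as children.
    let neighbors = shuffled.iter().take(fanout).copied().collect();
    let children = shuffled.iter().skip(fanout).take(fanout).copied().collect();
    (neighbors, children)
}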

However, the current code errors out as soon as a multicast call fails,
which skips all of the remaining packets:
https://github.com/solana-labs/solana/blob/708bbcb00/core/src/retransmit_stage.rs#L467-L470

This can exacerbate packet loss in turbine.
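
To see why one failed send is so costly, compare these two toy loops (a
hedged illustration, not the actual retransmit code): the first propagates
the error with `?` and abandons every packet that has not been sent yet,
the second logs the failure and keeps going.

use std::io;

// Aborts the whole batch on the first failed send; every later packet is dropped.
fn send_batch_abort_on_error(
    packets: &[&[u8]],
    send: impl Fn(&[u8]) -> io::Result<()>,
) -> io::Result<()> {
    for &packet in packets {
        send(packet)?; // one transient error skips all remaining packets
    }
    Ok(())
}

// Best-effort: log the failure and keep retransmitting the remaining packets.
fn send_batch_best_effort(packets: &[&[u8]], send: impl Fn(&[u8]) -> io::Result<()>) {
    for &packet in packets {
        if let Err(err) = send(packet) {
            eprintln!("retransmit send failed: {:?}", err);
        }
    }
}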

This commit:
  * keeps iterating over the retransmit-packets loop even if some
    intermediate sends fail.
  * adds a fallback to UdpSocket::send_to if multicast fails (see the
    sketch after this list).
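
A minimal, self-contained sketch of that combined pattern, under the
assumption that the batched sender reports how many destinations it reached
before failing; the multicast stub below is a stand-in, not the
solana_streamer implementation:

use std::io;
use std::net::{SocketAddr, UdpSocket};

// Assumed shape of the batched sender: returns how many destinations the
// payload was delivered to before an error stopped the batch.
fn multicast(sock: &UdpSocket, data: &[u8], dests: &[SocketAddr]) -> io::Result<usize> {
    for (i, dest) in dests.iter().enumerate() {
        if let Err(err) = sock.send_to(data, dest) {
            if i == 0 {
                return Err(err);
            }
            return Ok(i);
        }
    }
    Ok(dests.len())
}

// Try the batched path first; whatever is left over falls back to plain
// UdpSocket::send_to, and failures are logged instead of aborting the caller.
fn retransmit_with_fallback(sock: &UdpSocket, data: &[u8], dests: &[SocketAddr]) {
    let mut dests = dests;
    while !dests.is_empty() {
        match multicast(sock, data, dests) {
            Ok(n) => dests = &dests[n..],
            Err(err) => {
                eprintln!("multicast failed: {:?}", err);
                break;
            }
        }
    }
    for dest in dests {
        if let Err(err) = sock.send_to(data, dest) {
            eprintln!("fallback send_to {} failed: {:?}", dest, err);
        }
    }
}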

Recent discord chat:
https://discord.com/channels/428295358100013066/689412830075551748/849530845052403733
behzad nouri 2021-06-04 12:16:37 +00:00 committed by GitHub
parent bea7ce717c
commit be957f25c9
3 changed files with 23 additions and 22 deletions

@@ -465,9 +465,9 @@ fn retransmit(
         let mut retransmit_time = Measure::start("retransmit_to");
         if !packet.meta.forward {
-            ClusterInfo::retransmit_to(&neighbors, packet, sock, true)?;
+            ClusterInfo::retransmit_to(&neighbors, packet, sock, true);
         }
-        ClusterInfo::retransmit_to(&children, packet, sock, packet.meta.forward)?;
+        ClusterInfo::retransmit_to(&children, packet, sock, packet.meta.forward);
         retransmit_time.stop();
         retransmit_total += retransmit_time.as_us();
     }

@@ -1382,12 +1382,7 @@ impl ClusterInfo {
     /// retransmit messages to a list of nodes
     /// # Remarks
     /// We need to avoid having obj locked while doing a io, such as the `send_to`
-    pub fn retransmit_to(
-        peers: &[&ContactInfo],
-        packet: &Packet,
-        s: &UdpSocket,
-        forwarded: bool,
-    ) -> Result<(), GossipError> {
+    pub fn retransmit_to(peers: &[&ContactInfo], packet: &Packet, s: &UdpSocket, forwarded: bool) {
         trace!("retransmit orders {}", peers.len());
         let dests: Vec<_> = if forwarded {
             peers
@@ -1398,22 +1393,28 @@ impl ClusterInfo {
         } else {
             peers.iter().map(|peer| &peer.tvu).collect()
         };
-        let mut sent = 0;
-        while sent < dests.len() {
-            match multicast(s, &packet.data[..packet.meta.size], &dests[sent..]) {
-                Ok(n) => sent += n,
-                Err(e) => {
-                    inc_new_counter_error!(
-                        "cluster_info-retransmit-send_to_error",
-                        dests.len() - sent,
-                        1
-                    );
-                    error!("retransmit result {:?}", e);
-                    return Err(GossipError::Io(e));
+        let mut dests = &dests[..];
+        let data = &packet.data[..packet.meta.size];
+        while !dests.is_empty() {
+            match multicast(s, data, dests) {
+                Ok(n) => dests = &dests[n..],
+                Err(err) => {
+                    inc_new_counter_error!("cluster_info-retransmit-send_to_error", dests.len(), 1);
+                    error!("retransmit multicast: {:?}", err);
+                    break;
                 }
             }
         }
-        Ok(())
+        let mut errs = 0;
+        for dest in dests {
+            if let Err(err) = s.send_to(data, dest) {
+                error!("retransmit send: {}, {:?}", dest, err);
+                errs += 1;
+            }
+        }
+        if errs != 0 {
+            inc_new_counter_error!("cluster_info-retransmit-error", errs, 1);
+        }
     }
     fn insert_self(&self) {

@@ -209,7 +209,7 @@ pub fn cluster_info_retransmit() {
     p.meta.size = 10;
     let peers = c1.tvu_peers();
     let retransmit_peers: Vec<_> = peers.iter().collect();
-    ClusterInfo::retransmit_to(&retransmit_peers, &p, &tn1, false).unwrap();
+    ClusterInfo::retransmit_to(&retransmit_peers, &p, &tn1, false);
     let res: Vec<_> = [tn1, tn2, tn3]
         .into_par_iter()
         .map(|s| {