adds fallback logic if retransmit multicast fails (#17714)
In retransmit-stage, based on packet.meta.seed and the resulting children/neighbors, each packet is sent to a different set of peers:
https://github.com/solana-labs/solana/blob/708bbcb00/core/src/retransmit_stage.rs#L421-L457

However, the current code errors out as soon as a multicast call fails, which skips all the remaining packets:
https://github.com/solana-labs/solana/blob/708bbcb00/core/src/retransmit_stage.rs#L467-L470

This can exacerbate packet loss in turbine.

This commit:
* keeps iterating over the packets in the retransmit for-loop even if some intermediate sends fail.
* adds a fallback to UdpSocket::send_to if multicast fails.

Recent discord chat: https://discord.com/channels/428295358100013066/689412830075551748/849530845052403733
This commit is contained in:
parent
bea7ce717c
commit
be957f25c9
|
@ -465,9 +465,9 @@ fn retransmit(
|
||||||
|
|
||||||
let mut retransmit_time = Measure::start("retransmit_to");
|
let mut retransmit_time = Measure::start("retransmit_to");
|
||||||
if !packet.meta.forward {
|
if !packet.meta.forward {
|
||||||
ClusterInfo::retransmit_to(&neighbors, packet, sock, true)?;
|
ClusterInfo::retransmit_to(&neighbors, packet, sock, true);
|
||||||
}
|
}
|
||||||
ClusterInfo::retransmit_to(&children, packet, sock, packet.meta.forward)?;
|
ClusterInfo::retransmit_to(&children, packet, sock, packet.meta.forward);
|
||||||
retransmit_time.stop();
|
retransmit_time.stop();
|
||||||
retransmit_total += retransmit_time.as_us();
|
retransmit_total += retransmit_time.as_us();
|
||||||
}
|
}
|
||||||
|
|
|
@ -1382,12 +1382,7 @@ impl ClusterInfo {
|
||||||
/// retransmit messages to a list of nodes
|
/// retransmit messages to a list of nodes
|
||||||
/// # Remarks
|
/// # Remarks
|
||||||
/// We need to avoid having obj locked while doing a io, such as the `send_to`
|
/// We need to avoid having obj locked while doing a io, such as the `send_to`
|
||||||
pub fn retransmit_to(
|
pub fn retransmit_to(peers: &[&ContactInfo], packet: &Packet, s: &UdpSocket, forwarded: bool) {
|
||||||
peers: &[&ContactInfo],
|
|
||||||
packet: &Packet,
|
|
||||||
s: &UdpSocket,
|
|
||||||
forwarded: bool,
|
|
||||||
) -> Result<(), GossipError> {
|
|
||||||
trace!("retransmit orders {}", peers.len());
|
trace!("retransmit orders {}", peers.len());
|
||||||
let dests: Vec<_> = if forwarded {
|
let dests: Vec<_> = if forwarded {
|
||||||
peers
|
peers
|
||||||
|
@ -1398,22 +1393,28 @@ impl ClusterInfo {
|
||||||
} else {
|
} else {
|
||||||
peers.iter().map(|peer| &peer.tvu).collect()
|
peers.iter().map(|peer| &peer.tvu).collect()
|
||||||
};
|
};
|
||||||
let mut sent = 0;
|
let mut dests = &dests[..];
|
||||||
while sent < dests.len() {
|
let data = &packet.data[..packet.meta.size];
|
||||||
match multicast(s, &packet.data[..packet.meta.size], &dests[sent..]) {
|
while !dests.is_empty() {
|
||||||
Ok(n) => sent += n,
|
match multicast(s, data, dests) {
|
||||||
Err(e) => {
|
Ok(n) => dests = &dests[n..],
|
||||||
inc_new_counter_error!(
|
Err(err) => {
|
||||||
"cluster_info-retransmit-send_to_error",
|
inc_new_counter_error!("cluster_info-retransmit-send_to_error", dests.len(), 1);
|
||||||
dests.len() - sent,
|
error!("retransmit multicast: {:?}", err);
|
||||||
1
|
break;
|
||||||
);
|
|
||||||
error!("retransmit result {:?}", e);
|
|
||||||
return Err(GossipError::Io(e));
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
let mut errs = 0;
|
||||||
|
for dest in dests {
|
||||||
|
if let Err(err) = s.send_to(data, dest) {
|
||||||
|
error!("retransmit send: {}, {:?}", dest, err);
|
||||||
|
errs += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if errs != 0 {
|
||||||
|
inc_new_counter_error!("cluster_info-retransmit-error", errs, 1);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn insert_self(&self) {
|
fn insert_self(&self) {
|
||||||
|
|
|
@ -209,7 +209,7 @@ pub fn cluster_info_retransmit() {
|
||||||
p.meta.size = 10;
|
p.meta.size = 10;
|
||||||
let peers = c1.tvu_peers();
|
let peers = c1.tvu_peers();
|
||||||
let retransmit_peers: Vec<_> = peers.iter().collect();
|
let retransmit_peers: Vec<_> = peers.iter().collect();
|
||||||
ClusterInfo::retransmit_to(&retransmit_peers, &p, &tn1, false).unwrap();
|
ClusterInfo::retransmit_to(&retransmit_peers, &p, &tn1, false);
|
||||||
let res: Vec<_> = [tn1, tn2, tn3]
|
let res: Vec<_> = [tn1, tn2, tn3]
|
||||||
.into_par_iter()
|
.into_par_iter()
|
||||||
.map(|s| {
|
.map(|s| {
|
||||||
|
|
Loading…
Reference in New Issue