2018-08-09 12:03:34 -07:00
|
|
|
//! The `retransmit_stage` retransmits blobs between validators
|
2018-06-13 21:52:23 -07:00
|
|
|
|
2019-01-02 00:46:15 -08:00
|
|
|
use crate::bank::Bank;
|
|
|
|
use crate::cluster_info::{ClusterInfo, DATA_PLANE_FANOUT, GROW_LAYER_CAPACITY, NEIGHBORHOOD_SIZE};
|
2018-12-07 19:16:27 -08:00
|
|
|
use crate::counter::Counter;
|
|
|
|
use crate::db_ledger::DbLedger;
|
|
|
|
use crate::entry::Entry;
|
|
|
|
use crate::leader_scheduler::LeaderScheduler;
|
|
|
|
use crate::result::{Error, Result};
|
|
|
|
use crate::service::Service;
|
|
|
|
use crate::streamer::BlobReceiver;
|
|
|
|
use crate::window_service::window_service;
|
2018-08-09 13:41:21 -07:00
|
|
|
use log::Level;
|
2018-11-16 08:45:59 -08:00
|
|
|
use solana_metrics::{influxdb, submit};
|
2018-06-13 21:52:23 -07:00
|
|
|
use std::net::UdpSocket;
|
2018-09-24 14:10:51 -07:00
|
|
|
use std::sync::atomic::{AtomicBool, AtomicUsize};
|
2018-08-09 13:41:21 -07:00
|
|
|
use std::sync::mpsc::RecvTimeoutError;
|
2018-09-21 16:01:24 -07:00
|
|
|
use std::sync::mpsc::{channel, Receiver};
|
2018-06-13 21:52:23 -07:00
|
|
|
use std::sync::{Arc, RwLock};
|
2018-08-09 14:17:50 -07:00
|
|
|
use std::thread::{self, Builder, JoinHandle};
|
2018-08-09 13:41:21 -07:00
|
|
|
use std::time::Duration;
|
2018-06-13 21:52:23 -07:00
|
|
|
|
2018-10-08 19:55:54 -07:00
|
|
|
fn retransmit(
|
2019-01-02 00:46:15 -08:00
|
|
|
bank: &Arc<Bank>,
|
2018-10-08 19:55:54 -07:00
|
|
|
cluster_info: &Arc<RwLock<ClusterInfo>>,
|
|
|
|
r: &BlobReceiver,
|
|
|
|
sock: &UdpSocket,
|
|
|
|
) -> Result<()> {
|
2018-08-09 13:41:21 -07:00
|
|
|
let timer = Duration::new(1, 0);
|
|
|
|
let mut dq = r.recv_timeout(timer)?;
|
|
|
|
while let Ok(mut nq) = r.try_recv() {
|
|
|
|
dq.append(&mut nq);
|
|
|
|
}
|
2018-10-16 12:54:23 -07:00
|
|
|
|
2018-11-16 08:45:59 -08:00
|
|
|
submit(
|
2018-10-16 12:54:23 -07:00
|
|
|
influxdb::Point::new("retransmit-stage")
|
2018-10-20 06:38:20 -07:00
|
|
|
.add_field("count", influxdb::Value::Integer(dq.len() as i64))
|
2018-10-16 12:54:23 -07:00
|
|
|
.to_owned(),
|
|
|
|
);
|
|
|
|
|
2019-01-02 00:46:15 -08:00
|
|
|
// TODO layer 2 logic here
|
|
|
|
// 1 - find out if I am in layer 1 first
|
|
|
|
// 1.1 - If yes, then broadcast to all layer 1 nodes
|
|
|
|
// 1 - using my layer 1 index, broadcast to all layer 2 nodes assuming you know neighborhood size
|
|
|
|
// 1.2 - If no, then figure out what layer I am in and who my neighbors are and only broadcast to them
|
|
|
|
// 1 - also check if there are nodes in lower layers and repeat the layer 1 to layer 2 logic
|
|
|
|
let peers = cluster_info.read().unwrap().sorted_retransmit_peers(bank);
|
|
|
|
let my_id = cluster_info.read().unwrap().id();
|
|
|
|
//calc num_layers and num_neighborhoods using the total number of nodes
|
|
|
|
let (num_layers, layer_indices) = ClusterInfo::describe_data_plane(
|
|
|
|
peers.len(),
|
|
|
|
DATA_PLANE_FANOUT,
|
|
|
|
NEIGHBORHOOD_SIZE,
|
|
|
|
GROW_LAYER_CAPACITY,
|
|
|
|
);
|
|
|
|
if num_layers <= 1 {
|
|
|
|
/* single layer data plane */
|
|
|
|
for b in &mut dq {
|
|
|
|
ClusterInfo::retransmit(&cluster_info, b, sock)?;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
//find my index (my ix is the same as the first node with smaller stake)
|
|
|
|
let my_index = peers
|
|
|
|
.iter()
|
2019-01-23 15:25:54 -08:00
|
|
|
.position(|ci| bank.get_balance(&ci.id) <= bank.get_balance(&my_id));
|
2019-01-02 00:46:15 -08:00
|
|
|
//find my layer
|
|
|
|
let locality = ClusterInfo::localize(
|
|
|
|
&layer_indices,
|
|
|
|
NEIGHBORHOOD_SIZE,
|
|
|
|
my_index.unwrap_or(peers.len() - 1),
|
|
|
|
);
|
|
|
|
let mut retransmit_peers =
|
|
|
|
peers[locality.neighbor_bounds.0..locality.neighbor_bounds.1].to_vec();
|
|
|
|
locality.child_layer_peers.iter().for_each(|&ix| {
|
|
|
|
if let Some(peer) = peers.get(ix) {
|
|
|
|
retransmit_peers.push(peer.clone());
|
|
|
|
}
|
|
|
|
});
|
|
|
|
|
|
|
|
for b in &mut dq {
|
|
|
|
ClusterInfo::retransmit_to(&cluster_info, &retransmit_peers, b, sock)?;
|
|
|
|
}
|
2018-08-09 13:41:21 -07:00
|
|
|
}
|
|
|
|
Ok(())
|
|
|
|
}
|
|
|
|
|
2019-01-02 00:46:15 -08:00
|
|
|
/// Service to retransmit messages from the leader or layer 1 to relevant peer nodes.
|
2018-10-08 19:55:54 -07:00
|
|
|
/// See `cluster_info` for network layer definitions.
|
2018-08-09 13:41:21 -07:00
|
|
|
/// # Arguments
|
|
|
|
/// * `sock` - Socket to read from. Read timeout is set to 1.
|
|
|
|
/// * `exit` - Boolean to signal system exit.
|
2018-10-08 19:55:54 -07:00
|
|
|
/// * `cluster_info` - This structure needs to be updated and populated by the bank and via gossip.
|
2018-08-09 13:41:21 -07:00
|
|
|
/// * `recycler` - Blob recycler.
|
|
|
|
/// * `r` - Receive channel for blobs to be retransmitted to all the layer 1 nodes.
|
2018-10-08 19:55:54 -07:00
|
|
|
fn retransmitter(
|
|
|
|
sock: Arc<UdpSocket>,
|
2019-01-02 00:46:15 -08:00
|
|
|
bank: Arc<Bank>,
|
2018-10-08 19:55:54 -07:00
|
|
|
cluster_info: Arc<RwLock<ClusterInfo>>,
|
|
|
|
r: BlobReceiver,
|
|
|
|
) -> JoinHandle<()> {
|
2018-08-09 13:41:21 -07:00
|
|
|
Builder::new()
|
|
|
|
.name("solana-retransmitter".to_string())
|
|
|
|
.spawn(move || {
|
|
|
|
trace!("retransmitter started");
|
|
|
|
loop {
|
2019-01-02 00:46:15 -08:00
|
|
|
if let Err(e) = retransmit(&bank, &cluster_info, &r, &sock) {
|
2018-08-09 13:41:21 -07:00
|
|
|
match e {
|
|
|
|
Error::RecvTimeoutError(RecvTimeoutError::Disconnected) => break,
|
|
|
|
Error::RecvTimeoutError(RecvTimeoutError::Timeout) => (),
|
|
|
|
_ => {
|
|
|
|
inc_new_counter_info!("streamer-retransmit-error", 1, 1);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
trace!("exiting retransmitter");
|
2018-12-07 19:01:28 -08:00
|
|
|
})
|
|
|
|
.unwrap()
|
2018-08-09 13:41:21 -07:00
|
|
|
}
|
|
|
|
|
2018-08-09 12:03:34 -07:00
|
|
|
/// Owns the worker threads that make up the retransmit stage.
pub struct RetransmitStage {
    /// Join handles of the threads started by `RetransmitStage::new`.
    thread_hdls: Vec<JoinHandle<()>>,
}
|
|
|
|
|
2018-08-09 12:03:34 -07:00
|
|
|
impl RetransmitStage {
|
2019-01-31 13:43:22 -08:00
|
|
|
#[allow(clippy::new_ret_no_self, clippy::too_many_arguments)]
|
2018-06-13 21:52:23 -07:00
|
|
|
pub fn new(
|
2019-01-02 00:46:15 -08:00
|
|
|
bank: &Arc<Bank>,
|
2018-12-18 15:18:57 -08:00
|
|
|
db_ledger: Arc<DbLedger>,
|
2018-10-08 19:55:54 -07:00
|
|
|
cluster_info: &Arc<RwLock<ClusterInfo>>,
|
2018-10-18 22:57:48 -07:00
|
|
|
tick_height: u64,
|
2018-06-27 12:35:58 -07:00
|
|
|
entry_height: u64,
|
2018-08-28 16:32:40 -07:00
|
|
|
retransmit_socket: Arc<UdpSocket>,
|
2018-08-30 12:07:54 -07:00
|
|
|
repair_socket: Arc<UdpSocket>,
|
2018-06-27 11:33:56 -07:00
|
|
|
fetch_stage_receiver: BlobReceiver,
|
2018-10-10 16:49:41 -07:00
|
|
|
leader_scheduler: Arc<RwLock<LeaderScheduler>>,
|
2019-01-31 13:43:22 -08:00
|
|
|
exit: Arc<AtomicBool>,
|
2018-09-21 16:01:24 -07:00
|
|
|
) -> (Self, Receiver<Vec<Entry>>) {
|
2018-06-13 21:52:23 -07:00
|
|
|
let (retransmit_sender, retransmit_receiver) = channel();
|
|
|
|
|
2019-01-02 00:46:15 -08:00
|
|
|
let t_retransmit = retransmitter(
|
|
|
|
retransmit_socket,
|
|
|
|
bank.clone(),
|
|
|
|
cluster_info.clone(),
|
|
|
|
retransmit_receiver,
|
|
|
|
);
|
2018-09-21 16:01:24 -07:00
|
|
|
let (entry_sender, entry_receiver) = channel();
|
2018-09-24 14:10:51 -07:00
|
|
|
let done = Arc::new(AtomicBool::new(false));
|
2018-09-07 15:00:26 -07:00
|
|
|
let t_window = window_service(
|
2018-11-24 19:32:33 -08:00
|
|
|
db_ledger,
|
2018-10-08 19:55:54 -07:00
|
|
|
cluster_info.clone(),
|
2018-10-18 22:57:48 -07:00
|
|
|
tick_height,
|
2018-06-27 12:35:58 -07:00
|
|
|
entry_height,
|
2018-09-24 14:10:51 -07:00
|
|
|
0,
|
2018-06-13 21:52:23 -07:00
|
|
|
fetch_stage_receiver,
|
2019-01-09 14:58:52 -08:00
|
|
|
Some(entry_sender),
|
2018-06-13 21:52:23 -07:00
|
|
|
retransmit_sender,
|
2018-08-30 12:07:54 -07:00
|
|
|
repair_socket,
|
2018-10-10 16:49:41 -07:00
|
|
|
leader_scheduler,
|
2018-09-24 14:10:51 -07:00
|
|
|
done,
|
2019-01-31 13:43:22 -08:00
|
|
|
exit,
|
2018-06-13 21:52:23 -07:00
|
|
|
);
|
|
|
|
|
2018-10-10 16:49:41 -07:00
|
|
|
let thread_hdls = vec![t_retransmit, t_window];
|
2018-12-07 19:01:28 -08:00
|
|
|
(Self { thread_hdls }, entry_receiver)
|
2018-06-13 21:52:23 -07:00
|
|
|
}
|
|
|
|
}
|
2018-07-03 21:14:08 -07:00
|
|
|
|
2018-08-09 12:03:34 -07:00
|
|
|
impl Service for RetransmitStage {
|
2018-10-10 16:49:41 -07:00
|
|
|
type JoinReturnType = ();
|
2018-07-03 21:14:08 -07:00
|
|
|
|
2018-10-10 16:49:41 -07:00
|
|
|
fn join(self) -> thread::Result<()> {
|
|
|
|
for thread_hdl in self.thread_hdls {
|
|
|
|
thread_hdl.join()?;
|
2018-07-03 21:14:08 -07:00
|
|
|
}
|
2018-10-10 16:49:41 -07:00
|
|
|
Ok(())
|
2018-07-03 21:14:08 -07:00
|
|
|
}
|
|
|
|
}
|