solana/src/broadcast_stage.rs

//! The `broadcast_stage` broadcasts data from a leader node to validators
//!
use counter::Counter;
use crdt::{Crdt, CrdtError, NodeInfo, LEADER_ROTATION_INTERVAL};
use entry::Entry;
#[cfg(feature = "erasure")]
use erasure;
use ledger::Block;
use log::Level;
use packet::{BlobRecycler, SharedBlobs};
use rayon::prelude::*;
use result::{Error, Result};
use service::Service;
use std::net::UdpSocket;
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
use std::sync::mpsc::{Receiver, RecvTimeoutError};
use std::sync::{Arc, RwLock};
use std::thread::{self, Builder, JoinHandle};
use std::time::{Duration, Instant};
use timing::duration_as_ms;
use window::{self, SharedWindow, WindowIndex, WindowUtil, WINDOW_SIZE};
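
/// Why the broadcast stage's thread exited: either this node is no longer the
/// scheduled leader, or the entry channel feeding it was disconnected.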
#[derive(Debug, PartialEq, Eq, Clone)]
pub enum BroadcastStageReturnType {
    LeaderRotation,
    ChannelDisconnected,
}
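
/// Pulls batches of entries off `receiver`, converts them to blobs, indexes and
/// caches the blobs in the shared `window` (recycling any blobs they displace),
/// generates erasure coding blobs when the `erasure` feature is enabled, and
/// sends the new blobs to the nodes in `broadcast_table` via `Crdt::broadcast`.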
fn broadcast(
    node_info: &NodeInfo,
    broadcast_table: &[NodeInfo],
    window: &SharedWindow,
    recycler: &BlobRecycler,
    receiver: &Receiver<Vec<Entry>>,
    sock: &UdpSocket,
    transmit_index: &mut WindowIndex,
    receive_index: &mut u64,
) -> Result<()> {
    let id = node_info.id;
    let timer = Duration::new(1, 0);
    let entries = receiver.recv_timeout(timer)?;
    let mut num_entries = entries.len();
    let mut ventries = Vec::new();
    ventries.push(entries);
    while let Ok(entries) = receiver.try_recv() {
        num_entries += entries.len();
        ventries.push(entries);
    }

    let to_blobs_start = Instant::now();
    let dq: SharedBlobs = ventries
        .into_par_iter()
        .flat_map(|p| p.to_blobs(recycler))
        .collect();
    let to_blobs_elapsed = duration_as_ms(&to_blobs_start.elapsed());

    // flatten deque to vec
    let blobs_vec: Vec<_> = dq.into_iter().collect();

    let blobs_chunking = Instant::now();
    // We could receive more blobs than window slots so
    // break them up into window-sized chunks to process
    let blobs_chunked = blobs_vec.chunks(WINDOW_SIZE as usize).map(|x| x.to_vec());
    let chunking_elapsed = duration_as_ms(&blobs_chunking.elapsed());

    trace!("{}", window.read().unwrap().print(&id, *receive_index));

    let broadcast_start = Instant::now();
    for mut blobs in blobs_chunked {
        let blobs_len = blobs.len();
        trace!("{}: broadcast blobs.len: {}", id, blobs_len);

        // Index the blobs
        window::index_blobs(node_info, &blobs, receive_index)
            .expect("index blobs for initial window");

        // keep the cache of blobs that are broadcast
        inc_new_counter_info!("streamer-broadcast-sent", blobs.len());
        {
            let mut win = window.write().unwrap();
            assert!(blobs.len() <= win.len());
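            // Evict and recycle any blobs already cached in the window slots the
            // new blobs are about to occupy.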
            for b in &blobs {
                let ix = b.read().unwrap().get_index().expect("blob index");
                let pos = (ix % WINDOW_SIZE) as usize;
                if let Some(x) = win[pos].data.take() {
                    trace!(
                        "{} popped {} at {}",
                        id,
                        x.read().unwrap().get_index().unwrap(),
                        pos
                    );
                    recycler.recycle(x, "broadcast-data");
                }

                if let Some(x) = win[pos].coding.take() {
                    trace!(
                        "{} popped {} at {}",
                        id,
                        x.read().unwrap().get_index().unwrap(),
                        pos
                    );
                    recycler.recycle(x, "broadcast-coding");
                }

                trace!("{} null {}", id, pos);
            }

            while let Some(b) = blobs.pop() {
                let ix = b.read().unwrap().get_index().expect("blob index");
                let pos = (ix % WINDOW_SIZE) as usize;
                trace!("{} caching {} at {}", id, ix, pos);
                assert!(win[pos].data.is_none());
                win[pos].data = Some(b);
            }
        }

        // Fill in the coding blob data from the window data blobs
        #[cfg(feature = "erasure")]
        {
            erasure::generate_coding(
                &id,
                &mut window.write().unwrap(),
                recycler,
                *receive_index,
                blobs_len,
                &mut transmit_index.coding,
            )?;
        }

        *receive_index += blobs_len as u64;

        // Send blobs out from the window
        Crdt::broadcast(
            &node_info,
            &broadcast_table,
            &window,
            &sock,
            transmit_index,
            *receive_index,
        )?;
    }
    let broadcast_elapsed = duration_as_ms(&broadcast_start.elapsed());

    info!(
        "broadcast: {} entries, blob time {} chunking time {} broadcast time {}",
        num_entries, to_blobs_elapsed, chunking_elapsed, broadcast_elapsed
    );

    Ok(())
}

// Implement a destructor for the BroadcastStage thread to signal it exited
// even on panics
struct Finalizer {
    exit_sender: Arc<AtomicBool>,
}

impl Finalizer {
    fn new(exit_sender: Arc<AtomicBool>) -> Self {
        Finalizer { exit_sender }
    }
}

// Implement a destructor for Finalizer.
impl Drop for Finalizer {
    fn drop(&mut self) {
        self.exit_sender.clone().store(true, Ordering::Relaxed);
    }
}

pub struct BroadcastStage {
    thread_hdl: JoinHandle<BroadcastStageReturnType>,
}

impl BroadcastStage {
    fn run(
        sock: &UdpSocket,
        crdt: &Arc<RwLock<Crdt>>,
        window: &SharedWindow,
        entry_height: u64,
        recycler: &BlobRecycler,
        receiver: &Receiver<Vec<Entry>>,
    ) -> BroadcastStageReturnType {
        let mut transmit_index = WindowIndex {
            data: entry_height,
            coding: entry_height,
        };
        let mut receive_index = entry_height;
        let me = crdt.read().unwrap().my_data().clone();
        loop {
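            // At each leader-rotation boundary, check whether this node is still
            // the scheduled leader; if another node is scheduled (or none is),
            // exit the stage.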
            if transmit_index.data % (LEADER_ROTATION_INTERVAL as u64) == 0 {
                let rcrdt = crdt.read().unwrap();
                let my_id = rcrdt.my_data().id;
                match rcrdt.get_scheduled_leader(transmit_index.data) {
                    Some(id) if id == my_id => (),
                    // If the leader stays in power for the next
                    // round as well, then we don't exit. Otherwise, exit.
                    _ => {
                        return BroadcastStageReturnType::LeaderRotation;
                    }
                }
            }

            let broadcast_table = crdt.read().unwrap().compute_broadcast_table();
            if let Err(e) = broadcast(
                &me,
                &broadcast_table,
                &window,
                &recycler,
                &receiver,
                &sock,
                &mut transmit_index,
                &mut receive_index,
            ) {
                match e {
                    Error::RecvTimeoutError(RecvTimeoutError::Disconnected) => {
                        return BroadcastStageReturnType::ChannelDisconnected
                    }
                    Error::RecvTimeoutError(RecvTimeoutError::Timeout) => (),
                    Error::CrdtError(CrdtError::NoPeers) => (), // TODO: Why are the unit-tests throwing hundreds of these?
                    _ => {
                        inc_new_counter_info!("streamer-broadcaster-error", 1, 1);
                        error!("broadcaster error: {:?}", e);
                    }
                }
            }
        }
    }

    /// Service to broadcast messages from the leader to layer 1 nodes.
    /// See `crdt` for network layer definitions.
    /// # Arguments
    /// * `sock` - Socket to send from.
    /// * `crdt` - CRDT structure
    /// * `window` - Cache of blobs that we have broadcast
    /// * `entry_height` - Initial ledger height; newly broadcast blobs are indexed starting here.
    /// * `recycler` - Blob recycler.
    /// * `receiver` - Receive channel for entries to be broadcast to all the layer 1 nodes.
    /// * `exit_sender` - Set to true when this stage exits, allowing the rest of the Tpu to exit
    ///   cleanly. Otherwise, when a Tpu stage closes, it only closes the stages that come after
    ///   it. The stages that come before could be blocked on a receive and never notice that
    ///   they need to exit. Now, if any stage of the Tpu closes, it will lead to closing the
    ///   WriteStage (because WriteStage is the last stage in the pipeline), which will then close
    ///   the BroadcastStage, which will then close FetchStage in the Tpu, and then the rest of
    ///   the Tpu, completing the cycle.
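    ///
    /// # Example
    ///
    /// A minimal construction sketch; the surrounding values (`leader_node`, `crdt`,
    /// `shared_window`, `entry_height`, `blob_recycler`) are assumed to be set up by the
    /// caller, as in the tests at the bottom of this file.
    ///
    /// ```ignore
    /// let (entry_sender, entry_receiver) = channel();
    /// let exit_sender = Arc::new(AtomicBool::new(false));
    /// let broadcast_stage = BroadcastStage::new(
    ///     leader_node.sockets.broadcast,
    ///     crdt.clone(),
    ///     shared_window.clone(),
    ///     entry_height,
    ///     blob_recycler.clone(),
    ///     entry_receiver,
    ///     exit_sender,
    /// );
    /// // ... send Vec<Entry> batches on `entry_sender`, then wait for the stage to finish:
    /// let reason = broadcast_stage.join().unwrap();
    /// ```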
    pub fn new(
        sock: UdpSocket,
        crdt: Arc<RwLock<Crdt>>,
        window: SharedWindow,
        entry_height: u64,
        recycler: BlobRecycler,
        receiver: Receiver<Vec<Entry>>,
        exit_sender: Arc<AtomicBool>,
    ) -> Self {
        let thread_hdl = Builder::new()
            .name("solana-broadcaster".to_string())
            .spawn(move || {
                let _exit = Finalizer::new(exit_sender);
                Self::run(&sock, &crdt, &window, entry_height, &recycler, &receiver)
            }).unwrap();

        BroadcastStage { thread_hdl }
    }
}

impl Service for BroadcastStage {
    type JoinReturnType = BroadcastStageReturnType;

    fn join(self) -> thread::Result<BroadcastStageReturnType> {
        self.thread_hdl.join()
    }
}

#[cfg(test)]
mod tests {
    use broadcast_stage::{BroadcastStage, BroadcastStageReturnType};
    use crdt::{Crdt, Node, LEADER_ROTATION_INTERVAL};
    use entry::Entry;
    use mint::Mint;
    use packet::BlobRecycler;
    use recorder::Recorder;
    use service::Service;
    use signature::{Keypair, KeypairUtil, Pubkey};
    use std::cmp;
    use std::sync::atomic::AtomicBool;
    use std::sync::mpsc::{channel, Sender};
    use std::sync::{Arc, RwLock};
    use window::{new_window_from_entries, SharedWindow};

    fn setup_dummy_broadcast_stage() -> (
        Pubkey,
        Pubkey,
        BroadcastStage,
        SharedWindow,
        Sender<Vec<Entry>>,
        Arc<RwLock<Crdt>>,
        Vec<Entry>,
    ) {
        // Setup dummy leader info
        let leader_keypair = Keypair::new();
        let id = leader_keypair.pubkey();
        let leader_info = Node::new_localhost_with_pubkey(leader_keypair.pubkey());
        // Give the leader somebody to broadcast to so he isn't lonely
        let buddy_keypair = Keypair::new();
        let buddy_id = buddy_keypair.pubkey();
        let broadcast_buddy = Node::new_localhost_with_pubkey(buddy_keypair.pubkey());

        // Fill the crdt with the buddy's info
        let mut crdt = Crdt::new(leader_info.info.clone()).expect("Crdt::new");
        crdt.insert(&broadcast_buddy.info);
        let crdt = Arc::new(RwLock::new(crdt));

        let blob_recycler = BlobRecycler::default();

        // Make dummy initial entries
        let mint = Mint::new(10000);
        let entries = mint.create_entries();
        let entry_height = entries.len() as u64;

        // Setup a window
        let window =
            new_window_from_entries(&entries, entry_height, &leader_info.info, &blob_recycler);
        let shared_window = Arc::new(RwLock::new(window));

        let (entry_sender, entry_receiver) = channel();
        let exit_sender = Arc::new(AtomicBool::new(false));

        // Start up the broadcast stage
        let broadcast_stage = BroadcastStage::new(
            leader_info.sockets.broadcast,
            crdt.clone(),
            shared_window.clone(),
            entry_height,
            blob_recycler.clone(),
            entry_receiver,
            exit_sender,
        );

        (
            id,
            buddy_id,
            broadcast_stage,
            shared_window,
            entry_sender,
            crdt,
            entries,
        )
    }

    fn find_highest_window_index(shared_window: &SharedWindow) -> u64 {
        let window = shared_window.read().unwrap();
        window.iter().fold(0, |m, w_slot| {
            if let Some(ref blob) = w_slot.data {
                cmp::max(m, blob.read().unwrap().get_index().unwrap())
            } else {
                m
            }
        })
    }

    #[test]
    fn test_broadcast_stage_leader_rotation_exit() {
        let (id, buddy_id, broadcast_stage, shared_window, entry_sender, crdt, entries) =
            setup_dummy_broadcast_stage();
        {
            let mut wcrdt = crdt.write().unwrap();
            // Set leader to myself
            wcrdt.set_leader(id);
            // Set the leader for the next rotation to also be myself
            wcrdt.set_scheduled_leader(LEADER_ROTATION_INTERVAL, id);
        }

        let genesis_len = entries.len() as u64;
        let last_entry_hash = entries.last().expect("Ledger should not be empty").id;

        // Input enough entries to make exactly LEADER_ROTATION_INTERVAL entries, which will
        // trigger a check for leader rotation. Because the next scheduled leader
        // is ourselves, we won't exit
        let mut recorder = Recorder::new(last_entry_hash);
        for _ in genesis_len..LEADER_ROTATION_INTERVAL {
            let new_entry = recorder.record(vec![]);
            entry_sender.send(new_entry).unwrap();
        }

        // Set the scheduled next leader in the crdt to the other buddy on the network
        crdt.write()
            .unwrap()
            .set_scheduled_leader(2 * LEADER_ROTATION_INTERVAL, buddy_id);

        // Input another LEADER_ROTATION_INTERVAL dummy entries, which will take us
        // past the point of the leader rotation. The broadcast stage will see that
        // it's no longer the leader after checking the crdt, and exit
        for _ in 0..LEADER_ROTATION_INTERVAL {
            let new_entry = recorder.record(vec![]);
            match entry_sender.send(new_entry) {
                // We disconnected, break out of loop and check the results
                Err(_) => break,
                _ => (),
            };
        }

        // Make sure the threads closed cleanly
        assert_eq!(
            broadcast_stage.join().unwrap(),
            BroadcastStageReturnType::LeaderRotation
        );
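
        // Counting the genesis entries pre-loaded into the window plus everything
        // sent above, 2 * LEADER_ROTATION_INTERVAL entries were cached in total;
        // blob indexes start at 0, so the highest one should be one less than that.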
        let highest_index = find_highest_window_index(&shared_window);
        assert_eq!(highest_index, 2 * LEADER_ROTATION_INTERVAL - 1);
    }
}