streamer send destination metrics for repair, gossip (#21564)

This commit is contained in:
Jeff Biseda 2021-12-17 15:21:05 -08:00 committed by GitHub
parent 76098dd42a
commit 97a1fa10a6
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 215 additions and 4 deletions

1
Cargo.lock generated
View File

@ -5960,6 +5960,7 @@ dependencies = [
name = "solana-streamer" name = "solana-streamer"
version = "1.10.0" version = "1.10.0"
dependencies = [ dependencies = [
"histogram",
"itertools 0.10.3", "itertools 0.10.3",
"libc", "libc",
"log 0.4.14", "log 0.4.14",

View File

@ -52,6 +52,7 @@ pub mod sigverify;
pub mod sigverify_shreds; pub mod sigverify_shreds;
pub mod sigverify_stage; pub mod sigverify_stage;
pub mod snapshot_packager_service; pub mod snapshot_packager_service;
pub mod stats_reporter_service;
pub mod system_monitor_service; pub mod system_monitor_service;
pub mod tower_storage; pub mod tower_storage;
pub mod tpu; pub mod tpu;

View File

@ -5,7 +5,11 @@ use {
solana_streamer::{socket::SocketAddrSpace, streamer}, solana_streamer::{socket::SocketAddrSpace, streamer},
std::{ std::{
net::UdpSocket, net::UdpSocket,
sync::{atomic::AtomicBool, mpsc::channel, Arc, RwLock}, sync::{
atomic::AtomicBool,
mpsc::{channel, Sender},
Arc, RwLock,
},
thread::{self, JoinHandle}, thread::{self, JoinHandle},
}, },
}; };
@ -20,6 +24,7 @@ impl ServeRepairService {
blockstore: Option<Arc<Blockstore>>, blockstore: Option<Arc<Blockstore>>,
serve_repair_socket: UdpSocket, serve_repair_socket: UdpSocket,
socket_addr_space: SocketAddrSpace, socket_addr_space: SocketAddrSpace,
stats_reporter_sender: Sender<Box<dyn FnOnce() + Send>>,
exit: &Arc<AtomicBool>, exit: &Arc<AtomicBool>,
) -> Self { ) -> Self {
let (request_sender, request_receiver) = channel(); let (request_sender, request_receiver) = channel();
@ -44,6 +49,7 @@ impl ServeRepairService {
serve_repair_socket, serve_repair_socket,
response_receiver, response_receiver,
socket_addr_space, socket_addr_space,
Some(stats_reporter_sender),
); );
let t_listen = ServeRepair::listen( let t_listen = ServeRepair::listen(
serve_repair.clone(), serve_repair.clone(),

View File

@ -0,0 +1,53 @@
use std::{
result::Result,
sync::{
atomic::{AtomicBool, Ordering},
mpsc::{Receiver, RecvTimeoutError},
Arc,
},
thread::{self, Builder, JoinHandle},
time::Duration,
};
pub struct StatsReporterService {
thread_hdl: JoinHandle<()>,
}
impl StatsReporterService {
pub fn new(
reporting_receiver: Receiver<Box<dyn FnOnce() + Send>>,
exit: &Arc<AtomicBool>,
) -> Self {
let exit = exit.clone();
let thread_hdl = Builder::new()
.name("solana-stats-reporter".to_owned())
.spawn(move || loop {
if exit.load(Ordering::Relaxed) {
return;
}
if let Err(e) = Self::receive_reporting_func(&reporting_receiver) {
match e {
RecvTimeoutError::Disconnected => break,
RecvTimeoutError::Timeout => (),
}
}
})
.unwrap();
Self { thread_hdl }
}
pub fn join(self) -> thread::Result<()> {
self.thread_hdl.join()?;
Ok(())
}
fn receive_reporting_func(
r: &Receiver<Box<dyn FnOnce() + Send>>,
) -> Result<(), RecvTimeoutError> {
let timer = Duration::new(1, 0);
let func = r.recv_timeout(timer)?;
func();
Ok(())
}
}

View File

@ -14,6 +14,7 @@ use {
serve_repair_service::ServeRepairService, serve_repair_service::ServeRepairService,
sigverify, sigverify,
snapshot_packager_service::SnapshotPackagerService, snapshot_packager_service::SnapshotPackagerService,
stats_reporter_service::StatsReporterService,
system_monitor_service::{verify_udp_stats_access, SystemMonitorService}, system_monitor_service::{verify_udp_stats_access, SystemMonitorService},
tower_storage::TowerStorage, tower_storage::TowerStorage,
tpu::{Tpu, DEFAULT_TPU_COALESCE_MS}, tpu::{Tpu, DEFAULT_TPU_COALESCE_MS},
@ -276,6 +277,7 @@ pub struct Validator {
cache_block_meta_service: Option<CacheBlockMetaService>, cache_block_meta_service: Option<CacheBlockMetaService>,
system_monitor_service: Option<SystemMonitorService>, system_monitor_service: Option<SystemMonitorService>,
sample_performance_service: Option<SamplePerformanceService>, sample_performance_service: Option<SamplePerformanceService>,
stats_reporter_service: StatsReporterService,
gossip_service: GossipService, gossip_service: GossipService,
serve_repair_service: ServeRepairService, serve_repair_service: ServeRepairService,
completed_data_sets_service: CompletedDataSetsService, completed_data_sets_service: CompletedDataSetsService,
@ -697,12 +699,17 @@ impl Validator {
Some(node.info.shred_version), Some(node.info.shred_version),
)), )),
}; };
let (stats_reporter_sender, stats_reporter_receiver) = channel();
let stats_reporter_service = StatsReporterService::new(stats_reporter_receiver, &exit);
let gossip_service = GossipService::new( let gossip_service = GossipService::new(
&cluster_info, &cluster_info,
Some(bank_forks.clone()), Some(bank_forks.clone()),
node.sockets.gossip, node.sockets.gossip,
config.gossip_validators.clone(), config.gossip_validators.clone(),
should_check_duplicate_instance, should_check_duplicate_instance,
Some(stats_reporter_sender.clone()),
&exit, &exit,
); );
let serve_repair = Arc::new(RwLock::new(ServeRepair::new(cluster_info.clone()))); let serve_repair = Arc::new(RwLock::new(ServeRepair::new(cluster_info.clone())));
@ -711,6 +718,7 @@ impl Validator {
Some(blockstore.clone()), Some(blockstore.clone()),
node.sockets.serve_repair, node.sockets.serve_repair,
socket_addr_space, socket_addr_space,
stats_reporter_sender,
&exit, &exit,
); );
@ -904,6 +912,7 @@ impl Validator {
*start_progress.write().unwrap() = ValidatorStartProgress::Running; *start_progress.write().unwrap() = ValidatorStartProgress::Running;
Self { Self {
stats_reporter_service,
gossip_service, gossip_service,
serve_repair_service, serve_repair_service,
json_rpc_service, json_rpc_service,
@ -1028,6 +1037,9 @@ impl Validator {
self.serve_repair_service self.serve_repair_service
.join() .join()
.expect("serve_repair_service"); .expect("serve_repair_service");
self.stats_reporter_service
.join()
.expect("stats_reporter_service");
self.tpu.join().expect("tpu"); self.tpu.join().expect("tpu");
self.tvu.join().expect("tvu"); self.tvu.join().expect("tvu");
self.completed_data_sets_service self.completed_data_sets_service

View File

@ -19,7 +19,7 @@ use {
net::{IpAddr, Ipv4Addr, SocketAddr, TcpListener, UdpSocket}, net::{IpAddr, Ipv4Addr, SocketAddr, TcpListener, UdpSocket},
sync::{ sync::{
atomic::{AtomicBool, Ordering}, atomic::{AtomicBool, Ordering},
mpsc::channel, mpsc::{channel, Sender},
Arc, RwLock, Arc, RwLock,
}, },
thread::{self, sleep, JoinHandle}, thread::{self, sleep, JoinHandle},
@ -38,6 +38,7 @@ impl GossipService {
gossip_socket: UdpSocket, gossip_socket: UdpSocket,
gossip_validators: Option<HashSet<Pubkey>>, gossip_validators: Option<HashSet<Pubkey>>,
should_check_duplicate_instance: bool, should_check_duplicate_instance: bool,
stats_reporter_sender: Option<Sender<Box<dyn FnOnce() + Send>>>,
exit: &Arc<AtomicBool>, exit: &Arc<AtomicBool>,
) -> Self { ) -> Self {
let (request_sender, request_receiver) = channel(); let (request_sender, request_receiver) = channel();
@ -88,6 +89,7 @@ impl GossipService {
gossip_socket, gossip_socket,
response_receiver, response_receiver,
socket_addr_space, socket_addr_space,
stats_reporter_sender,
); );
let thread_hdls = vec![ let thread_hdls = vec![
t_receiver, t_receiver,
@ -331,6 +333,7 @@ pub fn make_gossip_node(
gossip_socket, gossip_socket,
None, None,
should_check_duplicate_instance, should_check_duplicate_instance,
None,
exit, exit,
); );
(gossip_service, ip_echo, cluster_info) (gossip_service, ip_echo, cluster_info)
@ -362,6 +365,7 @@ mod tests {
tn.sockets.gossip, tn.sockets.gossip,
None, None,
true, // should_check_duplicate_instance true, // should_check_duplicate_instance
None,
&exit, &exit,
); );
exit.store(true, Ordering::Relaxed); exit.store(true, Ordering::Relaxed);

View File

@ -45,6 +45,7 @@ fn test_node(exit: &Arc<AtomicBool>) -> (Arc<ClusterInfo>, GossipService, UdpSoc
test_node.sockets.gossip, test_node.sockets.gossip,
None, None,
true, // should_check_duplicate_instance true, // should_check_duplicate_instance
None,
exit, exit,
); );
let _ = cluster_info.my_contact_info(); let _ = cluster_info.my_contact_info();
@ -72,6 +73,7 @@ fn test_node_with_bank(
test_node.sockets.gossip, test_node.sockets.gossip,
None, None,
true, // should_check_duplicate_instance true, // should_check_duplicate_instance
None,
exit, exit,
); );
let _ = cluster_info.my_contact_info(); let _ = cluster_info.my_contact_info();

View File

@ -221,6 +221,7 @@ fn start_gossip_node(
gossip_socket, gossip_socket,
gossip_validators, gossip_validators,
should_check_duplicate_instance, should_check_duplicate_instance,
None,
&gossip_exit_flag, &gossip_exit_flag,
); );
info!("Started gossip node"); info!("Started gossip node");

View File

@ -10,6 +10,7 @@ documentation = "https://docs.rs/solana-streamer"
edition = "2021" edition = "2021"
[dependencies] [dependencies]
histogram = "0.6.9"
itertools = "0.10.3" itertools = "0.10.3"
log = "0.4.14" log = "0.4.14"
solana-metrics = { path = "../metrics", version = "=1.10.0" } solana-metrics = { path = "../metrics", version = "=1.10.0" }

View File

@ -7,8 +7,11 @@ use {
recvmmsg::NUM_RCVMMSGS, recvmmsg::NUM_RCVMMSGS,
socket::SocketAddrSpace, socket::SocketAddrSpace,
}, },
solana_sdk::timing::timestamp, histogram::Histogram,
solana_sdk::{packet::Packet, timing::timestamp},
std::{ std::{
cmp::Reverse,
collections::HashMap,
net::UdpSocket, net::UdpSocket,
sync::{ sync::{
atomic::{AtomicBool, Ordering}, atomic::{AtomicBool, Ordering},
@ -119,13 +122,126 @@ pub fn receiver(
.unwrap() .unwrap()
} }
#[derive(Debug, Default)]
struct SendStats {
bytes: u64,
count: u64,
}
#[derive(Default)]
struct StreamerSendStats {
host_map: HashMap<[u16; 8], SendStats>,
since: Option<Instant>,
}
impl StreamerSendStats {
fn report_stats(
name: &'static str,
host_map: HashMap<[u16; 8], SendStats>,
sample_duration: Option<Duration>,
) {
const MAX_REPORT_ENTRIES: usize = 5;
let sample_ms = sample_duration.map(|d| d.as_millis()).unwrap_or_default();
let mut hist = Histogram::default();
let mut byte_sum = 0;
let mut pkt_count = 0;
host_map.iter().for_each(|(_addr, host_stats)| {
hist.increment(host_stats.bytes).unwrap();
byte_sum += host_stats.bytes;
pkt_count += host_stats.count;
});
datapoint_info!(
name,
("streamer-send-sample_duration_ms", sample_ms, i64),
("streamer-send-host_count", host_map.len(), i64),
("streamer-send-bytes_total", byte_sum, i64),
("streamer-send-pkt_count_total", pkt_count, i64),
(
"streamer-send-host_bytes_min",
hist.minimum().unwrap_or_default(),
i64
),
(
"streamer-send-host_bytes_max",
hist.maximum().unwrap_or_default(),
i64
),
(
"streamer-send-host_bytes_mean",
hist.mean().unwrap_or_default(),
i64
),
(
"streamer-send-host_bytes_90pct",
hist.percentile(90.0).unwrap_or_default(),
i64
),
(
"streamer-send-host_bytes_50pct",
hist.percentile(50.0).unwrap_or_default(),
i64
),
(
"streamer-send-host_bytes_10pct",
hist.percentile(10.0).unwrap_or_default(),
i64
),
);
let num_entries = host_map.len();
let mut entries: Vec<_> = host_map.into_iter().collect();
if entries.len() > MAX_REPORT_ENTRIES {
entries.select_nth_unstable_by_key(MAX_REPORT_ENTRIES, |(_addr, stats)| {
Reverse(stats.bytes)
});
entries.truncate(MAX_REPORT_ENTRIES);
}
info!(
"streamer send {} hosts: count:{} {:?}",
name, num_entries, entries,
);
}
fn maybe_submit(&mut self, name: &'static str, sender: &Sender<Box<dyn FnOnce() + Send>>) {
const SUBMIT_CADENCE: Duration = Duration::from_secs(10);
const MAP_SIZE_REPORTING_THRESHOLD: usize = 1_000;
let elapsed = self.since.as_ref().map(Instant::elapsed);
if elapsed.map(|e| e < SUBMIT_CADENCE).unwrap_or_default()
&& self.host_map.len() < MAP_SIZE_REPORTING_THRESHOLD
{
return;
}
let host_map = std::mem::take(&mut self.host_map);
let _ = sender.send(Box::new(move || {
Self::report_stats(name, host_map, elapsed);
}));
*self = Self {
since: Some(Instant::now()),
..Self::default()
};
}
fn record(&mut self, pkt: &Packet) {
let ent = self.host_map.entry(pkt.meta.addr).or_default();
ent.count += 1;
ent.bytes += pkt.data.len() as u64;
}
}
fn recv_send( fn recv_send(
sock: &UdpSocket, sock: &UdpSocket,
r: &PacketBatchReceiver, r: &PacketBatchReceiver,
socket_addr_space: &SocketAddrSpace, socket_addr_space: &SocketAddrSpace,
stats: &mut Option<StreamerSendStats>,
) -> Result<()> { ) -> Result<()> {
let timer = Duration::new(1, 0); let timer = Duration::new(1, 0);
let packet_batch = r.recv_timeout(timer)?; let packet_batch = r.recv_timeout(timer)?;
if let Some(stats) = stats {
packet_batch.packets.iter().for_each(|p| stats.record(p));
}
send_to(&packet_batch, sock, socket_addr_space)?; send_to(&packet_batch, sock, socket_addr_space)?;
Ok(()) Ok(())
} }
@ -158,6 +274,7 @@ pub fn responder(
sock: Arc<UdpSocket>, sock: Arc<UdpSocket>,
r: PacketBatchReceiver, r: PacketBatchReceiver,
socket_addr_space: SocketAddrSpace, socket_addr_space: SocketAddrSpace,
stats_reporter_sender: Option<Sender<Box<dyn FnOnce() + Send>>>,
) -> JoinHandle<()> { ) -> JoinHandle<()> {
Builder::new() Builder::new()
.name(format!("solana-responder-{}", name)) .name(format!("solana-responder-{}", name))
@ -165,8 +282,14 @@ pub fn responder(
let mut errors = 0; let mut errors = 0;
let mut last_error = None; let mut last_error = None;
let mut last_print = 0; let mut last_print = 0;
let mut stats = None;
if stats_reporter_sender.is_some() {
stats = Some(StreamerSendStats::default());
}
loop { loop {
if let Err(e) = recv_send(&sock, &r, &socket_addr_space) { if let Err(e) = recv_send(&sock, &r, &socket_addr_space, &mut stats) {
match e { match e {
StreamerError::RecvTimeout(RecvTimeoutError::Disconnected) => break, StreamerError::RecvTimeout(RecvTimeoutError::Disconnected) => break,
StreamerError::RecvTimeout(RecvTimeoutError::Timeout) => (), StreamerError::RecvTimeout(RecvTimeoutError::Timeout) => (),
@ -183,6 +306,11 @@ pub fn responder(
last_print = now; last_print = now;
errors = 0; errors = 0;
} }
if let Some(ref stats_reporter_sender) = stats_reporter_sender {
if let Some(ref mut stats) = stats {
stats.maybe_submit(name, stats_reporter_sender);
}
}
} }
}) })
.unwrap() .unwrap()
@ -255,6 +383,7 @@ mod test {
Arc::new(send), Arc::new(send),
r_responder, r_responder,
SocketAddrSpace::Unspecified, SocketAddrSpace::Unspecified,
None,
); );
let mut packet_batch = PacketBatch::default(); let mut packet_batch = PacketBatch::default();
for i in 0..5 { for i in 0..5 {

View File

@ -231,6 +231,7 @@ fn start_gossip_node(
gossip_socket, gossip_socket,
gossip_validators, gossip_validators,
should_check_duplicate_instance, should_check_duplicate_instance,
None,
&gossip_exit_flag, &gossip_exit_flag,
); );
(cluster_info, gossip_exit_flag, gossip_service) (cluster_info, gossip_exit_flag, gossip_service)