add shutdown for agent

GroovieGermanikus 2023-08-09 13:29:11 +02:00
parent 47c95d549e
commit 4ca4e035c2
6 changed files with 261 additions and 78 deletions
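The change wires an idle-shutdown handshake into the transaction forwarder: every forwarded packet touches its agent's handle, a debounced cleanup pass scans the handles, and agents idle beyond AGENT_SHUTDOWN_IDLE get their shared agent_exit_signal flipped. The agent side that consumes the signal is not part of this diff; the following is a minimal sketch of the pattern, with hypothetical names (agent_loop, do_work):

```rust
use std::sync::atomic::{AtomicBool, Ordering};
use std::sync::Arc;

// Hypothetical consumer of an exit signal like `agent_exit_signal`:
// the owner flips the shared flag once, the agent polls it and stops.
fn agent_loop(exit_signal: Arc<AtomicBool>) {
    while !exit_signal.load(Ordering::Relaxed) {
        do_work(); // e.g. forward one batch of transactions
    }
    // drop connections, then return
}

fn do_work() { /* illustrative only */ }
```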

View File

@@ -0,0 +1,80 @@
use log::trace;
use std::sync::atomic::{AtomicI64, Ordering};
use std::time::{Duration, SystemTime, UNIX_EPOCH};
pub struct Debouncer {
cooldown_ms: i64,
last: AtomicI64,
}
impl Debouncer {
pub fn new(cooldown: Duration) -> Self {
Self {
cooldown_ms: cooldown.as_millis() as i64,
last: AtomicI64::new(-1),
}
}
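// `can_fire` returns true at most once per cooldown window. `fetch_update`
// runs a compare-and-swap loop: the closure returns Some(new_timestamp) to
// commit or None to leave the value unchanged, and the method reports which
// case happened as Ok(previous) / Err(previous).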
pub fn can_fire(&self) -> bool {
let now = SystemTime::now();
let epoch_now = now.duration_since(UNIX_EPOCH).unwrap().as_millis() as i64;
let results = self
.last
.fetch_update(Ordering::SeqCst, Ordering::SeqCst, |last| {
let elapsed = epoch_now - last;
if elapsed > self.cooldown_ms {
trace!("trigger it!");
Some(epoch_now)
} else {
trace!("have to wait - not yet .. (elapsed {})", elapsed);
None
}
}); // -- compare+swap
results.is_ok()
}
}
#[cfg(test)]
mod tests {
use crate::outbound::debouncer::Debouncer;
use std::sync::Arc;
use std::thread;
use std::thread::sleep;
use std::time::Duration;
#[test]
fn fire() {
let debouncer = Debouncer::new(Duration::from_millis(500));
assert!(debouncer.can_fire());
assert!(!debouncer.can_fire());
sleep(Duration::from_millis(200));
assert!(!debouncer.can_fire());
sleep(Duration::from_millis(400));
assert!(debouncer.can_fire());
}
#[test]
fn threading() {
let debouncer = Debouncer::new(Duration::from_millis(500));
thread::spawn(move || {
debouncer.can_fire();
});
}
#[test]
fn shared() {
let debouncer = Arc::new(Debouncer::new(Duration::from_millis(500)));
let debouncer_copy = debouncer.clone();
thread::spawn(move || {
debouncer_copy.can_fire();
});
thread::spawn(move || {
debouncer.can_fire();
});
}
}
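As used in tx_forward below, the Debouncer sits next to a hot loop and rate-limits an expensive periodic call. A minimal usage sketch (run_cleanup is hypothetical; the import path follows the module layout shown in mod.rs):

```rust
// assumes the module path from this commit:
// use crate::outbound::debouncer::Debouncer;

fn process_packets(debouncer: &Debouncer) {
    for _packet in 0..10_000 {
        // ... hot path: forward the packet ...
        if debouncer.can_fire() {
            // runs at most once per cooldown window
            run_cleanup();
        }
    }
}

fn run_cleanup() { /* hypothetical */ }
```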

View File

@@ -1,2 +1,3 @@
+ mod debouncer;
mod sharder;
pub mod tx_forward;

View File

@@ -1,3 +1,4 @@
+ use crate::outbound::debouncer::Debouncer;
use crate::outbound::sharder::Sharder;
use crate::quic_util::SkipServerVerification;
use crate::quinn_auto_reconnect::AutoReconnect;
@@ -18,24 +19,24 @@ use std::net::{IpAddr, Ipv4Addr, SocketAddr};
use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
use std::sync::Arc;
use std::time::{Duration, Instant};
use itertools::Itertools;
use tokio::sync::mpsc::Receiver;
const MAX_PARALLEL_STREAMS: usize = 6;
pub const PARALLEL_TPU_CONNECTION_COUNT: usize = 4;
+ const AGENT_SHUTDOWN_IDLE: u64 = 2500; // ms; should be 4x400ms+buffer
struct AgentHandle {
pub tpu_address: SocketAddr,
pub agent_exit_signal: Arc<AtomicBool>,
pub created_at: Instant,
// relative to start
- pub last_used_ms: AtomicU64,
+ pub age_ms: AtomicU64,
}
impl AgentHandle {
pub fn touch(&self) {
- let last_used_ms = Instant::now().duration_since(self.created_at).as_millis() as u64;
- self.last_used_ms.store(last_used_ms, Ordering::Relaxed);
+ let age_ms = Instant::now().duration_since(self.created_at).as_millis() as u64;
+ self.age_ms.store(age_ms, Ordering::Relaxed);
}
}
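One subtlety of the rename from last_used_ms to age_ms: touch() stores the agent's age, in milliseconds since created_at, at the moment of last use; it is not an idle duration by itself, and cleanup_agents below compares this stored age directly against AGENT_SHUTDOWN_IDLE. A stand-alone sketch of the lock-free pattern (Handle is a stand-in for AgentHandle):

```rust
use std::sync::atomic::{AtomicU64, Ordering};
use std::time::Instant;

// The last-use timestamp is stored relative to creation so it fits in an
// AtomicU64 and can be read and written without a lock.
struct Handle {
    created_at: Instant,
    age_ms: AtomicU64,
}

impl Handle {
    fn touch(&self) {
        // created_at.elapsed() == Instant::now().duration_since(created_at)
        let age_ms = self.created_at.elapsed().as_millis() as u64;
        self.age_ms.store(age_ms, Ordering::Relaxed);
    }
}
```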
@@ -52,6 +53,7 @@ pub async fn tx_forwarder(
let (broadcast_in, _) = tokio::sync::broadcast::channel::<Arc<ForwardPacket>>(1000);
let mut agents: HashMap<SocketAddr, AgentHandle> = HashMap::new();
+ let agent_shutdown_debouncer = Debouncer::new(Duration::from_millis(200));
loop {
if exit_signal.load(Ordering::Relaxed) {
@@ -99,7 +101,7 @@ pub async fn tx_forwarder(
}
if let Err(_elapsed) = timeout_result {
- continue;
+ continue 'tx_channel_loop;
}
let maybe_packet = timeout_result.unwrap();
@@ -110,20 +112,20 @@
let packet = maybe_packet.unwrap();
if packet.tpu_address != tpu_address {
- continue;
+ continue 'tx_channel_loop;
}
if !sharder.matching(packet.shard_hash) {
- continue;
+ continue 'tx_channel_loop;
}
let mut transactions_batch: Vec<Vec<u8>> = packet.transactions.clone();
- while let Ok(more) = per_connection_receiver.try_recv() {
+ 'more: while let Ok(more) = per_connection_receiver.try_recv() {
if more.tpu_address != tpu_address {
- continue;
+ continue 'more;
}
if !sharder.matching(more.shard_hash) {
- continue;
+ continue 'more;
}
transactions_batch.extend(more.transactions.clone());
}
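The plain continue statements become labeled ones ('tx_channel_loop, 'more) so that, with the drain loop nested inside the channel loop, each continue states explicitly which loop it restarts. Rust's label syntax in isolation:

```rust
fn main() {
    let batches = vec![vec![1, 2], vec![3]];
    'outer: for batch in &batches {
        for &value in batch {
            if value == 2 {
                continue 'outer; // skip the rest of this batch
            }
            println!("{value}");
        }
    }
}
```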
@@ -170,21 +172,21 @@
tpu_address,
agent_exit_signal,
created_at: now,
- last_used_ms: AtomicU64::new(0),
+ age_ms: AtomicU64::new(0),
}
}); // -- new agent
let agent = agents.get(&tpu_address).unwrap();
agent.touch();
- // TODO only call from time to time
- cleanup_agents(&mut agents, &tpu_address);
+ if agent_shutdown_debouncer.can_fire() {
+ cleanup_agents(&mut agents, &tpu_address);
+ }
if broadcast_in.len() > 5 {
debug!("tx-forward queue len: {}", broadcast_in.len())
}
- // TODO use agent_exit signal to clean them up
broadcast_in
.send(forward_packet)
.expect("send must succeed");
@@ -193,9 +195,6 @@ pub async fn tx_forwarder(
// not reachable
}
- // ms
- const AGENT_SHUTDOWN_IDLE: u64 = 5_000;
fn cleanup_agents(agents: &mut HashMap<SocketAddr, AgentHandle>, current_tpu_address: &SocketAddr) {
let mut to_shutdown = Vec::new();
for (tpu_address, handle) in &*agents {
@@ -203,7 +202,7 @@ fn cleanup_agents(agents: &mut HashMap<SocketAddr, AgentHandle>, current_tpu_add
continue;
}
- let last_used_ms = handle.last_used_ms.load(Ordering::Relaxed);
+ let last_used_ms = handle.age_ms.load(Ordering::Relaxed);
if last_used_ms > AGENT_SHUTDOWN_IDLE {
to_shutdown.push(tpu_address.to_owned())
@@ -211,15 +210,20 @@ fn cleanup_agents(agents: &mut HashMap<SocketAddr, AgentHandle>, current_tpu_add
}
for tpu_address in to_shutdown.iter() {
- if let Some(removed_agent) = agents.remove(&tpu_address) {
- if let Ok(_) = removed_agent.agent_exit_signal.compare_exchange(
- false, true, Ordering::Relaxed, Ordering::Relaxed) {
- let last_used_ms = removed_agent.last_used_ms.load(Ordering::Relaxed);
- debug!("Agent for tpu node {} was IDLE for {}ms - sending exit signal", removed_agent.tpu_address, last_used_ms);
+ if let Some(removed_agent) = agents.remove(tpu_address) {
+ let was_signaled = removed_agent
+ .agent_exit_signal
+ .compare_exchange(false, true, Ordering::SeqCst, Ordering::Relaxed)
+ .is_ok();
+ if was_signaled {
+ let last_used_ms = removed_agent.age_ms.load(Ordering::Relaxed);
+ debug!(
+ "Idle Agent for tpu node {} idle for {}ms - sending exit signal",
+ removed_agent.tpu_address, last_used_ms
+ );
}
}
}
}
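Shutdown is signalled with compare_exchange(false, true, ...) rather than a plain store(true): the swap succeeds for exactly one caller, so the signal is idempotent and the debug line is logged only by whoever actually flipped the flag. In isolation:

```rust
use std::sync::atomic::{AtomicBool, Ordering};

fn main() {
    let exit = AtomicBool::new(false);
    // First signaller sees Ok(false); every later attempt gets Err(true).
    let first = exit
        .compare_exchange(false, true, Ordering::SeqCst, Ordering::Relaxed)
        .is_ok();
    let second = exit
        .compare_exchange(false, true, Ordering::SeqCst, Ordering::Relaxed)
        .is_ok();
    assert!(first && !second);
}
```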
/// takes a validator identity and creates a new QUIC client; appears as staked peer to TPU

View File

@@ -1,11 +1,12 @@
use crate::util::timeout_fallback;
use anyhow::{bail, Context};
use log::{info, warn};
use quinn::{Connection, ConnectionError, Endpoint};
use std::fmt;
use std::net::SocketAddr;
use std::sync::atomic::{AtomicU32, Ordering};
use std::time::Duration;
use tokio::sync::RwLock;
use tokio::time::timeout;
use tracing::debug;
/// connection manager with automatic reconnect; designated for connection to Solana TPU nodes
@@ -17,6 +18,8 @@ use tracing::debug;
/// * TPU address might be wrong which then is a permanent problem
/// * the ActiveConnection instance gets renewed on leader schedule change
+ const SEND_TIMEOUT: Duration = Duration::from_secs(5);
enum ConnectionState {
NotConnected,
Connection(Connection),
@@ -42,7 +45,7 @@ impl AutoReconnect {
}
pub async fn send_uni(&self, payload: &Vec<u8>) -> anyhow::Result<()> {
- let mut send_stream = timeout_fallback(self.refresh_and_get().await?.open_uni())
+ let mut send_stream = timeout(SEND_TIMEOUT, self.refresh_and_get().await?.open_uni())
.await
.context("open uni stream for sending")??;
send_stream.write_all(payload.as_slice()).await?;
@@ -67,7 +70,11 @@ impl AutoReconnect {
let lock = self.current.read().await;
if let ConnectionState::Connection(conn) = &*lock {
if conn.close_reason().is_none() {
debug!("Reuse connection {} to {}", conn.stable_id(), self.target_address);
debug!(
"Reuse connection {} to {}",
conn.stable_id(),
self.target_address
);
return;
}
}
@@ -92,16 +99,17 @@ impl AutoReconnect {
if reconnect_count < 10 {
info!(
"Replace closed connection {} with {} (retry {})",
"Replace closed connection {} with {} to target {} (retry {})",
old_stable_id,
new_connection.stable_id(),
self.target_address,
reconnect_count
);
} else {
*lock = ConnectionState::PermanentError;
warn!(
"Too many reconnect attempts {} with {} (retry {})",
old_stable_id,
"Too many reconnect attempts to {}, last one with {} (retry {})",
self.target_address,
new_connection.stable_id(),
reconnect_count
);
@@ -116,7 +124,11 @@ impl AutoReconnect {
}
};
} else {
debug!("Reuse connection {} to {} with write-lock", current.stable_id(), self.target_address);
debug!(
"Reuse connection {} to {} with write-lock",
current.stable_id(),
self.target_address
);
}
}
ConnectionState::NotConnected => {
@@ -142,7 +154,10 @@ impl AutoReconnect {
}
ConnectionState::PermanentError => {
// do nothing
debug!("Not using connection to {} with permanent error", self.target_address);
debug!(
"Not using connection to {} with permanent error",
self.target_address
);
}
}
}
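The send path swaps the local timeout_fallback helper for tokio::time::timeout with a named SEND_TIMEOUT. Note the double unwrap in send_uni: timeout(...) wraps the inner future's own Result, so awaiting yields Result&lt;Result&lt;_, ConnectionError&gt;, Elapsed&gt; and both layers must be handled, hence the ?? (with context applied to the outer layer). A minimal sketch of the shape:

```rust
use std::time::Duration;
use tokio::time::timeout;

async fn fetch() -> Result<u32, std::io::Error> {
    Ok(42)
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    // Outer Result: Err(Elapsed) if the deadline passes.
    // Inner Result: the wrapped future's own error type.
    let value = timeout(Duration::from_secs(5), fetch()).await??;
    assert_eq!(value, 42);
    Ok(())
}
```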

View File

@@ -230,7 +230,7 @@ impl QuicProxyConnectionManager {
let proxy_request_raw =
bincode::serialize(&forwarding_request).expect("Expect to serialize transactions");
- let send_result = auto_connection.send_uni(proxy_request_raw).await;
+ let send_result = auto_connection.send_uni(&proxy_request_raw).await;
match send_result {
Ok(()) => {

View File

@@ -1,6 +1,6 @@
- use anyhow::Context;
- use log::warn;
- use quinn::{Connection, Endpoint};
+ use anyhow::{bail, Context};
+ use log::{info, warn};
+ use quinn::{Connection, ConnectionError, Endpoint};
use std::fmt;
use std::net::SocketAddr;
use std::sync::atomic::{AtomicU32, Ordering};
@@ -9,11 +9,22 @@ use tokio::sync::RwLock;
use tokio::time::timeout;
use tracing::debug;
/// copy of quic-proxy AutoReconnect - used that for reference
+ const SEND_TIMEOUT: Duration = Duration::from_secs(5);
+ enum ConnectionState {
+ NotConnected,
+ Connection(Connection),
+ PermanentError,
+ }
pub struct AutoReconnect {
// endpoint should be configured with keep-alive and idle timeout
endpoint: Endpoint,
- current: RwLock<Option<Connection>>,
+ current: RwLock<ConnectionState>,
pub target_address: SocketAddr,
reconnect_count: AtomicU32,
}
@@ -22,15 +33,14 @@ impl AutoReconnect {
pub fn new(endpoint: Endpoint, target_address: SocketAddr) -> Self {
Self {
endpoint,
- current: RwLock::new(None),
+ current: RwLock::new(ConnectionState::NotConnected),
target_address,
reconnect_count: AtomicU32::new(0),
}
}
- pub async fn send_uni(&self, payload: Vec<u8>) -> anyhow::Result<()> {
- // TOOD do smart error handling + reconnect
- let mut send_stream = timeout(Duration::from_secs(4), self.refresh().await.open_uni())
+ pub async fn send_uni(&self, payload: &Vec<u8>) -> anyhow::Result<()> {
+ let mut send_stream = timeout(SEND_TIMEOUT, self.refresh_and_get().await?.open_uni())
.await
.context("open uni stream for sending")??;
send_stream.write_all(payload.as_slice()).await?;
@@ -38,69 +48,142 @@ impl AutoReconnect {
Ok(())
}
- pub async fn refresh(&self) -> Connection {
+ pub async fn refresh_and_get(&self) -> anyhow::Result<Connection> {
+ self.refresh().await;
+ let lock = self.current.read().await;
+ match &*lock {
+ ConnectionState::NotConnected => bail!("not connected"),
+ ConnectionState::Connection(conn) => Ok(conn.clone()),
+ ConnectionState::PermanentError => bail!("permanent error"),
+ }
+ }
+ pub async fn refresh(&self) {
+ {
+ // first check for existing connection using a cheap read-lock
let lock = self.current.read().await;
- let maybe_conn = lock.as_ref();
- if maybe_conn
- .filter(|conn| conn.close_reason().is_none())
- .is_some()
- {
- let reuse = maybe_conn.unwrap();
- debug!("Reuse connection {}", reuse.stable_id());
- return reuse.clone();
+ if let ConnectionState::Connection(conn) = &*lock {
+ if conn.close_reason().is_none() {
+ debug!("Reuse connection {} to {}", conn.stable_id(), self.target_address);
+ return;
+ }
+ }
}
let mut lock = self.current.write().await;
- let maybe_conn = lock.as_ref();
- match maybe_conn {
- Some(current) => {
+ match &*lock {
+ ConnectionState::Connection(current) => {
if current.close_reason().is_some() {
let old_stable_id = current.stable_id();
warn!(
"Connection {} is closed for reason: {:?}",
"Connection {} to {} is closed for reason: {:?}",
old_stable_id,
self.target_address,
current.close_reason()
);
- let new_connection = self.create_connection().await;
- *lock = Some(new_connection.clone());
- // let old_conn = lock.replace(new_connection.clone());
- self.reconnect_count.fetch_add(1, Ordering::SeqCst);
+ match self.create_connection().await {
+ Some(new_connection) => {
+ *lock = ConnectionState::Connection(new_connection.clone());
+ let reconnect_count =
+ self.reconnect_count.fetch_add(1, Ordering::SeqCst);
- debug!(
- "Replace closed connection {} with {} (retry {})",
- old_stable_id,
- new_connection.stable_id(),
- self.reconnect_count.load(Ordering::SeqCst)
- );
- new_connection
if reconnect_count < 10 {
info!(
"Replace closed connection {} with {} to target {} (retry {})",
old_stable_id,
new_connection.stable_id(),
self.target_address,
reconnect_count
);
} else {
*lock = ConnectionState::PermanentError;
warn!(
"Too many reconnect attempts to {}, last one with {} (retry {})",
self.target_address,
new_connection.stable_id(),
reconnect_count
);
}
}
None => {
warn!(
"Reconnect to {} failed for connection {}",
self.target_address, old_stable_id
);
*lock = ConnectionState::PermanentError;
}
};
} else {
debug!("Reuse connection {} with write-lock", current.stable_id());
current.clone()
debug!("Reuse connection {} to {} with write-lock", current.stable_id(), self.target_address);
}
}
- None => {
- let new_connection = self.create_connection().await;
+ ConnectionState::NotConnected => {
+ match self.create_connection().await {
Some(new_connection) => {
*lock = ConnectionState::Connection(new_connection.clone());
self.reconnect_count.fetch_add(1, Ordering::SeqCst);
- assert!(lock.is_none(), "old connection must be None");
- *lock = Some(new_connection.clone());
- // let old_conn = foo.replace(Some(new_connection.clone()));
- debug!("Create initial connection {}", new_connection.stable_id());
- new_connection
info!(
"Create initial connection {} to {}",
new_connection.stable_id(),
self.target_address
);
}
None => {
warn!(
"Initial connection to {} failed permanently",
self.target_address
);
*lock = ConnectionState::PermanentError;
}
};
}
ConnectionState::PermanentError => {
// do nothing
debug!("Not using connection to {} with permanent error", self.target_address);
}
}
}
- async fn create_connection(&self) -> Connection {
+ async fn create_connection(&self) -> Option<Connection> {
let connection = self
.endpoint
.connect(self.target_address, "localhost")
.expect("handshake");
- connection.await.expect("connection")
match connection.await {
Ok(conn) => Some(conn),
Err(ConnectionError::TimedOut) => None,
// maybe we should also treat TransportError explicitly
Err(unexpected_error) => {
panic!(
"Connection to {} failed with unexpected error: {}",
self.target_address, unexpected_error
);
}
}
}
// stable_id 140266619216912, rtt=2.156683ms,
// stats FrameStats { ACK: 3, CONNECTION_CLOSE: 0, CRYPTO: 3,
// DATA_BLOCKED: 0, DATAGRAM: 0, HANDSHAKE_DONE: 1, MAX_DATA: 0,
// MAX_STREAM_DATA: 1, MAX_STREAMS_BIDI: 0, MAX_STREAMS_UNI: 0, NEW_CONNECTION_ID: 4,
// NEW_TOKEN: 0, PATH_CHALLENGE: 0, PATH_RESPONSE: 0, PING: 0, RESET_STREAM: 0,
// RETIRE_CONNECTION_ID: 1, STREAM_DATA_BLOCKED: 0, STREAMS_BLOCKED_BIDI: 0,
// STREAMS_BLOCKED_UNI: 0, STOP_SENDING: 0, STREAM: 0 }
pub async fn connection_stats(&self) -> String {
let lock = self.current.read().await;
match &*lock {
ConnectionState::Connection(conn) => format!(
"stable_id {} stats {:?}, rtt={:?}",
conn.stable_id(),
conn.stats().frame_rx,
conn.stats().path.rtt
),
ConnectionState::NotConnected => "n/c".to_string(),
ConnectionState::PermanentError => "n/a (permanent)".to_string(),
}
}
}
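Taken together, the rewrite turns the connection cache into a three-state machine: NotConnected moves to Connection on the first successful dial, a closed Connection is replaced on the fly until the retry counter reaches 10, and a timed-out dial or an exhausted retry budget parks the instance in PermanentError, after which refresh_and_get (and therefore send_uni) fails fast. A hypothetical caller, assuming an already-configured quinn Endpoint:

```rust
// Sketch only: endpoint setup (certificates, keep-alive, idle timeout)
// is omitted, and `AutoReconnect` is the type defined above.
async fn forward_all(endpoint: quinn::Endpoint, target: std::net::SocketAddr) {
    let conn = AutoReconnect::new(endpoint, target);
    for payload in [vec![1u8, 2, 3], vec![4, 5]] {
        // send_uni refreshes the connection as needed and returns an error
        // fast once the state has degraded to PermanentError.
        if let Err(err) = conn.send_uni(&payload).await {
            eprintln!("giving up on {}: {err:#}", conn.target_address);
            break;
        }
    }
    println!("{}", conn.connection_stats().await);
}
```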