Merge pull request #114 from blockworks-foundation/custom_tpu_bugfixes
Try connection 10 times, adding timeouts for connect, writeall and fi…
This commit is contained in:
commit
b0856507f1
|
@ -310,6 +310,17 @@ dependencies = [
|
||||||
"event-listener",
|
"event-listener",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "async-recursion"
|
||||||
|
version = "1.0.4"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "0e97ce7de6cf12de5d7226c73f5ba9811622f4db3a5b91b55c53e987e5f91cba"
|
||||||
|
dependencies = [
|
||||||
|
"proc-macro2 1.0.56",
|
||||||
|
"quote 1.0.26",
|
||||||
|
"syn 2.0.13",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "async-trait"
|
name = "async-trait"
|
||||||
version = "0.1.64"
|
version = "0.1.64"
|
||||||
|
@ -2247,6 +2258,7 @@ version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
"async-channel",
|
"async-channel",
|
||||||
|
"async-recursion",
|
||||||
"base64 0.21.0",
|
"base64 0.21.0",
|
||||||
"bench",
|
"bench",
|
||||||
"bincode",
|
"bincode",
|
||||||
|
|
|
@ -45,6 +45,7 @@ dotenv = "0.15.0"
|
||||||
async-channel = "1.8.0"
|
async-channel = "1.8.0"
|
||||||
quinn = "0.9.3"
|
quinn = "0.9.3"
|
||||||
rustls = { version = "0.20.6", default-features = false }
|
rustls = { version = "0.20.6", default-features = false }
|
||||||
|
async-recursion = "1.0.4"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
bench = { path = "./bench" }
|
bench = { path = "./bench" }
|
||||||
|
@ -84,3 +85,4 @@ dotenv = { workspace = true }
|
||||||
async-channel = { workspace = true }
|
async-channel = { workspace = true }
|
||||||
quinn = { workspace = true }
|
quinn = { workspace = true }
|
||||||
rustls = { workspace = true }
|
rustls = { workspace = true }
|
||||||
|
async-recursion = { workspace = true }
|
||||||
|
|
|
@ -8,12 +8,13 @@ use std::{
|
||||||
time::Duration,
|
time::Duration,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
use async_recursion::async_recursion;
|
||||||
use dashmap::DashMap;
|
use dashmap::DashMap;
|
||||||
use log::{error, info, trace, warn};
|
use log::{error, info, trace, warn};
|
||||||
use prometheus::{core::GenericGauge, opts, register_int_gauge};
|
use prometheus::{core::GenericGauge, opts, register_int_gauge};
|
||||||
use quinn::{
|
use quinn::{
|
||||||
ClientConfig, Connection, ConnectionError, Endpoint, EndpointConfig, IdleTimeout, TokioRuntime,
|
ClientConfig, Connection, ConnectionError, Endpoint, EndpointConfig, IdleTimeout, SendStream,
|
||||||
TransportConfig,
|
TokioRuntime, TransportConfig,
|
||||||
};
|
};
|
||||||
use solana_sdk::pubkey::Pubkey;
|
use solana_sdk::pubkey::Pubkey;
|
||||||
use tokio::{
|
use tokio::{
|
||||||
|
@ -24,7 +25,8 @@ use tokio::{
|
||||||
use super::rotating_queue::RotatingQueue;
|
use super::rotating_queue::RotatingQueue;
|
||||||
|
|
||||||
pub const ALPN_TPU_PROTOCOL_ID: &[u8] = b"solana-tpu";
|
pub const ALPN_TPU_PROTOCOL_ID: &[u8] = b"solana-tpu";
|
||||||
const QUIC_CONNECTION_TIMEOUT_DURATION_IN_SEC: u64 = 5;
|
const QUIC_CONNECTION_TIMEOUT_DURATION_IN_SEC: Duration = Duration::from_secs(10);
|
||||||
|
const CONNECTION_RETRY_COUNT: usize = 10;
|
||||||
|
|
||||||
lazy_static::lazy_static! {
|
lazy_static::lazy_static! {
|
||||||
static ref NB_QUIC_CONNECTIONS: GenericGauge<prometheus::core::AtomicI64> =
|
static ref NB_QUIC_CONNECTIONS: GenericGauge<prometheus::core::AtomicI64> =
|
||||||
|
@ -54,13 +56,7 @@ impl ActiveConnection {
|
||||||
|
|
||||||
async fn make_connection(endpoint: Endpoint, addr: SocketAddr) -> anyhow::Result<Connection> {
|
async fn make_connection(endpoint: Endpoint, addr: SocketAddr) -> anyhow::Result<Connection> {
|
||||||
let connecting = endpoint.connect(addr, "connect")?;
|
let connecting = endpoint.connect(addr, "connect")?;
|
||||||
|
let res = timeout(QUIC_CONNECTION_TIMEOUT_DURATION_IN_SEC, connecting).await?;
|
||||||
// let res = timeout(
|
|
||||||
// Duration::from_secs(QUIC_CONNECTION_TIMEOUT_DURATION_IN_SEC),
|
|
||||||
// connecting,
|
|
||||||
// )
|
|
||||||
// .await?;
|
|
||||||
let res = connecting.await;
|
|
||||||
Ok(res.unwrap())
|
Ok(res.unwrap())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -71,23 +67,15 @@ impl ActiveConnection {
|
||||||
let connecting = endpoint.connect(addr, "connect")?;
|
let connecting = endpoint.connect(addr, "connect")?;
|
||||||
let connection = match connecting.into_0rtt() {
|
let connection = match connecting.into_0rtt() {
|
||||||
Ok((connection, zero_rtt)) => {
|
Ok((connection, zero_rtt)) => {
|
||||||
if let Ok(_) = timeout(
|
if let Ok(_) = timeout(QUIC_CONNECTION_TIMEOUT_DURATION_IN_SEC, zero_rtt).await {
|
||||||
Duration::from_secs(QUIC_CONNECTION_TIMEOUT_DURATION_IN_SEC),
|
|
||||||
zero_rtt,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
{
|
|
||||||
connection
|
connection
|
||||||
} else {
|
} else {
|
||||||
return Err(ConnectionError::TimedOut.into());
|
return Err(ConnectionError::TimedOut.into());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(connecting) => {
|
Err(connecting) => {
|
||||||
if let Ok(connecting_result) = timeout(
|
if let Ok(connecting_result) =
|
||||||
Duration::from_millis(QUIC_CONNECTION_TIMEOUT_DURATION_IN_SEC),
|
timeout(QUIC_CONNECTION_TIMEOUT_DURATION_IN_SEC, connecting).await
|
||||||
connecting,
|
|
||||||
)
|
|
||||||
.await
|
|
||||||
{
|
{
|
||||||
connecting_result?
|
connecting_result?
|
||||||
} else {
|
} else {
|
||||||
|
@ -98,6 +86,98 @@ impl ActiveConnection {
|
||||||
Ok(connection)
|
Ok(connection)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn connect(
|
||||||
|
identity: Pubkey,
|
||||||
|
already_connected: bool,
|
||||||
|
endpoint: Endpoint,
|
||||||
|
addr: SocketAddr,
|
||||||
|
exit_signal: Arc<AtomicBool>,
|
||||||
|
) -> Option<Arc<Connection>> {
|
||||||
|
for _i in 0..CONNECTION_RETRY_COUNT {
|
||||||
|
let conn = if already_connected {
|
||||||
|
info!("making make_connection_0rtt");
|
||||||
|
Self::make_connection_0rtt(endpoint.clone(), addr.clone()).await
|
||||||
|
} else {
|
||||||
|
info!("making make_connection");
|
||||||
|
Self::make_connection(endpoint.clone(), addr.clone()).await
|
||||||
|
};
|
||||||
|
match conn {
|
||||||
|
Ok(conn) => {
|
||||||
|
NB_QUIC_CONNECTIONS.inc();
|
||||||
|
return Some(Arc::new(conn));
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
trace!("Could not connect to {} because of error {}", identity, e);
|
||||||
|
if exit_signal.load(Ordering::Relaxed) {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None
|
||||||
|
}
|
||||||
|
|
||||||
|
#[async_recursion]
|
||||||
|
async fn open_unistream(
|
||||||
|
connection: &mut Option<Arc<Connection>>,
|
||||||
|
reconnect: bool,
|
||||||
|
identity: Pubkey,
|
||||||
|
already_connected: bool,
|
||||||
|
endpoint: Endpoint,
|
||||||
|
addr: SocketAddr,
|
||||||
|
exit_signal: Arc<AtomicBool>,
|
||||||
|
) -> Option<SendStream> {
|
||||||
|
let (unistream, reconnect_and_try_again) = match connection {
|
||||||
|
Some(conn) => {
|
||||||
|
let unistream_maybe_timeout =
|
||||||
|
timeout(QUIC_CONNECTION_TIMEOUT_DURATION_IN_SEC, conn.open_uni()).await;
|
||||||
|
match unistream_maybe_timeout {
|
||||||
|
Ok(unistream_res) => match unistream_res {
|
||||||
|
Ok(unistream) => (Some(unistream), false),
|
||||||
|
Err(_) => (None, reconnect),
|
||||||
|
},
|
||||||
|
Err(_) => {
|
||||||
|
// timed out
|
||||||
|
(None, false)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
None => (None, true),
|
||||||
|
};
|
||||||
|
|
||||||
|
if reconnect_and_try_again {
|
||||||
|
let conn = Self::connect(
|
||||||
|
identity,
|
||||||
|
already_connected,
|
||||||
|
endpoint.clone(),
|
||||||
|
addr.clone(),
|
||||||
|
exit_signal.clone(),
|
||||||
|
)
|
||||||
|
.await;
|
||||||
|
match conn {
|
||||||
|
Some(conn) => {
|
||||||
|
*connection = Some(conn);
|
||||||
|
Self::open_unistream(
|
||||||
|
connection,
|
||||||
|
false,
|
||||||
|
identity,
|
||||||
|
already_connected,
|
||||||
|
endpoint,
|
||||||
|
addr,
|
||||||
|
exit_signal,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
// connection with the peer is not possible
|
||||||
|
None
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
unistream
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async fn listen(
|
async fn listen(
|
||||||
transaction_reciever: Receiver<Vec<u8>>,
|
transaction_reciever: Receiver<Vec<u8>>,
|
||||||
exit_oneshot_channel: tokio::sync::mpsc::Receiver<()>,
|
exit_oneshot_channel: tokio::sync::mpsc::Receiver<()>,
|
||||||
|
@ -108,9 +188,10 @@ impl ActiveConnection {
|
||||||
) {
|
) {
|
||||||
NB_QUIC_TASKS.inc();
|
NB_QUIC_TASKS.inc();
|
||||||
let mut already_connected = false;
|
let mut already_connected = false;
|
||||||
let mut connection: Option<Connection> = None;
|
let mut connection: Option<Arc<Connection>> = None;
|
||||||
let mut transaction_reciever = transaction_reciever;
|
let mut transaction_reciever = transaction_reciever;
|
||||||
let mut exit_oneshot_channel = exit_oneshot_channel;
|
let mut exit_oneshot_channel = exit_oneshot_channel;
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
// exit signal set
|
// exit signal set
|
||||||
if exit_signal.load(Ordering::Relaxed) {
|
if exit_signal.load(Ordering::Relaxed) {
|
||||||
|
@ -118,7 +199,7 @@ impl ActiveConnection {
|
||||||
}
|
}
|
||||||
|
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
tx_or_timeout = timeout(Duration::from_secs(QUIC_CONNECTION_TIMEOUT_DURATION_IN_SEC), transaction_reciever.recv() ) => {
|
tx_or_timeout = timeout(QUIC_CONNECTION_TIMEOUT_DURATION_IN_SEC, transaction_reciever.recv() ) => {
|
||||||
// exit signal set
|
// exit signal set
|
||||||
if exit_signal.load(Ordering::Relaxed) {
|
if exit_signal.load(Ordering::Relaxed) {
|
||||||
break;
|
break;
|
||||||
|
@ -136,77 +217,66 @@ impl ActiveConnection {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
let mut send_stream = match &connection {
|
let unistream = Self::open_unistream(
|
||||||
Some(conn) => {
|
&mut connection,
|
||||||
let unistream = conn.open_uni().await;
|
true,
|
||||||
if let Err(_) = unistream {
|
identity.clone(),
|
||||||
// reconnect as connection is closed by the server and then retry
|
already_connected,
|
||||||
let conn = Self::make_connection_0rtt(endpoint.clone(), addr.clone()).await;
|
endpoint.clone(),
|
||||||
match conn {
|
addr.clone(),
|
||||||
Ok(conn) => {
|
exit_signal.clone(),
|
||||||
let unistream = conn.open_uni().await;
|
).await;
|
||||||
connection = Some(conn);
|
|
||||||
match unistream{
|
if !already_connected && connection.is_some() {
|
||||||
Ok(stream) => stream,
|
already_connected = true;
|
||||||
Err(e) => {
|
|
||||||
warn!("error opening a unistream for {} error {}", identity, e);
|
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
match unistream {
|
||||||
|
Some(mut send_stream) => {
|
||||||
|
trace!("Sending {} transaction", identity);
|
||||||
|
let write_timeout_res = timeout( QUIC_CONNECTION_TIMEOUT_DURATION_IN_SEC, send_stream.write_all(tx.as_slice())).await;
|
||||||
|
match write_timeout_res {
|
||||||
|
Ok(write_res) => {
|
||||||
|
if let Err(e) = write_res {
|
||||||
|
warn!(
|
||||||
|
"Error while writing transaction for {}, error {}",
|
||||||
|
identity,
|
||||||
|
e
|
||||||
|
);
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
Err(e) => {
|
Err(_) => {
|
||||||
// validator no longer accepting connection
|
warn!(
|
||||||
trace!("Could not reconnect to {} because of error {}", identity, e);
|
"timeout while writing transaction for {}",
|
||||||
break;
|
identity
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
unistream.unwrap()
|
let finish_timeout_res = timeout(QUIC_CONNECTION_TIMEOUT_DURATION_IN_SEC, send_stream.finish()).await;
|
||||||
|
match finish_timeout_res {
|
||||||
|
Ok(finish_res) => {
|
||||||
|
if let Err(e) = finish_res {
|
||||||
|
warn!(
|
||||||
|
"Error while writing transaction for {}, error {}",
|
||||||
|
identity,
|
||||||
|
e
|
||||||
|
);
|
||||||
|
}
|
||||||
|
},
|
||||||
|
Err(_) => {
|
||||||
|
warn!(
|
||||||
|
"timeout while writing transaction for {}",
|
||||||
|
identity
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
None => {
|
None => {
|
||||||
let conn = if already_connected {
|
trace!("could not create a unistream for {}", identity);
|
||||||
info!("making make_connection_0rtt");
|
|
||||||
Self::make_connection_0rtt(endpoint.clone(), addr.clone()).await
|
|
||||||
} else {
|
|
||||||
info!("making make_connection");
|
|
||||||
Self::make_connection(endpoint.clone(), addr.clone()).await
|
|
||||||
};
|
|
||||||
match conn {
|
|
||||||
Ok(conn) => {
|
|
||||||
NB_QUIC_CONNECTIONS.inc();
|
|
||||||
already_connected = true;
|
|
||||||
let unistream = conn.open_uni().await;
|
|
||||||
if let Err(e) = unistream {
|
|
||||||
warn!("error opening a unistream for {} error {}", identity, e);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
connection = Some(conn);
|
|
||||||
unistream.unwrap()
|
|
||||||
},
|
|
||||||
Err(e) => {
|
|
||||||
warn!("Could not connect to {} because of error {}", identity, e);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
trace!("Sending {} transaction", identity);
|
|
||||||
if let Err(e) = send_stream.write_all(tx.as_slice()).await {
|
|
||||||
warn!(
|
|
||||||
"Error while writing transaction for {} error {}",
|
|
||||||
identity, e
|
|
||||||
);
|
|
||||||
}
|
|
||||||
if let Err(e) = send_stream.finish().await {
|
|
||||||
warn!(
|
|
||||||
"Error finishing for {}, error {}",
|
|
||||||
identity, e,
|
|
||||||
)
|
|
||||||
}
|
|
||||||
},
|
},
|
||||||
Err(_) => {
|
Err(_) => {
|
||||||
// timed out
|
// timed out
|
||||||
|
|
Loading…
Reference in New Issue