Connection pool support in connection cache and QUIC connection reliability improvement (#25793)
* Connection pool in connection cache and handle connection errors 1. The connection not has a pool of connections per address, configurable, default 4 2. The connections per address share a lazy initialized endpoint 3. Handle connection issues better, avoid race conditions 4. Various log improvement for help debug connection issues
This commit is contained in:
parent
bea35d5fbe
commit
29b597cea5
|
@ -5,7 +5,7 @@ use {
|
|||
log::*,
|
||||
rand::{thread_rng, Rng},
|
||||
rayon::prelude::*,
|
||||
solana_client::connection_cache::ConnectionCache,
|
||||
solana_client::connection_cache::{ConnectionCache, DEFAULT_TPU_CONNECTION_POOL_SIZE},
|
||||
solana_core::banking_stage::BankingStage,
|
||||
solana_gossip::cluster_info::{ClusterInfo, Node},
|
||||
solana_ledger::{
|
||||
|
@ -352,7 +352,10 @@ fn main() {
|
|||
None,
|
||||
replay_vote_sender,
|
||||
Arc::new(RwLock::new(CostModel::default())),
|
||||
Arc::new(ConnectionCache::new(tpu_use_quic)),
|
||||
Arc::new(ConnectionCache::new(
|
||||
tpu_use_quic,
|
||||
DEFAULT_TPU_CONNECTION_POOL_SIZE,
|
||||
)),
|
||||
);
|
||||
poh_recorder.lock().unwrap().set_bank(&bank);
|
||||
|
||||
|
|
|
@ -2,7 +2,7 @@ use {
|
|||
clap::{crate_description, crate_name, App, Arg, ArgMatches},
|
||||
solana_clap_utils::input_validators::{is_url, is_url_or_moniker},
|
||||
solana_cli_config::{ConfigInput, CONFIG_FILE},
|
||||
solana_client::connection_cache::DEFAULT_TPU_USE_QUIC,
|
||||
solana_client::connection_cache::{DEFAULT_TPU_CONNECTION_POOL_SIZE, DEFAULT_TPU_USE_QUIC},
|
||||
solana_sdk::{
|
||||
fee_calculator::FeeRateGovernor,
|
||||
pubkey::Pubkey,
|
||||
|
@ -53,6 +53,7 @@ pub struct Config {
|
|||
pub target_node: Option<Pubkey>,
|
||||
pub external_client_type: ExternalClientType,
|
||||
pub use_quic: bool,
|
||||
pub tpu_connection_pool_size: usize,
|
||||
}
|
||||
|
||||
impl Default for Config {
|
||||
|
@ -79,6 +80,7 @@ impl Default for Config {
|
|||
target_node: None,
|
||||
external_client_type: ExternalClientType::default(),
|
||||
use_quic: DEFAULT_TPU_USE_QUIC,
|
||||
tpu_connection_pool_size: DEFAULT_TPU_CONNECTION_POOL_SIZE,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -294,6 +296,13 @@ pub fn build_args<'a, 'b>(version: &'b str) -> App<'a, 'b> {
|
|||
.help("Submit transactions via QUIC; only affects ThinClient (default) \
|
||||
or TpuClient sends"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("tpu_connection_pool_size")
|
||||
.long("tpu-connection-pool-size")
|
||||
.takes_value(true)
|
||||
.help("Controls the connection pool size per remote address; only affects ThinClient (default) \
|
||||
or TpuClient sends"),
|
||||
)
|
||||
}
|
||||
|
||||
/// Parses a clap `ArgMatches` structure into a `Config`
|
||||
|
@ -343,6 +352,13 @@ pub fn extract_args(matches: &ArgMatches) -> Config {
|
|||
args.use_quic = true;
|
||||
}
|
||||
|
||||
if let Some(v) = matches.value_of("tpu_connection_pool_size") {
|
||||
args.tpu_connection_pool_size = v
|
||||
.to_string()
|
||||
.parse()
|
||||
.expect("can't parse tpu_connection_pool_size");
|
||||
}
|
||||
|
||||
if let Some(addr) = matches.value_of("entrypoint") {
|
||||
args.entrypoint_addr = solana_net_utils::parse_host_port(addr).unwrap_or_else(|e| {
|
||||
eprintln!("failed to parse entrypoint address: {}", e);
|
||||
|
|
|
@ -49,6 +49,7 @@ fn main() {
|
|||
target_node,
|
||||
external_client_type,
|
||||
use_quic,
|
||||
tpu_connection_pool_size,
|
||||
..
|
||||
} = &cli_config;
|
||||
|
||||
|
@ -102,7 +103,9 @@ fn main() {
|
|||
do_bench_tps(client, cli_config, keypairs);
|
||||
}
|
||||
ExternalClientType::ThinClient => {
|
||||
let connection_cache = Arc::new(ConnectionCache::new(*use_quic));
|
||||
let connection_cache =
|
||||
Arc::new(ConnectionCache::new(*use_quic, *tpu_connection_pool_size));
|
||||
|
||||
let client = if let Ok(rpc_addr) = value_t!(matches, "rpc_addr", String) {
|
||||
let rpc = rpc_addr.parse().unwrap_or_else(|e| {
|
||||
eprintln!("RPC address should parse as socketaddr {:?}", e);
|
||||
|
@ -172,7 +175,9 @@ fn main() {
|
|||
json_rpc_url.to_string(),
|
||||
CommitmentConfig::confirmed(),
|
||||
));
|
||||
let connection_cache = Arc::new(ConnectionCache::new(*use_quic));
|
||||
let connection_cache =
|
||||
Arc::new(ConnectionCache::new(*use_quic, *tpu_connection_pool_size));
|
||||
|
||||
let client = Arc::new(
|
||||
TpuClient::new_with_connection_cache(
|
||||
rpc_client,
|
||||
|
|
|
@ -1,17 +1,18 @@
|
|||
use {
|
||||
crate::{
|
||||
nonblocking::quic_client::QuicLazyInitializedEndpoint,
|
||||
quic_client::QuicTpuConnection,
|
||||
tpu_connection::{ClientStats, Connection},
|
||||
udp_client::UdpTpuConnection,
|
||||
},
|
||||
indexmap::map::IndexMap,
|
||||
indexmap::map::{Entry, IndexMap},
|
||||
rand::{thread_rng, Rng},
|
||||
solana_measure::measure::Measure,
|
||||
solana_sdk::timing::AtomicInterval,
|
||||
solana_sdk::{quic::QUIC_PORT_OFFSET, timing::AtomicInterval},
|
||||
std::{
|
||||
net::SocketAddr,
|
||||
sync::{
|
||||
atomic::{AtomicBool, AtomicU64, Ordering},
|
||||
atomic::{AtomicU64, Ordering},
|
||||
Arc, RwLock,
|
||||
},
|
||||
},
|
||||
|
@ -24,6 +25,9 @@ static MAX_CONNECTIONS: usize = 1024;
|
|||
/// QUIC connections.
|
||||
pub const DEFAULT_TPU_USE_QUIC: bool = false;
|
||||
|
||||
/// Default TPU connection pool size per remote address
|
||||
pub const DEFAULT_TPU_CONNECTION_POOL_SIZE: usize = 4;
|
||||
|
||||
#[derive(Default)]
|
||||
pub struct ConnectionCacheStats {
|
||||
cache_hits: AtomicU64,
|
||||
|
@ -214,22 +218,144 @@ impl ConnectionCacheStats {
|
|||
}
|
||||
|
||||
pub struct ConnectionCache {
|
||||
map: RwLock<IndexMap<SocketAddr, Arc<Connection>>>,
|
||||
map: RwLock<IndexMap<SocketAddr, ConnectionPool>>,
|
||||
stats: Arc<ConnectionCacheStats>,
|
||||
last_stats: AtomicInterval,
|
||||
use_quic: AtomicBool,
|
||||
use_quic: bool,
|
||||
connection_pool_size: usize,
|
||||
}
|
||||
|
||||
/// Models the pool of connections
|
||||
struct ConnectionPool {
|
||||
/// The connections in the pool
|
||||
connections: Vec<Arc<Connection>>,
|
||||
|
||||
/// Connections in this pool share the same endpoint
|
||||
endpoint: Option<Arc<QuicLazyInitializedEndpoint>>,
|
||||
}
|
||||
|
||||
impl ConnectionPool {
|
||||
/// Get a connection from the pool. It must have at least one connection in the pool.
|
||||
/// This randomly picks a connection in the pool.
|
||||
fn borrow_connection(&self) -> Arc<Connection> {
|
||||
let mut rng = thread_rng();
|
||||
let n = rng.gen_range(0, self.connections.len());
|
||||
self.connections[n].clone()
|
||||
}
|
||||
|
||||
/// Check if we need to create a new connection. If the count of the connections
|
||||
/// is smaller than the pool size.
|
||||
fn need_new_connection(&self, required_pool_size: usize) -> bool {
|
||||
self.connections.len() < required_pool_size
|
||||
}
|
||||
}
|
||||
|
||||
impl ConnectionCache {
|
||||
pub fn new(use_quic: bool) -> Self {
|
||||
pub fn new(use_quic: bool, connection_pool_size: usize) -> Self {
|
||||
// The minimum pool size is 1.
|
||||
let connection_pool_size = 1.max(connection_pool_size);
|
||||
Self {
|
||||
use_quic: AtomicBool::new(use_quic),
|
||||
use_quic,
|
||||
connection_pool_size,
|
||||
..Self::default()
|
||||
}
|
||||
}
|
||||
|
||||
pub fn get_use_quic(&self) -> bool {
|
||||
self.use_quic.load(Ordering::Relaxed)
|
||||
self.use_quic
|
||||
}
|
||||
|
||||
fn create_endpoint(&self) -> Option<Arc<QuicLazyInitializedEndpoint>> {
|
||||
if self.use_quic {
|
||||
Some(Arc::new(QuicLazyInitializedEndpoint::new()))
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
/// Create a lazy connection object under the exclusive lock of the cache map if there is not
|
||||
/// enough unsed connections in the connection pool for the specified address.
|
||||
/// Returns CreateConnectionResult.
|
||||
fn create_connection(
|
||||
&self,
|
||||
lock_timing_ms: &mut u64,
|
||||
addr: &SocketAddr,
|
||||
) -> CreateConnectionResult {
|
||||
let mut get_connection_map_lock_measure = Measure::start("get_connection_map_lock_measure");
|
||||
let mut map = self.map.write().unwrap();
|
||||
get_connection_map_lock_measure.stop();
|
||||
*lock_timing_ms = lock_timing_ms.saturating_add(get_connection_map_lock_measure.as_ms());
|
||||
// Read again, as it is possible that between read lock dropped and the write lock acquired
|
||||
// another thread could have setup the connection.
|
||||
|
||||
let (to_create_connection, endpoint) =
|
||||
map.get(addr)
|
||||
.map_or((true, self.create_endpoint()), |pool| {
|
||||
(
|
||||
pool.need_new_connection(self.connection_pool_size),
|
||||
pool.endpoint.clone(),
|
||||
)
|
||||
});
|
||||
|
||||
let (cache_hit, connection_cache_stats, num_evictions, eviction_timing_ms) =
|
||||
if to_create_connection {
|
||||
let connection: Connection = if self.use_quic {
|
||||
QuicTpuConnection::new(
|
||||
endpoint.as_ref().unwrap().clone(),
|
||||
*addr,
|
||||
self.stats.clone(),
|
||||
)
|
||||
.into()
|
||||
} else {
|
||||
UdpTpuConnection::new(*addr, self.stats.clone()).into()
|
||||
};
|
||||
|
||||
let connection = Arc::new(connection);
|
||||
|
||||
// evict a connection if the cache is reaching upper bounds
|
||||
let mut num_evictions = 0;
|
||||
let mut get_connection_cache_eviction_measure =
|
||||
Measure::start("get_connection_cache_eviction_measure");
|
||||
while map.len() >= MAX_CONNECTIONS {
|
||||
let mut rng = thread_rng();
|
||||
let n = rng.gen_range(0, MAX_CONNECTIONS);
|
||||
map.swap_remove_index(n);
|
||||
num_evictions += 1;
|
||||
}
|
||||
get_connection_cache_eviction_measure.stop();
|
||||
|
||||
match map.entry(*addr) {
|
||||
Entry::Occupied(mut entry) => {
|
||||
let pool = entry.get_mut();
|
||||
pool.connections.push(connection);
|
||||
}
|
||||
Entry::Vacant(entry) => {
|
||||
entry.insert(ConnectionPool {
|
||||
connections: vec![connection],
|
||||
endpoint,
|
||||
});
|
||||
}
|
||||
}
|
||||
(
|
||||
false,
|
||||
self.stats.clone(),
|
||||
num_evictions,
|
||||
get_connection_cache_eviction_measure.as_ms(),
|
||||
)
|
||||
} else {
|
||||
(true, self.stats.clone(), 0, 0)
|
||||
};
|
||||
|
||||
let pool = map.get(addr).unwrap();
|
||||
let connection = pool.borrow_connection();
|
||||
|
||||
CreateConnectionResult {
|
||||
connection,
|
||||
cache_hit,
|
||||
connection_cache_stats,
|
||||
num_evictions,
|
||||
eviction_timing_ms,
|
||||
}
|
||||
}
|
||||
|
||||
fn get_or_add_connection(&self, addr: &SocketAddr) -> GetConnectionResult {
|
||||
|
@ -237,6 +363,10 @@ impl ConnectionCache {
|
|||
let map = self.map.read().unwrap();
|
||||
get_connection_map_lock_measure.stop();
|
||||
|
||||
let port_offset = if self.use_quic { QUIC_PORT_OFFSET } else { 0 };
|
||||
|
||||
let addr = SocketAddr::new(addr.ip(), addr.port() + port_offset);
|
||||
|
||||
let mut lock_timing_ms = get_connection_map_lock_measure.as_ms();
|
||||
|
||||
let report_stats = self
|
||||
|
@ -244,57 +374,35 @@ impl ConnectionCache {
|
|||
.should_update(CONNECTION_STAT_SUBMISSION_INTERVAL);
|
||||
|
||||
let mut get_connection_map_measure = Measure::start("get_connection_hit_measure");
|
||||
let (connection, cache_hit, connection_cache_stats, num_evictions, eviction_timing_ms) =
|
||||
match map.get(addr) {
|
||||
Some(connection) => (connection.clone(), true, self.stats.clone(), 0, 0),
|
||||
None => {
|
||||
// Upgrade to write access by dropping read lock and acquire write lock
|
||||
let CreateConnectionResult {
|
||||
connection,
|
||||
cache_hit,
|
||||
connection_cache_stats,
|
||||
num_evictions,
|
||||
eviction_timing_ms,
|
||||
} = match map.get(&addr) {
|
||||
Some(pool) => {
|
||||
if pool.need_new_connection(self.connection_pool_size) {
|
||||
// create more connection and put it in the pool
|
||||
drop(map);
|
||||
let mut get_connection_map_lock_measure =
|
||||
Measure::start("get_connection_map_lock_measure");
|
||||
let mut map = self.map.write().unwrap();
|
||||
get_connection_map_lock_measure.stop();
|
||||
|
||||
lock_timing_ms =
|
||||
lock_timing_ms.saturating_add(get_connection_map_lock_measure.as_ms());
|
||||
|
||||
// Read again, as it is possible that between read lock dropped and the write lock acquired
|
||||
// another thread could have setup the connection.
|
||||
match map.get(addr) {
|
||||
Some(connection) => (connection.clone(), true, self.stats.clone(), 0, 0),
|
||||
None => {
|
||||
let connection: Connection = if self.use_quic.load(Ordering::Relaxed) {
|
||||
QuicTpuConnection::new(*addr, self.stats.clone()).into()
|
||||
} else {
|
||||
UdpTpuConnection::new(*addr, self.stats.clone()).into()
|
||||
};
|
||||
|
||||
let connection = Arc::new(connection);
|
||||
|
||||
// evict a connection if the cache is reaching upper bounds
|
||||
let mut num_evictions = 0;
|
||||
let mut get_connection_cache_eviction_measure =
|
||||
Measure::start("get_connection_cache_eviction_measure");
|
||||
while map.len() >= MAX_CONNECTIONS {
|
||||
let mut rng = thread_rng();
|
||||
let n = rng.gen_range(0, MAX_CONNECTIONS);
|
||||
map.swap_remove_index(n);
|
||||
num_evictions += 1;
|
||||
}
|
||||
get_connection_cache_eviction_measure.stop();
|
||||
|
||||
map.insert(*addr, connection.clone());
|
||||
(
|
||||
connection,
|
||||
false,
|
||||
self.stats.clone(),
|
||||
num_evictions,
|
||||
get_connection_cache_eviction_measure.as_ms(),
|
||||
)
|
||||
}
|
||||
self.create_connection(&mut lock_timing_ms, &addr)
|
||||
} else {
|
||||
let connection = pool.borrow_connection();
|
||||
CreateConnectionResult {
|
||||
connection,
|
||||
cache_hit: true,
|
||||
connection_cache_stats: self.stats.clone(),
|
||||
num_evictions: 0,
|
||||
eviction_timing_ms: 0,
|
||||
}
|
||||
}
|
||||
};
|
||||
}
|
||||
None => {
|
||||
// Upgrade to write access by dropping read lock and acquire write lock
|
||||
drop(map);
|
||||
self.create_connection(&mut lock_timing_ms, &addr)
|
||||
}
|
||||
};
|
||||
get_connection_map_measure.stop();
|
||||
|
||||
GetConnectionResult {
|
||||
|
@ -359,13 +467,15 @@ impl ConnectionCache {
|
|||
connection
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for ConnectionCache {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
map: RwLock::new(IndexMap::with_capacity(MAX_CONNECTIONS)),
|
||||
stats: Arc::new(ConnectionCacheStats::default()),
|
||||
last_stats: AtomicInterval::default(),
|
||||
use_quic: AtomicBool::new(DEFAULT_TPU_USE_QUIC),
|
||||
use_quic: DEFAULT_TPU_USE_QUIC,
|
||||
connection_pool_size: DEFAULT_TPU_CONNECTION_POOL_SIZE,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -381,6 +491,14 @@ struct GetConnectionResult {
|
|||
eviction_timing_ms: u64,
|
||||
}
|
||||
|
||||
struct CreateConnectionResult {
|
||||
connection: Arc<Connection>,
|
||||
cache_hit: bool,
|
||||
connection_cache_stats: Arc<ConnectionCacheStats>,
|
||||
num_evictions: u64,
|
||||
eviction_timing_ms: u64,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use {
|
||||
|
@ -432,7 +550,7 @@ mod tests {
|
|||
let map = connection_cache.map.read().unwrap();
|
||||
assert!(map.len() == MAX_CONNECTIONS);
|
||||
addrs.iter().for_each(|a| {
|
||||
let conn = map.get(a).expect("Address not found");
|
||||
let conn = &map.get(a).expect("Address not found").connections[0];
|
||||
assert!(a.ip() == conn.tpu_addr().ip());
|
||||
});
|
||||
}
|
||||
|
|
|
@ -9,6 +9,7 @@ use {
|
|||
async_mutex::Mutex,
|
||||
futures::future::join_all,
|
||||
itertools::Itertools,
|
||||
log::*,
|
||||
quinn::{
|
||||
ClientConfig, Endpoint, EndpointConfig, IdleTimeout, NewConnection, VarInt, WriteError,
|
||||
},
|
||||
|
@ -18,8 +19,10 @@ use {
|
|||
std::{
|
||||
net::{IpAddr, Ipv4Addr, SocketAddr, UdpSocket},
|
||||
sync::{atomic::Ordering, Arc},
|
||||
thread,
|
||||
time::Duration,
|
||||
},
|
||||
tokio::sync::RwLock,
|
||||
};
|
||||
|
||||
struct SkipServerVerification;
|
||||
|
@ -44,18 +47,19 @@ impl rustls::client::ServerCertVerifier for SkipServerVerification {
|
|||
}
|
||||
}
|
||||
|
||||
/// A wrapper over NewConnection with additional capability to create the endpoint as part
|
||||
/// of creating a new connection.
|
||||
#[derive(Clone)]
|
||||
struct QuicNewConnection {
|
||||
endpoint: Endpoint,
|
||||
connection: Arc<NewConnection>,
|
||||
/// A lazy-initialized Quic Endpoint
|
||||
pub struct QuicLazyInitializedEndpoint {
|
||||
endpoint: RwLock<Option<Arc<Endpoint>>>,
|
||||
}
|
||||
|
||||
impl QuicNewConnection {
|
||||
/// Create a QuicNewConnection given the remote address 'addr'.
|
||||
async fn make_connection(addr: SocketAddr, stats: &ClientStats) -> Result<Self, WriteError> {
|
||||
let mut make_connection_measure = Measure::start("make_connection_measure");
|
||||
impl QuicLazyInitializedEndpoint {
|
||||
pub fn new() -> Self {
|
||||
Self {
|
||||
endpoint: RwLock::new(None),
|
||||
}
|
||||
}
|
||||
|
||||
fn create_endpoint() -> Endpoint {
|
||||
let (_, client_socket) = solana_net_utils::bind_in_range(
|
||||
IpAddr::V4(Ipv4Addr::new(0, 0, 0, 0)),
|
||||
VALIDATOR_PORT_RANGE,
|
||||
|
@ -78,6 +82,56 @@ impl QuicNewConnection {
|
|||
transport_config.keep_alive_interval(Some(Duration::from_millis(QUIC_KEEP_ALIVE_MS)));
|
||||
|
||||
endpoint.set_default_client_config(config);
|
||||
endpoint
|
||||
}
|
||||
|
||||
async fn get_endpoint(&self) -> Arc<Endpoint> {
|
||||
let lock = self.endpoint.read().await;
|
||||
let endpoint = lock.as_ref();
|
||||
|
||||
match endpoint {
|
||||
Some(endpoint) => endpoint.clone(),
|
||||
None => {
|
||||
drop(lock);
|
||||
let mut lock = self.endpoint.write().await;
|
||||
let endpoint = lock.as_ref();
|
||||
|
||||
match endpoint {
|
||||
Some(endpoint) => endpoint.clone(),
|
||||
None => {
|
||||
let connection = Arc::new(Self::create_endpoint());
|
||||
*lock = Some(connection.clone());
|
||||
connection
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl Default for QuicLazyInitializedEndpoint {
|
||||
fn default() -> Self {
|
||||
Self::new()
|
||||
}
|
||||
}
|
||||
|
||||
/// A wrapper over NewConnection with additional capability to create the endpoint as part
|
||||
/// of creating a new connection.
|
||||
#[derive(Clone)]
|
||||
struct QuicNewConnection {
|
||||
endpoint: Arc<Endpoint>,
|
||||
connection: Arc<NewConnection>,
|
||||
}
|
||||
|
||||
impl QuicNewConnection {
|
||||
/// Create a QuicNewConnection given the remote address 'addr'.
|
||||
async fn make_connection(
|
||||
endpoint: Arc<QuicLazyInitializedEndpoint>,
|
||||
addr: SocketAddr,
|
||||
stats: &ClientStats,
|
||||
) -> Result<Self, WriteError> {
|
||||
let mut make_connection_measure = Measure::start("make_connection_measure");
|
||||
let endpoint = endpoint.get_endpoint().await;
|
||||
|
||||
let connecting = endpoint.connect(addr, "connect").unwrap();
|
||||
stats.total_connections.fetch_add(1, Ordering::Relaxed);
|
||||
|
@ -132,14 +186,16 @@ impl QuicNewConnection {
|
|||
}
|
||||
|
||||
pub struct QuicClient {
|
||||
endpoint: Arc<QuicLazyInitializedEndpoint>,
|
||||
connection: Arc<Mutex<Option<QuicNewConnection>>>,
|
||||
addr: SocketAddr,
|
||||
stats: Arc<ClientStats>,
|
||||
}
|
||||
|
||||
impl QuicClient {
|
||||
pub fn new(addr: SocketAddr) -> Self {
|
||||
pub fn new(endpoint: Arc<QuicLazyInitializedEndpoint>, addr: SocketAddr) -> Self {
|
||||
Self {
|
||||
endpoint,
|
||||
connection: Arc::new(Mutex::new(None)),
|
||||
addr,
|
||||
stats: Arc::new(ClientStats::default()),
|
||||
|
@ -165,63 +221,131 @@ impl QuicClient {
|
|||
stats: &ClientStats,
|
||||
connection_stats: Arc<ConnectionCacheStats>,
|
||||
) -> Result<Arc<NewConnection>, WriteError> {
|
||||
let connection = {
|
||||
let mut conn_guard = self.connection.lock().await;
|
||||
let mut connection_try_count = 0;
|
||||
let mut last_connection_id = 0;
|
||||
let mut last_error = None;
|
||||
|
||||
let maybe_conn = conn_guard.clone();
|
||||
match maybe_conn {
|
||||
Some(conn) => {
|
||||
stats.connection_reuse.fetch_add(1, Ordering::Relaxed);
|
||||
conn.connection.clone()
|
||||
while connection_try_count < 2 {
|
||||
let connection = {
|
||||
let mut conn_guard = self.connection.lock().await;
|
||||
|
||||
let maybe_conn = conn_guard.as_mut();
|
||||
match maybe_conn {
|
||||
Some(conn) => {
|
||||
if conn.connection.connection.stable_id() == last_connection_id {
|
||||
// this is the problematic connection we had used before, create a new one
|
||||
let conn = conn.make_connection_0rtt(self.addr, stats).await;
|
||||
match conn {
|
||||
Ok(conn) => {
|
||||
info!(
|
||||
"Made 0rtt connection to {} with id {} try_count {}, last_connection_id: {}, last_error: {:?}",
|
||||
self.addr,
|
||||
conn.connection.stable_id(),
|
||||
connection_try_count,
|
||||
last_connection_id,
|
||||
last_error,
|
||||
);
|
||||
connection_try_count += 1;
|
||||
conn
|
||||
}
|
||||
Err(err) => {
|
||||
info!(
|
||||
"Cannot make 0rtt connection to {}, error {:}",
|
||||
self.addr, err
|
||||
);
|
||||
return Err(err);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
stats.connection_reuse.fetch_add(1, Ordering::Relaxed);
|
||||
conn.connection.clone()
|
||||
}
|
||||
}
|
||||
None => {
|
||||
let conn = QuicNewConnection::make_connection(
|
||||
self.endpoint.clone(),
|
||||
self.addr,
|
||||
stats,
|
||||
)
|
||||
.await;
|
||||
match conn {
|
||||
Ok(conn) => {
|
||||
*conn_guard = Some(conn.clone());
|
||||
info!(
|
||||
"Made connection to {} id {} try_count {}",
|
||||
self.addr,
|
||||
conn.connection.connection.stable_id(),
|
||||
connection_try_count
|
||||
);
|
||||
connection_try_count += 1;
|
||||
conn.connection.clone()
|
||||
}
|
||||
Err(err) => {
|
||||
info!("Cannot make connection to {}, error {:}", self.addr, err);
|
||||
return Err(err);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
None => {
|
||||
let conn = QuicNewConnection::make_connection(self.addr, stats).await?;
|
||||
*conn_guard = Some(conn.clone());
|
||||
conn.connection.clone()
|
||||
};
|
||||
|
||||
let new_stats = connection.connection.stats();
|
||||
|
||||
connection_stats
|
||||
.total_client_stats
|
||||
.congestion_events
|
||||
.update_stat(
|
||||
&self.stats.congestion_events,
|
||||
new_stats.path.congestion_events,
|
||||
);
|
||||
|
||||
connection_stats
|
||||
.total_client_stats
|
||||
.tx_streams_blocked_uni
|
||||
.update_stat(
|
||||
&self.stats.tx_streams_blocked_uni,
|
||||
new_stats.frame_tx.streams_blocked_uni,
|
||||
);
|
||||
|
||||
connection_stats
|
||||
.total_client_stats
|
||||
.tx_data_blocked
|
||||
.update_stat(&self.stats.tx_data_blocked, new_stats.frame_tx.data_blocked);
|
||||
|
||||
connection_stats
|
||||
.total_client_stats
|
||||
.tx_acks
|
||||
.update_stat(&self.stats.tx_acks, new_stats.frame_tx.acks);
|
||||
|
||||
last_connection_id = connection.connection.stable_id();
|
||||
match Self::_send_buffer_using_conn(data, &connection).await {
|
||||
Ok(()) => {
|
||||
return Ok(connection);
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let new_stats = connection.connection.stats();
|
||||
|
||||
connection_stats
|
||||
.total_client_stats
|
||||
.congestion_events
|
||||
.update_stat(
|
||||
&self.stats.congestion_events,
|
||||
new_stats.path.congestion_events,
|
||||
);
|
||||
|
||||
connection_stats
|
||||
.total_client_stats
|
||||
.tx_streams_blocked_uni
|
||||
.update_stat(
|
||||
&self.stats.tx_streams_blocked_uni,
|
||||
new_stats.frame_tx.streams_blocked_uni,
|
||||
);
|
||||
|
||||
connection_stats
|
||||
.total_client_stats
|
||||
.tx_data_blocked
|
||||
.update_stat(&self.stats.tx_data_blocked, new_stats.frame_tx.data_blocked);
|
||||
|
||||
connection_stats
|
||||
.total_client_stats
|
||||
.tx_acks
|
||||
.update_stat(&self.stats.tx_acks, new_stats.frame_tx.acks);
|
||||
|
||||
match Self::_send_buffer_using_conn(data, &connection).await {
|
||||
Ok(()) => Ok(connection),
|
||||
_ => {
|
||||
let connection = {
|
||||
let mut conn_guard = self.connection.lock().await;
|
||||
let conn = conn_guard.as_mut().unwrap();
|
||||
conn.make_connection_0rtt(self.addr, stats).await?
|
||||
};
|
||||
Self::_send_buffer_using_conn(data, &connection).await?;
|
||||
Ok(connection)
|
||||
Err(err) => match err {
|
||||
WriteError::ConnectionLost(_) => {
|
||||
last_error = Some(err);
|
||||
}
|
||||
_ => {
|
||||
info!(
|
||||
"Error sending to {} with id {}, error {:?} thread: {:?}",
|
||||
self.addr,
|
||||
connection.connection.stable_id(),
|
||||
err,
|
||||
thread::current().id(),
|
||||
);
|
||||
return Err(err);
|
||||
}
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
// if we come here, that means we have exhausted maximum retries, return the error
|
||||
info!(
|
||||
"Ran into an error sending transactions {:?}, exhausted retries to {}",
|
||||
last_error, self.addr
|
||||
);
|
||||
Err(last_error.unwrap())
|
||||
}
|
||||
|
||||
pub async fn send_buffer<T>(
|
||||
|
|
|
@ -4,12 +4,12 @@
|
|||
use {
|
||||
crate::{
|
||||
connection_cache::ConnectionCacheStats,
|
||||
nonblocking::quic_client::QuicClient,
|
||||
nonblocking::quic_client::{QuicClient, QuicLazyInitializedEndpoint},
|
||||
tpu_connection::{ClientStats, TpuConnection},
|
||||
},
|
||||
lazy_static::lazy_static,
|
||||
log::*,
|
||||
solana_sdk::{quic::QUIC_PORT_OFFSET, transport::Result as TransportResult},
|
||||
solana_sdk::transport::Result as TransportResult,
|
||||
std::{net::SocketAddr, sync::Arc},
|
||||
tokio::runtime::Runtime,
|
||||
};
|
||||
|
@ -31,9 +31,12 @@ impl QuicTpuConnection {
|
|||
self.client.stats()
|
||||
}
|
||||
|
||||
pub fn new(tpu_addr: SocketAddr, connection_stats: Arc<ConnectionCacheStats>) -> Self {
|
||||
let tpu_addr = SocketAddr::new(tpu_addr.ip(), tpu_addr.port() + QUIC_PORT_OFFSET);
|
||||
let client = Arc::new(QuicClient::new(tpu_addr));
|
||||
pub fn new(
|
||||
endpoint: Arc<QuicLazyInitializedEndpoint>,
|
||||
tpu_addr: SocketAddr,
|
||||
connection_stats: Arc<ConnectionCacheStats>,
|
||||
) -> Self {
|
||||
let client = Arc::new(QuicClient::new(endpoint, tpu_addr));
|
||||
|
||||
Self {
|
||||
client,
|
||||
|
@ -74,7 +77,11 @@ impl TpuConnection for QuicTpuConnection {
|
|||
let send_buffer =
|
||||
client.send_buffer(wire_transaction, &stats, connection_stats.clone());
|
||||
if let Err(e) = send_buffer.await {
|
||||
warn!("Failed to send transaction async to {:?}", e);
|
||||
warn!(
|
||||
"Failed to send transaction async to {}, error: {:?} ",
|
||||
client.tpu_addr(),
|
||||
e
|
||||
);
|
||||
datapoint_warn!("send-wire-async", ("failure", 1, i64),);
|
||||
connection_stats.add_client_stats(&stats, 1, false);
|
||||
} else {
|
||||
|
|
|
@ -3,10 +3,11 @@ mod tests {
|
|||
use {
|
||||
crossbeam_channel::unbounded,
|
||||
solana_client::{
|
||||
connection_cache::ConnectionCacheStats, quic_client::QuicTpuConnection,
|
||||
connection_cache::ConnectionCacheStats,
|
||||
nonblocking::quic_client::QuicLazyInitializedEndpoint, quic_client::QuicTpuConnection,
|
||||
tpu_connection::TpuConnection,
|
||||
},
|
||||
solana_sdk::{packet::PACKET_DATA_SIZE, quic::QUIC_PORT_OFFSET, signature::Keypair},
|
||||
solana_sdk::{packet::PACKET_DATA_SIZE, signature::Keypair},
|
||||
solana_streamer::quic::{spawn_server, StreamStats},
|
||||
std::{
|
||||
collections::HashMap,
|
||||
|
@ -44,10 +45,14 @@ mod tests {
|
|||
.unwrap();
|
||||
|
||||
let addr = s.local_addr().unwrap().ip();
|
||||
let port = s.local_addr().unwrap().port() - QUIC_PORT_OFFSET;
|
||||
let port = s.local_addr().unwrap().port();
|
||||
let tpu_addr = SocketAddr::new(addr, port);
|
||||
let connection_cache_stats = Arc::new(ConnectionCacheStats::default());
|
||||
let client = QuicTpuConnection::new(tpu_addr, connection_cache_stats);
|
||||
let client = QuicTpuConnection::new(
|
||||
Arc::new(QuicLazyInitializedEndpoint::default()),
|
||||
tpu_addr,
|
||||
connection_cache_stats,
|
||||
);
|
||||
|
||||
// Send a full size packet with single byte writes.
|
||||
let num_bytes = PACKET_DATA_SIZE;
|
||||
|
|
|
@ -381,6 +381,7 @@ impl Validator {
|
|||
start_progress: Arc<RwLock<ValidatorStartProgress>>,
|
||||
socket_addr_space: SocketAddrSpace,
|
||||
use_quic: bool,
|
||||
tpu_connection_pool_size: usize,
|
||||
) -> Self {
|
||||
let id = identity_keypair.pubkey();
|
||||
assert_eq!(id, node.info.id);
|
||||
|
@ -748,7 +749,7 @@ impl Validator {
|
|||
};
|
||||
let poh_recorder = Arc::new(Mutex::new(poh_recorder));
|
||||
|
||||
let connection_cache = Arc::new(ConnectionCache::new(use_quic));
|
||||
let connection_cache = Arc::new(ConnectionCache::new(use_quic, tpu_connection_pool_size));
|
||||
|
||||
let rpc_override_health_check = Arc::new(AtomicBool::new(false));
|
||||
let (
|
||||
|
@ -2047,7 +2048,7 @@ mod tests {
|
|||
use {
|
||||
super::*,
|
||||
crossbeam_channel::{bounded, RecvTimeoutError},
|
||||
solana_client::connection_cache::DEFAULT_TPU_USE_QUIC,
|
||||
solana_client::connection_cache::{DEFAULT_TPU_CONNECTION_POOL_SIZE, DEFAULT_TPU_USE_QUIC},
|
||||
solana_ledger::{create_new_tmp_ledger, genesis_utils::create_genesis_config_with_leader},
|
||||
solana_sdk::{genesis_config::create_genesis_config, poh_config::PohConfig},
|
||||
std::{fs::remove_dir_all, thread, time::Duration},
|
||||
|
@ -2084,6 +2085,7 @@ mod tests {
|
|||
start_progress.clone(),
|
||||
SocketAddrSpace::Unspecified,
|
||||
DEFAULT_TPU_USE_QUIC,
|
||||
DEFAULT_TPU_CONNECTION_POOL_SIZE,
|
||||
);
|
||||
assert_eq!(
|
||||
*start_progress.read().unwrap(),
|
||||
|
@ -2179,6 +2181,7 @@ mod tests {
|
|||
Arc::new(RwLock::new(ValidatorStartProgress::default())),
|
||||
SocketAddrSpace::Unspecified,
|
||||
DEFAULT_TPU_USE_QUIC,
|
||||
DEFAULT_TPU_CONNECTION_POOL_SIZE,
|
||||
)
|
||||
})
|
||||
.collect();
|
||||
|
|
|
@ -44,7 +44,10 @@ use {
|
|||
log::*,
|
||||
rand::{thread_rng, Rng},
|
||||
solana_bench_tps::{bench::generate_and_fund_keypairs, bench_tps_client::BenchTpsClient},
|
||||
solana_client::{connection_cache::ConnectionCache, rpc_client::RpcClient},
|
||||
solana_client::{
|
||||
connection_cache::{ConnectionCache, DEFAULT_TPU_CONNECTION_POOL_SIZE},
|
||||
rpc_client::RpcClient,
|
||||
},
|
||||
solana_core::serve_repair::RepairProtocol,
|
||||
solana_dos::cli::*,
|
||||
solana_gossip::{
|
||||
|
@ -598,7 +601,10 @@ fn main() {
|
|||
exit(1);
|
||||
});
|
||||
|
||||
let connection_cache = Arc::new(ConnectionCache::new(cmd_params.tpu_use_quic));
|
||||
let connection_cache = Arc::new(ConnectionCache::new(
|
||||
cmd_params.tpu_use_quic,
|
||||
DEFAULT_TPU_CONNECTION_POOL_SIZE,
|
||||
));
|
||||
let (client, num_clients) =
|
||||
get_multi_client(&validators, &SocketAddrSpace::Unspecified, connection_cache);
|
||||
if validators.len() < num_clients {
|
||||
|
|
|
@ -7,7 +7,9 @@ use {
|
|||
itertools::izip,
|
||||
log::*,
|
||||
solana_client::{
|
||||
connection_cache::{ConnectionCache, DEFAULT_TPU_USE_QUIC},
|
||||
connection_cache::{
|
||||
ConnectionCache, DEFAULT_TPU_CONNECTION_POOL_SIZE, DEFAULT_TPU_USE_QUIC,
|
||||
},
|
||||
thin_client::ThinClient,
|
||||
},
|
||||
solana_core::{
|
||||
|
@ -80,6 +82,7 @@ pub struct ClusterConfig {
|
|||
pub poh_config: PohConfig,
|
||||
pub additional_accounts: Vec<(Pubkey, AccountSharedData)>,
|
||||
pub tpu_use_quic: bool,
|
||||
pub tpu_connection_pool_size: usize,
|
||||
}
|
||||
|
||||
impl Default for ClusterConfig {
|
||||
|
@ -100,6 +103,7 @@ impl Default for ClusterConfig {
|
|||
skip_warmup_slots: false,
|
||||
additional_accounts: vec![],
|
||||
tpu_use_quic: DEFAULT_TPU_USE_QUIC,
|
||||
tpu_connection_pool_size: DEFAULT_TPU_CONNECTION_POOL_SIZE,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -255,6 +259,7 @@ impl LocalCluster {
|
|||
Arc::new(RwLock::new(ValidatorStartProgress::default())),
|
||||
socket_addr_space,
|
||||
DEFAULT_TPU_USE_QUIC,
|
||||
DEFAULT_TPU_CONNECTION_POOL_SIZE,
|
||||
);
|
||||
|
||||
let mut validators = HashMap::new();
|
||||
|
@ -277,7 +282,10 @@ impl LocalCluster {
|
|||
entry_point_info: leader_contact_info,
|
||||
validators,
|
||||
genesis_config,
|
||||
connection_cache: Arc::new(ConnectionCache::new(config.tpu_use_quic)),
|
||||
connection_cache: Arc::new(ConnectionCache::new(
|
||||
config.tpu_use_quic,
|
||||
config.tpu_connection_pool_size,
|
||||
)),
|
||||
};
|
||||
|
||||
let node_pubkey_to_vote_key: HashMap<Pubkey, Arc<Keypair>> = keys_in_genesis
|
||||
|
@ -450,6 +458,7 @@ impl LocalCluster {
|
|||
Arc::new(RwLock::new(ValidatorStartProgress::default())),
|
||||
socket_addr_space,
|
||||
DEFAULT_TPU_USE_QUIC,
|
||||
DEFAULT_TPU_CONNECTION_POOL_SIZE,
|
||||
);
|
||||
|
||||
let validator_pubkey = validator_keypair.pubkey();
|
||||
|
@ -797,6 +806,7 @@ impl Cluster for LocalCluster {
|
|||
Arc::new(RwLock::new(ValidatorStartProgress::default())),
|
||||
socket_addr_space,
|
||||
DEFAULT_TPU_USE_QUIC,
|
||||
DEFAULT_TPU_CONNECTION_POOL_SIZE,
|
||||
);
|
||||
cluster_validator_info.validator = Some(restarted_node);
|
||||
cluster_validator_info
|
||||
|
|
|
@ -8,7 +8,7 @@ use {
|
|||
solana_account_decoder::UiAccount,
|
||||
solana_client::{
|
||||
client_error::{ClientErrorKind, Result as ClientResult},
|
||||
connection_cache::ConnectionCache,
|
||||
connection_cache::{ConnectionCache, DEFAULT_TPU_CONNECTION_POOL_SIZE},
|
||||
nonblocking::pubsub_client::PubsubClient,
|
||||
rpc_client::RpcClient,
|
||||
rpc_config::{RpcAccountInfoConfig, RpcSignatureSubscribeConfig},
|
||||
|
@ -420,7 +420,10 @@ fn run_tpu_send_transaction(tpu_use_quic: bool) {
|
|||
test_validator.rpc_url(),
|
||||
CommitmentConfig::processed(),
|
||||
));
|
||||
let connection_cache = Arc::new(ConnectionCache::new(tpu_use_quic));
|
||||
let connection_cache = Arc::new(ConnectionCache::new(
|
||||
tpu_use_quic,
|
||||
DEFAULT_TPU_CONNECTION_POOL_SIZE,
|
||||
));
|
||||
let tpu_client = TpuClient::new_with_connection_cache(
|
||||
rpc_client.clone(),
|
||||
&test_validator.rpc_pubsub_url(),
|
||||
|
|
|
@ -2,7 +2,11 @@
|
|||
use {
|
||||
log::*,
|
||||
solana_cli_output::CliAccount,
|
||||
solana_client::{connection_cache::DEFAULT_TPU_USE_QUIC, nonblocking, rpc_client::RpcClient},
|
||||
solana_client::{
|
||||
connection_cache::{DEFAULT_TPU_CONNECTION_POOL_SIZE, DEFAULT_TPU_USE_QUIC},
|
||||
nonblocking,
|
||||
rpc_client::RpcClient,
|
||||
},
|
||||
solana_core::{
|
||||
tower_storage::TowerStorage,
|
||||
validator::{Validator, ValidatorConfig, ValidatorStartProgress},
|
||||
|
@ -749,6 +753,7 @@ impl TestValidator {
|
|||
config.start_progress.clone(),
|
||||
socket_addr_space,
|
||||
DEFAULT_TPU_USE_QUIC,
|
||||
DEFAULT_TPU_CONNECTION_POOL_SIZE,
|
||||
));
|
||||
|
||||
// Needed to avoid panics in `solana-responder-gossip` in tests that create a number of
|
||||
|
|
|
@ -19,8 +19,8 @@ use {
|
|||
keypair::SKIP_SEED_PHRASE_VALIDATION_ARG,
|
||||
},
|
||||
solana_client::{
|
||||
rpc_client::RpcClient, rpc_config::RpcLeaderScheduleConfig,
|
||||
rpc_request::MAX_MULTIPLE_ACCOUNTS,
|
||||
connection_cache::DEFAULT_TPU_CONNECTION_POOL_SIZE, rpc_client::RpcClient,
|
||||
rpc_config::RpcLeaderScheduleConfig, rpc_request::MAX_MULTIPLE_ACCOUNTS,
|
||||
},
|
||||
solana_core::{
|
||||
ledger_cleanup_service::{DEFAULT_MAX_LEDGER_SHREDS, DEFAULT_MIN_MAX_LEDGER_SHREDS},
|
||||
|
@ -468,6 +468,7 @@ pub fn main() {
|
|||
let default_accounts_shrink_ratio = &DEFAULT_ACCOUNTS_SHRINK_RATIO.to_string();
|
||||
let default_rocksdb_fifo_shred_storage_size =
|
||||
&DEFAULT_ROCKS_FIFO_SHRED_STORAGE_SIZE_BYTES.to_string();
|
||||
let default_tpu_connection_pool_size = &DEFAULT_TPU_CONNECTION_POOL_SIZE.to_string();
|
||||
|
||||
let matches = App::new(crate_name!()).about(crate_description!())
|
||||
.version(solana_version::version!())
|
||||
|
@ -1209,6 +1210,14 @@ pub fn main() {
|
|||
.takes_value(false)
|
||||
.help("Use QUIC to send transactions."),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("tpu_connection_pool_size")
|
||||
.long("tpu-connection-pool-size")
|
||||
.takes_value(true)
|
||||
.default_value(default_tpu_connection_pool_size)
|
||||
.validator(is_parsable::<usize>)
|
||||
.help("Controls the TPU connection pool size per remote addresss"),
|
||||
)
|
||||
.arg(
|
||||
Arg::with_name("rocksdb_max_compaction_jitter")
|
||||
.long("rocksdb-max-compaction-jitter-slots")
|
||||
|
@ -2214,6 +2223,7 @@ pub fn main() {
|
|||
let accounts_shrink_optimize_total_space =
|
||||
value_t_or_exit!(matches, "accounts_shrink_optimize_total_space", bool);
|
||||
let tpu_use_quic = matches.is_present("tpu_use_quic");
|
||||
let tpu_connection_pool_size = value_t_or_exit!(matches, "tpu_connection_pool_size", usize);
|
||||
|
||||
let shrink_ratio = value_t_or_exit!(matches, "accounts_shrink_ratio", f64);
|
||||
if !(0.0..=1.0).contains(&shrink_ratio) {
|
||||
|
@ -2973,6 +2983,7 @@ pub fn main() {
|
|||
start_progress,
|
||||
socket_addr_space,
|
||||
tpu_use_quic,
|
||||
tpu_connection_pool_size,
|
||||
);
|
||||
*admin_service_post_init.write().unwrap() =
|
||||
Some(admin_rpc_service::AdminRpcRequestMetadataPostInit {
|
||||
|
|
Loading…
Reference in New Issue