2019-11-26 23:04:05 -08:00
|
|
|
//! A peer set whose size is dynamically determined by resource constraints.
|
|
|
|
|
|
|
|
// Portions of this submodule were adapted from tower-balance,
|
|
|
|
// which is (c) 2019 Tower Contributors (MIT licensed).
|
|
|
|
|
2021-10-21 16:04:46 -07:00
|
|
|
use std::{collections::HashSet, net::SocketAddr, sync::Arc};
|
2019-11-26 23:04:05 -08:00
|
|
|
|
|
|
|
use futures::{
|
2020-08-11 13:07:44 -07:00
|
|
|
future::{self, FutureExt},
|
2019-11-26 23:04:05 -08:00
|
|
|
sink::SinkExt,
|
2021-12-08 18:54:29 -08:00
|
|
|
stream::{FuturesUnordered, StreamExt, TryStreamExt},
|
2021-04-13 00:46:17 -07:00
|
|
|
TryFutureExt,
|
2019-11-26 23:04:05 -08:00
|
|
|
};
|
2021-10-21 16:04:46 -07:00
|
|
|
use rand::seq::SliceRandom;
|
2021-10-27 16:46:43 -07:00
|
|
|
use tokio::{
|
|
|
|
net::TcpListener,
|
|
|
|
sync::broadcast,
|
|
|
|
time::{sleep, Instant},
|
|
|
|
};
|
2021-11-02 11:46:57 -07:00
|
|
|
use tokio_stream::wrappers::IntervalStream;
|
2019-11-26 23:04:05 -08:00
|
|
|
use tower::{
|
2021-12-08 18:54:29 -08:00
|
|
|
buffer::Buffer, discover::Change, layer::Layer, util::BoxService, Service, ServiceExt,
|
2019-11-26 23:04:05 -08:00
|
|
|
};
|
2021-02-19 14:36:50 -08:00
|
|
|
use tracing::Span;
|
|
|
|
use tracing_futures::Instrument;
|
2019-11-26 23:04:05 -08:00
|
|
|
|
2021-08-26 18:34:33 -07:00
|
|
|
use zebra_chain::{chain_tip::ChainTip, parameters::Network};
|
|
|
|
|
2021-10-21 14:36:42 -07:00
|
|
|
use crate::{
|
2021-11-04 04:34:00 -07:00
|
|
|
address_book_updater::AddressBookUpdater,
|
2021-10-21 14:36:42 -07:00
|
|
|
constants,
|
2021-11-04 04:34:00 -07:00
|
|
|
meta_addr::{MetaAddr, MetaAddrChange},
|
2021-12-08 18:54:29 -08:00
|
|
|
peer::{self, HandshakeRequest, MinimumPeerVersion, OutboundConnectorRequest},
|
2021-10-21 18:26:04 -07:00
|
|
|
peer_set::{set::MorePeers, ActiveConnectionCounter, CandidateSet, ConnectionTracker, PeerSet},
|
2021-10-21 14:36:42 -07:00
|
|
|
AddressBook, BoxError, Config, Request, Response,
|
|
|
|
};
|
2019-11-26 23:04:05 -08:00
|
|
|
|
2021-06-22 14:59:06 -07:00
|
|
|
#[cfg(test)]
|
|
|
|
mod tests;
|
|
|
|
|
2021-10-21 14:36:42 -07:00
|
|
|
/// The result of an outbound peer connection attempt or inbound connection handshake.
///
/// This result comes from the [`Handshaker`].
///
/// On success, contains the connected peer's address and the [`peer::Client`]
/// for that connection; on failure, the boxed error from the attempt.
type DiscoveredPeer = Result<(SocketAddr, peer::Client), BoxError>;
|
2019-11-26 23:04:05 -08:00
|
|
|
|
2021-08-26 18:34:33 -07:00
|
|
|
/// Initialize a peer set, using a network `config`, `inbound_service`,
|
2021-09-01 15:31:16 -07:00
|
|
|
/// and `latest_chain_tip`.
|
2020-09-18 12:37:01 -07:00
|
|
|
///
|
|
|
|
/// The peer set abstracts away peer management to provide a
|
|
|
|
/// [`tower::Service`] representing "the network" that load-balances requests
|
|
|
|
/// over available peers. The peer set automatically crawls the network to
|
|
|
|
/// find more peer addresses and opportunistically connects to new peers.
|
|
|
|
///
|
|
|
|
/// Each peer connection's message handling is isolated from other
|
|
|
|
/// connections, unlike in `zcashd`. The peer connection first attempts to
|
|
|
|
/// interpret inbound messages as part of a response to a previously-issued
|
|
|
|
/// request. Otherwise, inbound messages are interpreted as requests and sent
|
|
|
|
/// to the supplied `inbound_service`.
|
|
|
|
///
|
|
|
|
/// Wrapping the `inbound_service` in [`tower::load_shed`] middleware will
|
|
|
|
/// cause the peer set to shrink when the inbound service is unable to keep up
|
|
|
|
/// with the volume of inbound requests.
|
|
|
|
///
|
2021-08-26 18:34:33 -07:00
|
|
|
/// Use [`NoChainTip`] to explicitly provide no chain tip receiver.
|
|
|
|
///
|
2020-09-18 12:37:01 -07:00
|
|
|
/// In addition to returning a service for outbound requests, this method
|
|
|
|
/// returns a shared [`AddressBook`] updated with last-seen timestamps for
|
2021-12-19 16:44:43 -08:00
|
|
|
/// connected peers. The shared address book should be accessed using a
|
|
|
|
/// [blocking thread](https://docs.rs/tokio/1.15.0/tokio/task/index.html#blocking-and-yielding),
|
|
|
|
/// to avoid async task deadlocks.
|
2021-10-27 14:28:51 -07:00
|
|
|
///
|
|
|
|
/// # Panics
|
|
|
|
///
|
|
|
|
/// If `config.config.peerset_initial_target_size` is zero.
|
|
|
|
/// (zebra-network expects to be able to connect to at least one peer.)
|
2021-08-26 18:34:33 -07:00
|
|
|
pub async fn init<S, C>(
|
2019-11-26 23:04:05 -08:00
|
|
|
config: Config,
|
|
|
|
inbound_service: S,
|
2021-09-01 15:31:16 -07:00
|
|
|
latest_chain_tip: C,
|
2019-11-26 23:04:05 -08:00
|
|
|
) -> (
|
2020-09-18 11:20:55 -07:00
|
|
|
Buffer<BoxService<Request, Response, BoxError>, Request>,
|
2021-04-18 23:04:24 -07:00
|
|
|
Arc<std::sync::Mutex<AddressBook>>,
|
2019-11-26 23:04:05 -08:00
|
|
|
)
|
|
|
|
where
|
2020-09-18 11:20:55 -07:00
|
|
|
S: Service<Request, Response = Response, Error = BoxError> + Clone + Send + 'static,
|
2019-11-26 23:04:05 -08:00
|
|
|
S::Future: Send + 'static,
|
2021-08-26 18:34:33 -07:00
|
|
|
C: ChainTip + Clone + Send + 'static,
|
2019-11-26 23:04:05 -08:00
|
|
|
{
|
2021-10-27 14:28:51 -07:00
|
|
|
// If we want Zebra to operate with no network,
|
|
|
|
// we should implement a `zebrad` command that doesn't use `zebra-network`.
|
|
|
|
assert!(
|
|
|
|
config.peerset_initial_target_size > 0,
|
|
|
|
"Zebra must be allowed to connect to at least one peer"
|
|
|
|
);
|
|
|
|
|
2021-06-22 14:59:06 -07:00
|
|
|
let (tcp_listener, listen_addr) = open_listener(&config.clone()).await;
|
|
|
|
|
2021-12-19 16:44:43 -08:00
|
|
|
let (address_book, address_book_updater, address_metrics, address_book_updater_guard) =
|
2021-11-18 04:34:51 -08:00
|
|
|
AddressBookUpdater::spawn(&config, listen_addr);
|
2021-10-27 14:28:51 -07:00
|
|
|
|
|
|
|
// Create a broadcast channel for peer inventory advertisements.
|
|
|
|
// If it reaches capacity, this channel drops older inventory advertisements.
|
|
|
|
//
|
|
|
|
// When Zebra is at the chain tip with an up-to-date mempool,
|
|
|
|
// we expect to have at most 1 new transaction per connected peer,
|
|
|
|
// and 1-2 new blocks across the entire network.
|
|
|
|
// (The block syncer and mempool crawler handle bulk fetches of blocks and transactions.)
|
|
|
|
let (inv_sender, inv_receiver) = broadcast::channel(config.peerset_total_connection_limit());
|
2019-11-26 23:04:05 -08:00
|
|
|
|
|
|
|
// Construct services that handle inbound handshakes and perform outbound
|
|
|
|
// handshakes. These use the same handshake service internally to detect
|
|
|
|
// self-connection attempts. Both are decorated with a tower TimeoutLayer to
|
|
|
|
// enforce timeouts as specified in the Config.
|
2021-05-06 17:50:04 -07:00
|
|
|
let (listen_handshaker, outbound_connector) = {
|
2019-11-26 23:04:05 -08:00
|
|
|
use tower::timeout::TimeoutLayer;
|
2020-08-06 11:29:00 -07:00
|
|
|
let hs_timeout = TimeoutLayer::new(constants::HANDSHAKE_TIMEOUT);
|
2020-08-31 21:32:35 -07:00
|
|
|
use crate::protocol::external::types::PeerServices;
|
|
|
|
let hs = peer::Handshake::builder()
|
|
|
|
.with_config(config.clone())
|
|
|
|
.with_inbound_service(inbound_service)
|
2020-09-01 14:28:54 -07:00
|
|
|
.with_inventory_collector(inv_sender)
|
2021-11-04 04:34:00 -07:00
|
|
|
.with_address_book_updater(address_book_updater.clone())
|
2020-08-31 21:32:35 -07:00
|
|
|
.with_advertised_services(PeerServices::NODE_NETWORK)
|
|
|
|
.with_user_agent(crate::constants::USER_AGENT.to_string())
|
2021-12-08 18:54:29 -08:00
|
|
|
.with_latest_chain_tip(latest_chain_tip.clone())
|
2020-09-01 10:55:55 -07:00
|
|
|
.want_transactions(true)
|
2020-08-31 21:32:35 -07:00
|
|
|
.finish()
|
|
|
|
.expect("configured all required parameters");
|
2019-11-26 23:04:05 -08:00
|
|
|
(
|
|
|
|
hs_timeout.layer(hs.clone()),
|
2019-11-27 11:43:59 -08:00
|
|
|
hs_timeout.layer(peer::Connector::new(hs)),
|
2019-11-26 23:04:05 -08:00
|
|
|
)
|
|
|
|
};
|
|
|
|
|
2021-10-27 14:28:51 -07:00
|
|
|
// Create an mpsc channel for peer changes,
|
|
|
|
// based on the maximum number of inbound and outbound peers.
|
|
|
|
let (peerset_tx, peerset_rx) =
|
2021-12-19 16:44:43 -08:00
|
|
|
futures::channel::mpsc::channel::<DiscoveredPeer>(config.peerset_total_connection_limit());
|
2021-12-08 18:54:29 -08:00
|
|
|
|
|
|
|
let discovered_peers = peerset_rx
|
|
|
|
// Discover interprets an error as stream termination,
|
|
|
|
// so discard any errored connections...
|
|
|
|
.filter(|result| future::ready(result.is_ok()))
|
|
|
|
.map_ok(|(address, client)| Change::Insert(address, client.into()));
|
|
|
|
|
2021-10-27 14:28:51 -07:00
|
|
|
// Create an mpsc channel for peerset demand signaling,
|
|
|
|
// based on the maximum number of outbound peers.
|
|
|
|
let (mut demand_tx, demand_rx) =
|
2021-12-19 16:44:43 -08:00
|
|
|
futures::channel::mpsc::channel::<MorePeers>(config.peerset_outbound_connection_limit());
|
2021-10-21 18:26:04 -07:00
|
|
|
|
|
|
|
// Create a oneshot to send background task JoinHandles to the peer set
|
2020-06-09 12:24:28 -07:00
|
|
|
let (handle_tx, handle_rx) = tokio::sync::oneshot::channel();
|
2019-11-26 23:04:05 -08:00
|
|
|
|
|
|
|
// Connect the rx end to a PeerSet, wrapping new peers in load instruments.
|
2020-06-09 12:24:28 -07:00
|
|
|
let peer_set = PeerSet::new(
|
2021-10-27 18:49:31 -07:00
|
|
|
&config,
|
2021-12-08 18:54:29 -08:00
|
|
|
discovered_peers,
|
2020-06-09 12:24:28 -07:00
|
|
|
demand_tx.clone(),
|
|
|
|
handle_rx,
|
2020-09-01 14:28:54 -07:00
|
|
|
inv_receiver,
|
2021-12-19 16:44:43 -08:00
|
|
|
address_metrics,
|
2021-12-08 18:54:29 -08:00
|
|
|
MinimumPeerVersion::new(latest_chain_tip, config.network),
|
2019-11-26 23:04:05 -08:00
|
|
|
);
|
2020-08-11 13:07:44 -07:00
|
|
|
let peer_set = Buffer::new(BoxService::new(peer_set), constants::PEERSET_BUFFER_SIZE);
|
2019-11-26 23:04:05 -08:00
|
|
|
|
2021-10-21 14:36:42 -07:00
|
|
|
// Connect peerset_tx to the 3 peer sources:
|
|
|
|
//
|
2021-01-29 04:36:33 -08:00
|
|
|
// 1. Incoming peer connections, via a listener.
|
2021-10-27 18:49:31 -07:00
|
|
|
let listen_fut = accept_inbound_connections(
|
|
|
|
config.clone(),
|
|
|
|
tcp_listener,
|
|
|
|
listen_handshaker,
|
|
|
|
peerset_tx.clone(),
|
2021-02-19 14:36:50 -08:00
|
|
|
);
|
2021-10-27 18:49:31 -07:00
|
|
|
let listen_guard = tokio::spawn(listen_fut.instrument(Span::current()));
|
2019-11-26 23:04:05 -08:00
|
|
|
|
2021-02-02 18:20:26 -08:00
|
|
|
// 2. Initial peers, specified in the config.
|
2021-10-27 18:49:31 -07:00
|
|
|
let initial_peers_fut = add_initial_peers(
|
|
|
|
config.clone(),
|
|
|
|
outbound_connector.clone(),
|
|
|
|
peerset_tx.clone(),
|
2021-11-04 04:34:00 -07:00
|
|
|
address_book_updater,
|
2021-10-27 18:49:31 -07:00
|
|
|
);
|
2021-10-21 14:36:42 -07:00
|
|
|
let initial_peers_join = tokio::spawn(initial_peers_fut.instrument(Span::current()));
|
2021-01-29 04:36:33 -08:00
|
|
|
|
2019-11-26 23:04:05 -08:00
|
|
|
// 3. Outgoing peers we connect to in response to load.
|
|
|
|
let mut candidates = CandidateSet::new(address_book.clone(), peer_set.clone());
|
|
|
|
|
2021-10-21 14:36:42 -07:00
|
|
|
// Wait for the initial seed peer count
|
|
|
|
let mut active_outbound_connections = initial_peers_join
|
|
|
|
.await
|
|
|
|
.expect("unexpected panic in spawned initial peers task")
|
|
|
|
.expect("unexpected error connecting to initial peers");
|
|
|
|
let active_initial_peer_count = active_outbound_connections.update_count();
|
|
|
|
|
2021-11-18 04:34:51 -08:00
|
|
|
// We need to await candidates.update() here,
|
|
|
|
// because zcashd rate-limits `addr`/`addrv2` messages per connection,
|
|
|
|
// and if we only have one initial peer,
|
|
|
|
// we need to ensure that its `Response::Addr` is used by the crawler.
|
|
|
|
//
|
|
|
|
// TODO: cache the most recent `Response::Addr` returned by each peer.
|
|
|
|
// If the request times out, return the cached response to the caller.
|
2019-11-26 23:04:05 -08:00
|
|
|
|
2021-10-21 14:36:42 -07:00
|
|
|
info!(
|
|
|
|
?active_initial_peer_count,
|
|
|
|
"sending initial request for peers"
|
|
|
|
);
|
|
|
|
let _ = candidates.update_initial(active_initial_peer_count).await;
|
2020-02-09 20:34:53 -08:00
|
|
|
|
2021-10-29 09:29:52 -07:00
|
|
|
// Compute remaining connections to open.
|
|
|
|
let demand_count = config
|
|
|
|
.peerset_initial_target_size
|
|
|
|
.saturating_sub(active_outbound_connections.update_count());
|
|
|
|
|
|
|
|
for _ in 0..demand_count {
|
2021-10-21 18:26:04 -07:00
|
|
|
let _ = demand_tx.try_send(MorePeers);
|
2020-02-09 20:34:53 -08:00
|
|
|
}
|
|
|
|
|
2021-10-27 18:49:31 -07:00
|
|
|
let crawl_fut = crawl_and_dial(
|
|
|
|
config,
|
|
|
|
demand_tx,
|
|
|
|
demand_rx,
|
|
|
|
candidates,
|
|
|
|
outbound_connector,
|
|
|
|
peerset_tx,
|
|
|
|
active_outbound_connections,
|
|
|
|
);
|
2021-10-27 14:28:51 -07:00
|
|
|
let crawl_guard = tokio::spawn(crawl_fut.instrument(Span::current()));
|
2020-06-09 12:24:28 -07:00
|
|
|
|
2021-11-18 04:34:51 -08:00
|
|
|
handle_tx
|
|
|
|
.send(vec![listen_guard, crawl_guard, address_book_updater_guard])
|
|
|
|
.unwrap();
|
2019-11-26 23:04:05 -08:00
|
|
|
|
|
|
|
(peer_set, address_book)
|
|
|
|
}
|
|
|
|
|
2021-10-21 16:04:46 -07:00
|
|
|
/// Use the provided `outbound_connector` to connect to the configured initial peers,
|
|
|
|
/// then send the resulting peer connections over `peerset_tx`.
|
2021-11-07 14:21:51 -08:00
|
|
|
///
|
2022-01-05 15:12:59 -08:00
|
|
|
/// Also sends every initial peer address to the `address_book_updater`.
|
2021-11-07 14:21:51 -08:00
|
|
|
#[instrument(skip(config, outbound_connector, peerset_tx, address_book_updater))]
|
2019-11-26 23:04:05 -08:00
|
|
|
async fn add_initial_peers<S>(
|
2021-10-25 13:16:35 -07:00
|
|
|
config: Config,
|
2021-05-06 17:50:04 -07:00
|
|
|
outbound_connector: S,
|
2021-12-19 16:44:43 -08:00
|
|
|
mut peerset_tx: futures::channel::mpsc::Sender<DiscoveredPeer>,
|
|
|
|
address_book_updater: tokio::sync::mpsc::Sender<MetaAddrChange>,
|
2021-10-21 14:36:42 -07:00
|
|
|
) -> Result<ActiveConnectionCounter, BoxError>
|
2020-06-09 12:24:28 -07:00
|
|
|
where
|
2021-12-08 18:54:29 -08:00
|
|
|
S: Service<OutboundConnectorRequest, Response = (SocketAddr, peer::Client), Error = BoxError>
|
2021-12-09 17:18:43 -08:00
|
|
|
+ Clone
|
|
|
|
+ Send
|
|
|
|
+ 'static,
|
2019-11-26 23:04:05 -08:00
|
|
|
S::Future: Send + 'static,
|
|
|
|
{
|
2021-11-04 04:34:00 -07:00
|
|
|
let initial_peers = limit_initial_peers(&config, address_book_updater).await;
|
2021-10-21 16:04:46 -07:00
|
|
|
|
2021-10-18 11:43:12 -07:00
|
|
|
let mut handshake_success_total: usize = 0;
|
|
|
|
let mut handshake_error_total: usize = 0;
|
|
|
|
|
2021-10-21 14:36:42 -07:00
|
|
|
let mut active_outbound_connections = ActiveConnectionCounter::new_counter();
|
|
|
|
|
2021-10-18 11:43:12 -07:00
|
|
|
info!(
|
2021-10-21 16:04:46 -07:00
|
|
|
initial_peer_count = ?initial_peers.len(),
|
2021-10-18 11:43:12 -07:00
|
|
|
?initial_peers,
|
|
|
|
"connecting to initial peer set"
|
|
|
|
);
|
|
|
|
|
2021-06-21 19:16:59 -07:00
|
|
|
// # Security
|
2021-02-14 17:43:49 -08:00
|
|
|
//
|
2021-10-27 16:46:43 -07:00
|
|
|
// Resists distributed denial of service attacks by making sure that
|
|
|
|
// new peer connections are initiated at least
|
|
|
|
// [`MIN_PEER_CONNECTION_INTERVAL`][constants::MIN_PEER_CONNECTION_INTERVAL]
|
|
|
|
// apart.
|
2021-06-21 19:16:59 -07:00
|
|
|
//
|
|
|
|
// # Correctness
|
|
|
|
//
|
|
|
|
// Each `FuturesUnordered` can hold one `Buffer` or `Batch` reservation for
|
|
|
|
// an indefinite period. We can use `FuturesUnordered` without filling
|
|
|
|
// the underlying network buffers, because we immediately drive this
|
|
|
|
// single `FuturesUnordered` to completion, and handshakes have a short timeout.
|
2021-05-17 13:49:16 -07:00
|
|
|
let mut handshakes: FuturesUnordered<_> = initial_peers
|
|
|
|
.into_iter()
|
2021-10-27 16:46:43 -07:00
|
|
|
.enumerate()
|
|
|
|
.map(|(i, addr)| {
|
2021-10-21 14:36:42 -07:00
|
|
|
let connection_tracker = active_outbound_connections.track_connection();
|
|
|
|
let req = OutboundConnectorRequest {
|
|
|
|
addr,
|
|
|
|
connection_tracker,
|
|
|
|
};
|
|
|
|
|
2021-12-09 17:18:43 -08:00
|
|
|
// Construct a connector future but do not drive it yet ...
|
|
|
|
let outbound_connector_future = outbound_connector
|
|
|
|
.clone()
|
|
|
|
.oneshot(req)
|
|
|
|
.map_err(move |e| (addr, e));
|
|
|
|
|
|
|
|
// ... instead, spawn a new task to handle this connector
|
|
|
|
tokio::spawn(async move {
|
|
|
|
let task = outbound_connector_future.await;
|
|
|
|
// Only spawn one outbound connector per `MIN_PEER_CONNECTION_INTERVAL`,
|
|
|
|
// sleeping for an interval according to its index in the list.
|
2021-10-27 16:46:43 -07:00
|
|
|
sleep(constants::MIN_PEER_CONNECTION_INTERVAL.saturating_mul(i as u32)).await;
|
2021-12-09 17:18:43 -08:00
|
|
|
task
|
|
|
|
})
|
2021-05-17 13:49:16 -07:00
|
|
|
})
|
|
|
|
.collect();
|
2020-06-09 12:24:28 -07:00
|
|
|
|
2019-11-26 23:04:05 -08:00
|
|
|
while let Some(handshake_result) = handshakes.next().await {
|
2021-12-09 17:18:43 -08:00
|
|
|
let handshake_result =
|
|
|
|
handshake_result.expect("unexpected panic in initial peer handshake");
|
2021-10-18 11:43:12 -07:00
|
|
|
match handshake_result {
|
|
|
|
Ok(ref change) => {
|
|
|
|
handshake_success_total += 1;
|
|
|
|
debug!(
|
|
|
|
?handshake_success_total,
|
|
|
|
?handshake_error_total,
|
|
|
|
?change,
|
|
|
|
"an initial peer handshake succeeded"
|
|
|
|
);
|
|
|
|
}
|
|
|
|
Err((addr, ref e)) => {
|
|
|
|
handshake_error_total += 1;
|
2021-10-21 19:11:09 -07:00
|
|
|
|
2021-10-21 14:36:42 -07:00
|
|
|
// this is verbose, but it's better than just hanging with no output when there are errors
|
2021-10-21 19:11:09 -07:00
|
|
|
let mut expected_error = false;
|
|
|
|
if let Some(io_error) = e.downcast_ref::<tokio::io::Error>() {
|
|
|
|
// Some systems only have IPv4, or only have IPv6,
|
|
|
|
// so these errors are not particularly interesting.
|
|
|
|
if io_error.kind() == tokio::io::ErrorKind::AddrNotAvailable {
|
|
|
|
expected_error = true;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if expected_error {
|
|
|
|
debug!(
|
|
|
|
successes = ?handshake_success_total,
|
|
|
|
errors = ?handshake_error_total,
|
|
|
|
?addr,
|
|
|
|
?e,
|
|
|
|
"an initial peer connection failed"
|
|
|
|
);
|
|
|
|
} else {
|
|
|
|
info!(
|
|
|
|
successes = ?handshake_success_total,
|
|
|
|
errors = ?handshake_error_total,
|
|
|
|
?addr,
|
|
|
|
%e,
|
|
|
|
"an initial peer connection failed"
|
|
|
|
);
|
|
|
|
}
|
2021-10-18 11:43:12 -07:00
|
|
|
}
|
2021-02-16 17:01:36 -08:00
|
|
|
}
|
2021-10-18 11:43:12 -07:00
|
|
|
|
2021-10-21 14:36:42 -07:00
|
|
|
peerset_tx
|
|
|
|
.send(handshake_result.map_err(|(_addr, e)| e))
|
|
|
|
.await?;
|
2021-10-27 14:28:51 -07:00
|
|
|
|
|
|
|
// Security: Let other tasks run after each connection is processed.
|
|
|
|
//
|
|
|
|
// Avoids remote peers starving other Zebra tasks using initial connection successes or errors.
|
|
|
|
tokio::task::yield_now().await;
|
2019-11-26 23:04:05 -08:00
|
|
|
}
|
2020-06-09 12:24:28 -07:00
|
|
|
|
2021-10-21 14:36:42 -07:00
|
|
|
let outbound_connections = active_outbound_connections.update_count();
|
2021-10-18 11:43:12 -07:00
|
|
|
info!(
|
|
|
|
?handshake_success_total,
|
|
|
|
?handshake_error_total,
|
2021-10-21 14:36:42 -07:00
|
|
|
?outbound_connections,
|
2021-10-18 11:43:12 -07:00
|
|
|
"finished connecting to initial seed peers"
|
|
|
|
);
|
|
|
|
|
2021-10-21 14:36:42 -07:00
|
|
|
Ok(active_outbound_connections)
|
2019-11-26 23:04:05 -08:00
|
|
|
}
|
|
|
|
|
2021-10-21 16:04:46 -07:00
|
|
|
/// Limit the number of `initial_peers` addresses entries to the configured
|
|
|
|
/// `peerset_initial_target_size`.
|
|
|
|
///
|
2022-01-05 15:12:59 -08:00
|
|
|
/// Returns randomly chosen entries from the provided set of addresses,
|
|
|
|
/// in a random order.
|
2021-11-07 14:21:51 -08:00
|
|
|
///
|
2022-01-05 15:12:59 -08:00
|
|
|
/// Also sends every initial peer to the `address_book_updater`.
|
2021-11-04 04:34:00 -07:00
|
|
|
async fn limit_initial_peers(
|
|
|
|
config: &Config,
|
2021-12-19 16:44:43 -08:00
|
|
|
address_book_updater: tokio::sync::mpsc::Sender<MetaAddrChange>,
|
2021-11-04 04:34:00 -07:00
|
|
|
) -> HashSet<SocketAddr> {
|
|
|
|
let all_peers = config.initial_peers().await;
|
|
|
|
let peers_count = all_peers.len();
|
|
|
|
|
2022-01-05 15:12:59 -08:00
|
|
|
// # Correctness
|
|
|
|
//
|
|
|
|
// We can't exit early if we only have a few peers,
|
|
|
|
// because we still need to shuffle the connection order.
|
|
|
|
|
|
|
|
if all_peers.len() > config.peerset_initial_target_size {
|
|
|
|
info!(
|
|
|
|
"limiting the initial peers list from {} to {}",
|
|
|
|
peers_count, config.peerset_initial_target_size
|
|
|
|
);
|
2021-11-04 04:34:00 -07:00
|
|
|
}
|
2021-10-21 16:04:46 -07:00
|
|
|
|
2022-01-05 15:12:59 -08:00
|
|
|
// Split out the `initial_peers` that will be shuffled and returned.
|
|
|
|
let mut initial_peers: Vec<SocketAddr> = all_peers.iter().cloned().collect();
|
|
|
|
let (initial_peers, _unused_peers) =
|
|
|
|
initial_peers.partial_shuffle(&mut rand::thread_rng(), config.peerset_initial_target_size);
|
2021-11-04 04:34:00 -07:00
|
|
|
|
2022-01-05 15:12:59 -08:00
|
|
|
// Send every initial peer to the address book.
|
|
|
|
// (This treats initial peers the same way we treat gossiped peers.)
|
|
|
|
for peer in all_peers {
|
|
|
|
let peer_addr = MetaAddr::new_initial_peer(peer);
|
2021-11-04 04:34:00 -07:00
|
|
|
// `send` only waits when the channel is full.
|
2022-01-05 15:12:59 -08:00
|
|
|
// The address book updater runs in its own thread, so we will only wait for a short time.
|
2021-11-04 04:34:00 -07:00
|
|
|
let _ = address_book_updater.send(peer_addr).await;
|
|
|
|
}
|
2021-10-21 16:04:46 -07:00
|
|
|
|
2021-11-04 04:34:00 -07:00
|
|
|
initial_peers.iter().copied().collect()
|
2021-10-21 16:04:46 -07:00
|
|
|
}
|
|
|
|
|
2021-06-22 14:59:06 -07:00
|
|
|
/// Open a peer connection listener on `config.listen_addr`,
|
|
|
|
/// returning the opened [`TcpListener`], and the address it is bound to.
|
2021-05-06 17:50:04 -07:00
|
|
|
///
|
2021-06-22 14:59:06 -07:00
|
|
|
/// If the listener is configured to use an automatically chosen port (port `0`),
|
|
|
|
/// then the returned address will contain the actual port.
|
|
|
|
///
|
|
|
|
/// # Panics
|
|
|
|
///
|
|
|
|
/// If opening the listener fails.
|
|
|
|
#[instrument(skip(config), fields(addr = ?config.listen_addr))]
|
2021-11-04 04:34:00 -07:00
|
|
|
pub(crate) async fn open_listener(config: &Config) -> (TcpListener, SocketAddr) {
|
2021-06-22 14:59:06 -07:00
|
|
|
// Warn if we're configured using the wrong network port.
|
|
|
|
use Network::*;
|
|
|
|
let wrong_net = match config.network {
|
|
|
|
Mainnet => Testnet,
|
|
|
|
Testnet => Mainnet,
|
|
|
|
};
|
|
|
|
if config.listen_addr.port() == wrong_net.default_port() {
|
|
|
|
warn!(
|
2021-10-11 18:13:13 -07:00
|
|
|
"We are configured with port {} for {:?}, but that port is the default port for {:?}. The default port for {:?} is {}.",
|
2021-06-22 14:59:06 -07:00
|
|
|
config.listen_addr.port(),
|
|
|
|
config.network,
|
2021-10-11 18:13:13 -07:00
|
|
|
wrong_net,
|
|
|
|
config.network,
|
|
|
|
config.network.default_port(),
|
2021-06-22 14:59:06 -07:00
|
|
|
);
|
|
|
|
}
|
|
|
|
|
|
|
|
info!(
|
|
|
|
"Trying to open Zcash protocol endpoint at {}...",
|
|
|
|
config.listen_addr
|
|
|
|
);
|
|
|
|
let listener_result = TcpListener::bind(config.listen_addr).await;
|
2021-01-29 04:36:33 -08:00
|
|
|
|
|
|
|
let listener = match listener_result {
|
|
|
|
Ok(l) => l,
|
|
|
|
Err(e) => panic!(
|
|
|
|
"Opening Zcash network protocol listener {:?} failed: {:?}. \
|
|
|
|
Hint: Check if another zebrad or zcashd process is running. \
|
|
|
|
Try changing the network listen_addr in the Zebra config.",
|
2021-06-22 14:59:06 -07:00
|
|
|
config.listen_addr, e,
|
2021-01-29 04:36:33 -08:00
|
|
|
),
|
|
|
|
};
|
|
|
|
|
2021-06-22 14:59:06 -07:00
|
|
|
let local_addr = listener
|
|
|
|
.local_addr()
|
|
|
|
.expect("unexpected missing local addr for open listener");
|
2020-08-12 14:22:54 -07:00
|
|
|
info!("Opened Zcash protocol endpoint at {}", local_addr);
|
2021-06-22 14:59:06 -07:00
|
|
|
|
|
|
|
(listener, local_addr)
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Listens for peer connections on `addr`, then sets up each connection as a
/// Zcash peer.
///
/// Uses `handshaker` to perform a Zcash network protocol handshake, and sends
/// the [`peer::Client`] result over `peerset_tx`.
///
/// Limit the number of active inbound connections based on `config`.
///
/// This function loops forever: it only returns if `handshaker.ready()`
/// fails, or if an inbound handshake task panics.
#[instrument(skip(config, listener, handshaker, peerset_tx), fields(listener_addr = ?listener.local_addr()))]
async fn accept_inbound_connections<S>(
    config: Config,
    listener: TcpListener,
    mut handshaker: S,
    peerset_tx: futures::channel::mpsc::Sender<DiscoveredPeer>,
) -> Result<(), BoxError>
where
    S: Service<peer::HandshakeRequest, Response = peer::Client, Error = BoxError> + Clone,
    S::Future: Send + 'static,
{
    // Counts open inbound connections and pending inbound handshakes.
    let mut active_inbound_connections = ActiveConnectionCounter::new_counter();

    let mut handshakes = FuturesUnordered::new();
    // Keeping an unresolved future in the pool means the stream never terminates.
    handshakes.push(future::pending().boxed());

    loop {
        // Check for panics in finished tasks, before accepting new connections.
        // `biased` makes the select poll the completed-handshake arm first.
        let inbound_result = tokio::select! {
            biased;
            next_handshake_res = handshakes.next() => match next_handshake_res {
                // The task has already sent the peer change to the peer set.
                Some(Ok(_)) => continue,
                Some(Err(task_panic)) => panic!("panic in inbound handshake task: {:?}", task_panic),
                None => unreachable!("handshakes never terminates, because it contains a future that never resolves"),
            },

            inbound_result = listener.accept() => inbound_result,
        };

        if let Ok((tcp_stream, addr)) = inbound_result {
            if active_inbound_connections.update_count()
                >= config.peerset_inbound_connection_limit()
            {
                // Too many open inbound connections or pending handshakes already.
                // Close the connection.
                std::mem::drop(tcp_stream);
                continue;
            }

            // The peer already opened a connection to us.
            // So we want to increment the connection count as soon as possible.
            let connection_tracker = active_inbound_connections.track_connection();
            debug!(
                inbound_connections = ?active_inbound_connections.update_count(),
                "handshaking on an open inbound peer connection"
            );

            let connected_addr = peer::ConnectedAddr::new_inbound_direct(addr);
            let accept_span = info_span!("listen_accept", peer = ?connected_addr);
            let _guard = accept_span.enter();

            debug!("got incoming connection");
            // Wait for the handshake service before calling it; any readiness
            // error ends the listener loop via `?`.
            handshaker.ready().await?;
            // TODO: distinguish between proxied listeners and direct listeners
            let handshaker_span = info_span!("listen_handshaker", peer = ?connected_addr);

            // Construct a handshake future but do not drive it yet....
            let handshake = handshaker.call(HandshakeRequest {
                tcp_stream,
                connected_addr,
                connection_tracker,
            });
            // ... instead, spawn a new task to handle this connection
            {
                let mut peerset_tx = peerset_tx.clone();

                let handshake_task = tokio::spawn(
                    async move {
                        let handshake_result = handshake.await;

                        if let Ok(client) = handshake_result {
                            // Send errors just mean the peer set has shut down.
                            let _ = peerset_tx.send(Ok((addr, client))).await;
                        } else {
                            debug!(?handshake_result, "error handshaking with inbound peer");
                        }
                    }
                    .instrument(handshaker_span),
                );

                // Track the spawned task so panics are detected at the top of the loop.
                handshakes.push(Box::pin(handshake_task));
            }

            // Only spawn one inbound connection handshake per `MIN_PEER_CONNECTION_INTERVAL`.
            // But clear out failed connections as fast as possible.
            //
            // If there is a flood of connections,
            // this stops Zebra overloading the network with handshake data.
            //
            // Zebra can't control how many queued connections are waiting,
            // but most OSes also limit the number of queued inbound connections on a listener port.
            tokio::time::sleep(constants::MIN_PEER_CONNECTION_INTERVAL).await;
        } else {
            debug!(?inbound_result, "error accepting inbound connection");
        }

        // Security: Let other tasks run after each connection is processed.
        //
        // Avoids remote peers starving other Zebra tasks using inbound connection successes or errors.
        tokio::task::yield_now().await;
    }
}
|
|
|
|
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
/// An action that the peer crawler can take.
|
|
|
|
#[allow(dead_code)]
|
|
|
|
enum CrawlerAction {
|
|
|
|
/// Drop the demand signal because there are too many pending handshakes.
|
|
|
|
DemandDrop,
|
|
|
|
/// Initiate a handshake to `candidate` in response to demand.
|
|
|
|
DemandHandshake { candidate: MetaAddr },
|
|
|
|
/// Crawl existing peers for more peers in response to demand, because there
|
|
|
|
/// are no available candidates.
|
|
|
|
DemandCrawl,
|
|
|
|
/// Crawl existing peers for more peers in response to a timer `tick`.
|
|
|
|
TimerCrawl { tick: Instant },
|
|
|
|
/// Handle a successfully connected handshake `peer_set_change`.
|
|
|
|
HandshakeConnected {
|
2021-12-08 18:54:29 -08:00
|
|
|
address: SocketAddr,
|
|
|
|
client: peer::Client,
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
},
|
|
|
|
/// Handle a handshake failure to `failed_addr`.
|
|
|
|
HandshakeFailed { failed_addr: MetaAddr },
|
|
|
|
}
|
|
|
|
|
|
|
|
/// Given a channel `demand_rx` that signals a need for new peers, try to find
|
|
|
|
/// and connect to new peers, and send the resulting `peer::Client`s through the
|
2021-10-21 14:36:42 -07:00
|
|
|
/// `peerset_tx` channel.
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
///
|
2021-10-27 14:28:51 -07:00
|
|
|
/// Crawl for new peers every `config.crawl_new_peer_interval`.
|
|
|
|
/// Also crawl whenever there is demand, but no new peers in `candidates`.
|
|
|
|
/// After crawling, try to connect to one new peer using `outbound_connector`.
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
///
|
|
|
|
/// If a handshake fails, restore the unused demand signal by sending it to
|
|
|
|
/// `demand_tx`.
|
|
|
|
///
|
2021-10-21 14:36:42 -07:00
|
|
|
/// The crawler terminates when `candidates.update()` or `peerset_tx` returns a
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
/// permanent internal error. Transient errors and individual peer errors should
|
|
|
|
/// be handled within the crawler.
|
2021-10-21 14:36:42 -07:00
|
|
|
///
|
2021-10-27 18:49:31 -07:00
|
|
|
/// Uses `active_outbound_connections` to limit the number of active outbound connections
|
|
|
|
/// across both the initial peers and crawler. The limit is based on `config`.
|
2021-10-27 14:28:51 -07:00
|
|
|
#[instrument(skip(
|
|
|
|
config,
|
|
|
|
demand_tx,
|
|
|
|
demand_rx,
|
|
|
|
candidates,
|
|
|
|
outbound_connector,
|
|
|
|
peerset_tx,
|
|
|
|
active_outbound_connections,
|
|
|
|
))]
|
2019-11-26 23:04:05 -08:00
|
|
|
async fn crawl_and_dial<C, S>(
|
2021-10-27 14:28:51 -07:00
|
|
|
config: Config,
|
2021-12-19 16:44:43 -08:00
|
|
|
mut demand_tx: futures::channel::mpsc::Sender<MorePeers>,
|
|
|
|
mut demand_rx: futures::channel::mpsc::Receiver<MorePeers>,
|
2019-11-26 23:04:05 -08:00
|
|
|
mut candidates: CandidateSet<S>,
|
2021-05-06 17:50:04 -07:00
|
|
|
outbound_connector: C,
|
2021-12-19 16:44:43 -08:00
|
|
|
mut peerset_tx: futures::channel::mpsc::Sender<DiscoveredPeer>,
|
2021-10-21 14:36:42 -07:00
|
|
|
mut active_outbound_connections: ActiveConnectionCounter,
|
2020-09-18 11:20:55 -07:00
|
|
|
) -> Result<(), BoxError>
|
2019-11-26 23:04:05 -08:00
|
|
|
where
|
2021-12-08 18:54:29 -08:00
|
|
|
C: Service<OutboundConnectorRequest, Response = (SocketAddr, peer::Client), Error = BoxError>
|
|
|
|
+ Clone
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
+ Send
|
|
|
|
+ 'static,
|
2019-11-26 23:04:05 -08:00
|
|
|
C::Future: Send + 'static,
|
2020-09-18 11:20:55 -07:00
|
|
|
S: Service<Request, Response = Response, Error = BoxError>,
|
2019-11-26 23:04:05 -08:00
|
|
|
S::Future: Send + 'static,
|
|
|
|
{
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
use CrawlerAction::*;
|
|
|
|
|
|
|
|
// CORRECTNESS
|
|
|
|
//
|
|
|
|
// To avoid hangs and starvation, the crawler must:
|
|
|
|
// - spawn a separate task for each crawl and handshake, so they can make
|
|
|
|
// progress independently (and avoid deadlocking each other)
|
|
|
|
// - use the `select!` macro for all actions, because the `select` function
|
|
|
|
// is biased towards the first ready future
|
2019-11-26 23:04:05 -08:00
|
|
|
|
2022-01-04 15:43:30 -08:00
|
|
|
info!(
|
|
|
|
crawl_new_peer_interval = ?config.crawl_new_peer_interval,
|
|
|
|
outbound_connections = ?active_outbound_connections.update_count(),
|
|
|
|
"starting the peer address crawler",
|
|
|
|
);
|
|
|
|
|
2019-11-26 23:04:05 -08:00
|
|
|
let mut handshakes = FuturesUnordered::new();
|
2020-02-09 20:34:53 -08:00
|
|
|
// <FuturesUnordered as Stream> returns None when empty.
|
|
|
|
// Keeping an unresolved future in the pool means the stream
|
|
|
|
// never terminates.
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
// We could use StreamExt::select_next_some and StreamExt::fuse, but `fuse`
|
|
|
|
// prevents us from adding items to the stream and checking its length.
|
2020-02-09 20:34:53 -08:00
|
|
|
handshakes.push(future::pending().boxed());
|
2019-11-26 23:04:05 -08:00
|
|
|
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
let mut crawl_timer =
|
2021-11-02 11:46:57 -07:00
|
|
|
IntervalStream::new(tokio::time::interval(config.crawl_new_peer_interval))
|
|
|
|
.map(|tick| TimerCrawl { tick });
|
2019-11-26 23:04:05 -08:00
|
|
|
|
2020-02-09 20:34:53 -08:00
|
|
|
loop {
|
2021-01-11 18:28:56 -08:00
|
|
|
metrics::gauge!(
|
|
|
|
"crawler.in_flight_handshakes",
|
|
|
|
handshakes
|
|
|
|
.len()
|
|
|
|
.checked_sub(1)
|
|
|
|
.expect("the pool always contains an unresolved future") as f64
|
|
|
|
);
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
|
|
|
|
let crawler_action = tokio::select! {
|
2021-04-13 17:16:47 -07:00
|
|
|
next_handshake_res = handshakes.next() => next_handshake_res.expect(
|
|
|
|
"handshakes never terminates, because it contains a future that never resolves"
|
|
|
|
),
|
|
|
|
next_timer = crawl_timer.next() => next_timer.expect("timers never terminate"),
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
// turn the demand into an action, based on the crawler's current state
|
|
|
|
_ = demand_rx.next() => {
|
2021-10-27 14:28:51 -07:00
|
|
|
if active_outbound_connections.update_count() >= config.peerset_outbound_connection_limit() {
|
2021-10-27 18:49:31 -07:00
|
|
|
// Too many open outbound connections or pending handshakes already
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
DemandDrop
|
|
|
|
} else if let Some(candidate) = candidates.next().await {
|
|
|
|
// candidates.next has a short delay, and briefly holds the address
|
|
|
|
// book lock, so it shouldn't hang
|
|
|
|
DemandHandshake { candidate }
|
2020-02-09 20:34:53 -08:00
|
|
|
} else {
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
DemandCrawl
|
2019-11-26 23:04:05 -08:00
|
|
|
}
|
|
|
|
}
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
};
|
|
|
|
|
|
|
|
match crawler_action {
|
|
|
|
DemandDrop => {
|
|
|
|
// This is set to trace level because when the peerset is
|
|
|
|
// congested it can generate a lot of demand signal very
|
|
|
|
// rapidly.
|
2021-10-27 14:28:51 -07:00
|
|
|
trace!("too many open connections or in-flight handshakes, dropping demand signal");
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
}
|
|
|
|
DemandHandshake { candidate } => {
|
2021-10-21 14:36:42 -07:00
|
|
|
// Increment the connection count before we spawn the connection.
|
|
|
|
let outbound_connection_tracker = active_outbound_connections.track_connection();
|
2021-10-27 14:28:51 -07:00
|
|
|
debug!(
|
2021-10-21 14:36:42 -07:00
|
|
|
outbound_connections = ?active_outbound_connections.update_count(),
|
|
|
|
"opening an outbound peer connection"
|
|
|
|
);
|
|
|
|
|
|
|
|
// Spawn each handshake into an independent task, so it can make
|
|
|
|
// progress independently of the crawls.
|
|
|
|
let hs_join = tokio::spawn(dial(
|
|
|
|
candidate,
|
|
|
|
outbound_connector.clone(),
|
|
|
|
outbound_connection_tracker,
|
|
|
|
))
|
|
|
|
.map(move |res| match res {
|
|
|
|
Ok(crawler_action) => crawler_action,
|
|
|
|
Err(e) => {
|
|
|
|
panic!("panic during handshaking with {:?}: {:?} ", candidate, e);
|
|
|
|
}
|
|
|
|
})
|
|
|
|
.instrument(Span::current());
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
handshakes.push(Box::pin(hs_join));
|
|
|
|
}
|
|
|
|
DemandCrawl => {
|
|
|
|
debug!("demand for peers but no available candidates");
|
|
|
|
// update has timeouts, and briefly holds the address book
|
|
|
|
// lock, so it shouldn't hang
|
|
|
|
//
|
|
|
|
// TODO: refactor candidates into a buffered service, so we can
|
|
|
|
// spawn independent tasks to avoid deadlocks
|
2021-12-09 16:19:52 -08:00
|
|
|
let more_peers = candidates.update().await?;
|
|
|
|
|
|
|
|
// If we got more peers, try to connect to a new peer.
|
|
|
|
//
|
|
|
|
// # Security
|
|
|
|
//
|
|
|
|
// Update attempts are rate-limited by the candidate set.
|
|
|
|
//
|
|
|
|
// We only try peers if there was actually an update.
|
|
|
|
// So if all peers have had a recent attempt,
|
|
|
|
// and there was recent update with no peers,
|
|
|
|
// the channel will drain.
|
|
|
|
// This prevents useless update attempt loops.
|
|
|
|
if let Some(more_peers) = more_peers {
|
|
|
|
let _ = demand_tx.try_send(more_peers);
|
|
|
|
}
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
}
|
|
|
|
TimerCrawl { tick } => {
|
|
|
|
debug!(
|
|
|
|
?tick,
|
|
|
|
"crawling for more peers in response to the crawl timer"
|
|
|
|
);
|
|
|
|
// TODO: spawn independent tasks to avoid deadlocks
|
2020-02-09 20:34:53 -08:00
|
|
|
candidates.update().await?;
|
|
|
|
// Try to connect to a new peer.
|
2021-10-21 18:26:04 -07:00
|
|
|
let _ = demand_tx.try_send(MorePeers);
|
2020-02-09 20:34:53 -08:00
|
|
|
}
|
2021-12-08 18:54:29 -08:00
|
|
|
HandshakeConnected { address, client } => {
|
|
|
|
debug!(candidate.addr = ?address, "successfully dialed new peer");
|
|
|
|
// successes are handled by an independent task, except for `candidates.update` in
|
|
|
|
// this task, which has a timeout, so they shouldn't hang
|
|
|
|
peerset_tx.send(Ok((address, client))).await?;
|
2020-02-09 20:34:53 -08:00
|
|
|
}
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
HandshakeFailed { failed_addr } => {
|
2021-10-21 14:36:42 -07:00
|
|
|
// The connection was never opened, or it failed the handshake and was dropped.
|
|
|
|
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
debug!(?failed_addr.addr, "marking candidate as failed");
|
2021-12-19 16:44:43 -08:00
|
|
|
candidates.report_failed(&failed_addr).await;
|
2020-02-09 20:34:53 -08:00
|
|
|
// The demand signal that was taken out of the queue
|
|
|
|
// to attempt to connect to the failed candidate never
|
|
|
|
// turned into a connection, so add it back:
|
2021-12-09 16:19:52 -08:00
|
|
|
//
|
|
|
|
// Security: handshake failures are rate-limited by peer attempt timeouts.
|
2021-10-21 18:26:04 -07:00
|
|
|
let _ = demand_tx.try_send(MorePeers);
|
2020-02-09 20:34:53 -08:00
|
|
|
}
|
2019-11-26 23:04:05 -08:00
|
|
|
}
|
2021-10-27 14:28:51 -07:00
|
|
|
|
|
|
|
// Security: Let other tasks run after each crawler action is processed.
|
|
|
|
//
|
|
|
|
// Avoids remote peers starving other Zebra tasks using outbound connection errors.
|
|
|
|
tokio::task::yield_now().await;
|
2019-11-26 23:04:05 -08:00
|
|
|
}
|
|
|
|
}
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
|
2021-05-06 17:50:04 -07:00
|
|
|
/// Try to connect to `candidate` using `outbound_connector`.
|
2021-10-21 14:36:42 -07:00
|
|
|
/// Uses `outbound_connection_tracker` to track the active connection count.
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
///
|
|
|
|
/// Returns a `HandshakeConnected` action on success, and a
|
|
|
|
/// `HandshakeFailed` action on error.
|
2021-10-21 14:36:42 -07:00
|
|
|
#[instrument(skip(outbound_connector, outbound_connection_tracker))]
|
|
|
|
async fn dial<C>(
|
|
|
|
candidate: MetaAddr,
|
|
|
|
mut outbound_connector: C,
|
|
|
|
outbound_connection_tracker: ConnectionTracker,
|
|
|
|
) -> CrawlerAction
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
where
|
2021-12-08 18:54:29 -08:00
|
|
|
C: Service<OutboundConnectorRequest, Response = (SocketAddr, peer::Client), Error = BoxError>
|
|
|
|
+ Clone
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
+ Send
|
|
|
|
+ 'static,
|
|
|
|
C::Future: Send + 'static,
|
|
|
|
{
|
|
|
|
// CORRECTNESS
|
|
|
|
//
|
|
|
|
// To avoid hangs, the dialer must only await:
|
|
|
|
// - functions that return immediately, or
|
|
|
|
// - functions that have a reasonable timeout
|
|
|
|
|
|
|
|
debug!(?candidate.addr, "attempting outbound connection in response to demand");
|
|
|
|
|
|
|
|
// the connector is always ready, so this can't hang
|
2021-05-06 17:50:04 -07:00
|
|
|
let outbound_connector = outbound_connector
|
2021-11-02 11:46:57 -07:00
|
|
|
.ready()
|
2021-05-06 17:50:04 -07:00
|
|
|
.await
|
|
|
|
.expect("outbound connector never errors");
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
|
2021-10-21 14:36:42 -07:00
|
|
|
let req = OutboundConnectorRequest {
|
|
|
|
addr: candidate.addr,
|
|
|
|
connection_tracker: outbound_connection_tracker,
|
|
|
|
};
|
|
|
|
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
// the handshake has timeouts, so it shouldn't hang
|
2021-05-06 17:50:04 -07:00
|
|
|
outbound_connector
|
2021-10-21 14:36:42 -07:00
|
|
|
.call(req)
|
2021-04-13 00:46:17 -07:00
|
|
|
.map_err(|e| (candidate, e))
|
|
|
|
.map(Into::into)
|
|
|
|
.await
|
|
|
|
}
|
|
|
|
|
2021-12-08 18:54:29 -08:00
|
|
|
impl From<Result<(SocketAddr, peer::Client), (MetaAddr, BoxError)>> for CrawlerAction {
|
|
|
|
fn from(dial_result: Result<(SocketAddr, peer::Client), (MetaAddr, BoxError)>) -> Self {
|
2021-04-13 00:46:17 -07:00
|
|
|
use CrawlerAction::*;
|
|
|
|
match dial_result {
|
2021-12-08 18:54:29 -08:00
|
|
|
Ok((address, client)) => HandshakeConnected { address, client },
|
2021-04-13 00:46:17 -07:00
|
|
|
Err((candidate, e)) => {
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
debug!(?candidate.addr, ?e, "failed to connect to candidate");
|
|
|
|
HandshakeFailed {
|
|
|
|
failed_addr: candidate,
|
|
|
|
}
|
|
|
|
}
|
2021-04-13 00:46:17 -07:00
|
|
|
}
|
|
|
|
}
|
Fix a deadlock between the crawler and dialer, and other hangs (#1950)
* Stop ignoring inbound message errors and handshake timeouts
To avoid hangs, Zebra needs to maintain the following invariants in the
handshake and heartbeat code:
- each handshake should run in a separate spawned task
(not yet implemented)
- every message, error, timeout, and shutdown must update the peer address state
- every await that depends on the network must have a timeout
Once the Connection is created, it should handle timeouts.
But we need to handle timeouts during handshake setup.
* Avoid hangs by adding a timeout to the candidate set update
Also increase the fanout from 1 to 2, to increase address diversity.
But only return permanent errors from `CandidateSet::update`, because
the crawler task exits if `update` returns an error.
Also log Peers response errors in the CandidateSet.
* Use the select macro in the crawler to reduce hangs
The `select` function is biased towards its first argument, risking
starvation.
As a side-benefit, this change also makes the code a lot easier to read
and maintain.
* Split CrawlerAction::Demand into separate actions
This refactor makes the code a bit easier to read, at the cost of
sometimes blocking the crawler on `candidates.next()`.
That's ok, because `next` only has a short (< 100 ms) delay. And we're
just about to spawn a separate task for each handshake.
* Spawn a separate task for each handshake
This change avoids deadlocks by letting each handshake make progress
independently.
* Move the dial task into a separate function
This refactor improves readability.
* Fix buggy future::select function usage
And document the correctness of the new code.
2021-04-07 06:25:10 -07:00
|
|
|
}
|