zebra/zebra-network/src/peer_set/candidate_set.rs

//! Candidate peer selection for outbound connections using the [`CandidateSet`].

use std::{cmp::min, sync::Arc};

use chrono::Utc;
use futures::stream::{FuturesUnordered, StreamExt};
use tokio::time::{sleep_until, timeout, Instant};
use tower::{Service, ServiceExt};
use tracing::Span;

use zebra_chain::serialization::DateTime32;

use crate::{
    constants, meta_addr::MetaAddrChange, peer_set::set::MorePeers, types::MetaAddr, AddressBook,
    BoxError, Request, Response,
};

#[cfg(test)]
mod tests;

/// The [`CandidateSet`] manages outbound peer connection attempts. Successful
/// connections become peers in the [`PeerSet`](super::set::PeerSet).
///
/// The candidate set divides the set of all possible outbound peers into
/// disjoint subsets, using the [`PeerAddrState`](crate::PeerAddrState):
///
/// 1. [`Responded`] peers, which we have had an outbound connection to.
/// 2. [`NeverAttemptedGossiped`] peers, which we learned about from other peers
///    but have never connected to.
/// 3. [`NeverAttemptedAlternate`] peers, canonical addresses which we learned
///    from the [`Version`] messages of inbound and outbound connections,
///    but have never connected to.
/// 4. [`Failed`] peers, which failed a connection attempt, or had an error
///    during an outbound connection.
/// 5. [`AttemptPending`] peers, which we've recently queued for a connection.
///
/// Never attempted peers are always available for connection.
///
/// If a peer's attempted, responded, or failure time is recent
/// (within the liveness limit), we avoid reconnecting to it.
/// Otherwise, we assume that it has disconnected or hung,
/// and attempt reconnection.
///
/// ```ascii,no_run
///                         ┌──────────────────┐
///                         │   Config / DNS   │
///             ┌───────────│       Seed       │───────────┐
///             │           │    Addresses     │           │
///             │           └──────────────────┘           │
///             │                    │ untrusted_last_seen │
///             │                    │     is unknown      │
///             ▼                    │                     ▼
///    ┌──────────────────┐          │          ┌──────────────────┐
///    │    Handshake     │          │          │     Peer Set     │
///    │    Canonical     │──────────┼──────────│     Gossiped     │
///    │    Addresses     │          │          │    Addresses     │
///    └──────────────────┘          │          └──────────────────┘
///     untrusted_last_seen          │                provides
///         set to now               │           untrusted_last_seen
///                                  ▼
///                                  Λ   if attempted, responded, or failed:
///                                 ╱ ╲         ignore gossiped info
///                                ▕   ▏    otherwise, if never attempted:
///                                 ╲ ╱    skip updates to existing fields
///                                  V
///  ┌───────────────────────────────┼───────────────────────────────┐
///  │ AddressBook                   │                               │
///  │ disjoint `PeerAddrState`s     ▼                               │
///  │ ┌─────────────┐  ┌─────────────────────────┐  ┌─────────────┐ │
///  │ │ `Responded` │  │`NeverAttemptedGossiped` │  │  `Failed`   │ │
/// ┌┼▶│    Peers    │  │`NeverAttemptedAlternate`│  │   Peers     │◀┼┐
/// ││ │             │  │          Peers          │  │             │ ││
/// ││ └─────────────┘  └─────────────────────────┘  └─────────────┘ ││
/// ││        │                      │                      │        ││
/// ││ #1 oldest_first        #2 newest_first        #3 oldest_first ││
/// ││        ├──────────────────────┴──────────────────────┘        ││
/// ││        ▼                                                      ││
/// ││        Λ                                                      ││
/// ││       ╱ ╲              filter by                              ││
/// ││      ▕   ▏        is_ready_for_connection_attempt             ││
/// ││       ╲ ╱    to remove recent `Responded`,                    ││
/// ││        V  `AttemptPending`, and `Failed` peers                ││
/// ││        │                                                      ││
/// ││        │    try outbound connection,                          ││
/// ││        ▼  update last_attempt to now()                        ││
/// ││┌────────────────┐                                             ││
/// │││`AttemptPending`│                                             ││
/// │││     Peers      │                                             ││
/// ││└────────────────┘                                             ││
/// │└────────┼──────────────────────────────────────────────────────┘│
/// │         ▼                                                       │
/// │         Λ                                                       │
/// │        ╱ ╲                                                      │
/// │       ▕   ▏─────────────────────────────────────────────────────┘
/// │        ╲ ╱   connection failed, update last_failure to now()
/// │         V
/// │         │
/// │         │ connection succeeded
/// │         ▼
/// │  ┌────────────┐
/// │  │    send    │
/// │  │peer::Client│
/// │  │to Discover │
/// │  └────────────┘
/// │         │
/// │         ▼
/// │┌───────────────────────────────────────┐
/// ││ every time we receive a peer message: │
/// └│  * update state to `Responded`        │
///  │  * update last_response to now()      │
///  └───────────────────────────────────────┘
/// ```
///
/// [`Responded`]: crate::PeerAddrState::Responded
/// [`Version`]: crate::protocol::external::types::Version
/// [`NeverAttemptedGossiped`]: crate::PeerAddrState::NeverAttemptedGossiped
/// [`NeverAttemptedAlternate`]: crate::PeerAddrState::NeverAttemptedAlternate
/// [`Failed`]: crate::PeerAddrState::Failed
/// [`AttemptPending`]: crate::PeerAddrState::AttemptPending
// TODO:
//   * show all possible transitions between Attempt/Responded/Failed,
//     except Failed -> Responded is invalid, must go through Attempt
//   * for now, seed peers go straight to handshaking and responded,
//     but we'll fix that once we add the Seed state
// When we add the Seed state:
//   * show that seed peers that transition to other never attempted
//     states are already in the address book
pub(crate) struct CandidateSet<S> {
    // Correctness: the address book must be private,
    //              so all operations are performed on a blocking thread (see #1976).
    /// The shared address book that candidate peers are read from, and that
    /// gossiped, reconnecting, and failed peer changes are written to.
    address_book: Arc<std::sync::Mutex<AddressBook>>,
    /// The service that `Request::Peers` crawl requests are sent to.
    peer_service: S,
    /// The earliest time at which [`next`](Self::next) will yield another
    /// candidate: `next` sleeps until this instant before returning a peer.
    min_next_handshake: Instant,
    /// The earliest time at which another peer address crawl is allowed to
    /// start: update calls before this instant skip the crawl.
    min_next_crawl: Instant,
}

impl<S> CandidateSet<S>
where
    S: Service<Request, Response = Response, Error = BoxError>,
    S::Future: Send + 'static,
{
    /// Uses `address_book` and `peer_service` to manage a [`CandidateSet`] of peers.
    pub fn new(
        address_book: Arc<std::sync::Mutex<AddressBook>>,
        peer_service: S,
    ) -> CandidateSet<S> {
        CandidateSet {
            address_book,
            peer_service,
            min_next_handshake: Instant::now(),
            min_next_crawl: Instant::now(),
        }
    }

    /// Crawls the network for new peers, without a caller-supplied fanout limit,
    /// so the default fanout limit is used.
    ///
    /// See [`update_initial`][Self::update_initial] for details.
    pub async fn update(&mut self) -> Result<Option<MorePeers>, BoxError> {
        // `None` means "no extra limit": the fanout falls back to its default.
        let no_limit: Option<usize> = None;

        self.update_timeout(no_limit).await
    }

    /// Update the peer set from the network, limiting the fanout to
    /// `fanout_limit`.
    ///
    /// - Ask a few live [`Responded`] peers to send us more peers.
    /// - Process all completed peer responses, adding new peers in the
    ///   [`NeverAttemptedGossiped`] state.
    ///
    /// Returns `Some(MorePeers)` if the crawl was successful and the crawler
    /// should ask for more peers. Returns `None` if there are no new peers.
    ///
    /// ## Correctness
    ///
    /// Pass the initial peer set size as `fanout_limit` during initialization,
    /// so that Zebra does not send duplicate requests to the same peer.
    ///
    /// The crawler exits when update returns an error, so it must only return
    /// errors on permanent failures.
    ///
    /// The handshaker sets up the peer message receiver so it also sends a
    /// [`Responded`] peer address update.
    ///
    /// [`report_failed`][Self::report_failed] puts peers into the [`Failed`] state.
    ///
    /// [`next`][Self::next] puts peers into the [`AttemptPending`] state.
    ///
    /// ## Security
    ///
    /// This call is rate-limited to prevent sending a burst of repeated requests for new peer
    /// addresses. Each call will only update the [`CandidateSet`] if more time
    /// than [`MIN_PEER_GET_ADDR_INTERVAL`][constants::MIN_PEER_GET_ADDR_INTERVAL] has passed since
    /// the last call. Otherwise, the update is skipped.
    ///
    /// [`Responded`]: crate::PeerAddrState::Responded
    /// [`NeverAttemptedGossiped`]: crate::PeerAddrState::NeverAttemptedGossiped
    /// [`Failed`]: crate::PeerAddrState::Failed
    /// [`AttemptPending`]: crate::PeerAddrState::AttemptPending
    pub async fn update_initial(
        &mut self,
        fanout_limit: usize,
    ) -> Result<Option<MorePeers>, BoxError> {
        // An explicit limit is passed down as `Some`, so the crawl can be
        // capped at the caller's value.
        let explicit_limit = Some(fanout_limit);

        self.update_timeout(explicit_limit).await
    }

    /// Crawls the network for new peers with an optional `fanout_limit`,
    /// wrapping the whole fanout in a timeout and a rate limit.
    ///
    /// Returns `Ok(None)` without crawling if the previous crawl was too
    /// recent, or if the fanout timed out. Errors from the fanout itself are
    /// passed on to the caller.
    ///
    /// See [`update_initial`][Self::update_initial] for details.
    async fn update_timeout(
        &mut self,
        fanout_limit: Option<usize>,
    ) -> Result<Option<MorePeers>, BoxError> {
        // SECURITY
        //
        // Rate limit sending `GetAddr` messages to peers:
        // skip this crawl entirely if the minimum interval has not passed yet.
        if self.min_next_crawl > Instant::now() {
            return Ok(None);
        }

        // CORRECTNESS
        //
        // Use a timeout to avoid deadlocks when there are no connected
        // peers, and:
        // - we're waiting on a handshake to complete so there are peers, or
        // - another task that handles or adds peers is waiting on this task
        //   to complete.
        let crawl_outcome = timeout(
            constants::PEER_GET_ADDR_TIMEOUT,
            self.update_fanout(fanout_limit),
        )
        .await;

        let more_peers = match crawl_outcome {
            // A fanout error returns immediately, before the rate limit is reset.
            Ok(fanout_result) => fanout_result?,
            Err(_elapsed) => {
                // update must only return an error for permanent failures,
                // so a timeout is logged and reported as "no new peers"
                info!("timeout waiting for peer service readiness or peer responses");
                None
            }
        };

        self.min_next_crawl = Instant::now() + constants::MIN_PEER_GET_ADDR_INTERVAL;

        Ok(more_peers)
    }

    /// Update the peer set from the network, limiting the fanout to
    /// `fanout_limit`.
    ///
    /// Opportunistically crawl the network on every update call to ensure
    /// we're actively fetching peers. Continue independently of whether we
    /// actually receive any peers, but always ask the network for more.
    ///
    /// Because requests are load-balanced across existing peers, we can make
    /// multiple requests concurrently, which will be randomly assigned to
    /// existing peers, but we don't make too many because update may be
    /// called while the peer set is already loaded.
    ///
    /// See [`update_initial`][Self::update_initial] for more details.
    ///
    /// # Correctness
    ///
    /// This function does not have a timeout.
    /// Use [`update_timeout`][Self::update_timeout] instead.
    async fn update_fanout(
        &mut self,
        fanout_limit: Option<usize>,
    ) -> Result<Option<MorePeers>, BoxError> {
        // Callers can lower the fanout, but they can't raise it above the default.
        let fanout_limit = min(
            fanout_limit.unwrap_or(constants::GET_ADDR_FANOUT),
            constants::GET_ADDR_FANOUT,
        );
        debug!(?fanout_limit, "sending GetPeers requests");

        // Launch requests
        let mut pending_responses = FuturesUnordered::new();
        for attempt in 0..fanout_limit {
            if attempt > 0 {
                // Let other tasks run, so we're more likely to choose a different peer.
                //
                // TODO: move fanouts into the PeerSet, so we always choose different peers (#2214)
                tokio::task::yield_now().await;
            }

            let ready_service = self.peer_service.ready().await?;
            pending_responses.push(ready_service.call(Request::Peers));
        }

        // Process responses
        let mut more_peers = None;
        let mut pending_book_updates = FuturesUnordered::new();

        while let Some(response) = pending_responses.next().await {
            let addrs = match response {
                Ok(Response::Peers(addrs)) => addrs,
                Ok(_) => unreachable!("Peers requests always return Peers responses"),
                Err(e) => {
                    // since we do a fanout, and new updates are triggered by
                    // each demand, we can ignore errors in individual responses
                    trace!(?e, "got error in GetPeers request");
                    continue;
                }
            };

            trace!(
                addr_count = ?addrs.len(),
                ?addrs,
                "got response to GetPeers"
            );

            let addrs = validate_addrs(addrs, DateTime32::now());
            pending_book_updates.push(self.send_addrs(addrs));
            more_peers = Some(MorePeers);
        }

        // Wait until all the address book updates have finished
        while pending_book_updates.next().await.is_some() {}

        Ok(more_peers)
    }

    /// Converts `addrs` into gossiped address changes, and adds them to the
    /// address book.
    ///
    /// The address book is left untouched if `addrs` is empty.
    ///
    /// # Panics
    ///
    /// If an address can't be converted into a gossiped change, if the address
    /// book lock can't be unwrapped, or if the blocking update task panics.
    async fn send_addrs(&self, addrs: impl IntoIterator<Item = MetaAddr>) {
        // # Security
        //
        // New gossiped peers are rate-limited because:
        // - Zebra initiates requests for new gossiped peers
        // - the fanout is limited
        // - the number of addresses per peer is limited
        let changes: Vec<MetaAddrChange> = addrs
            .into_iter()
            .map(|addr| {
                MetaAddr::new_gossiped_change(addr)
                    .expect("Received gossiped peers always have services set")
            })
            .collect();

        debug!(count = ?changes.len(), "sending gossiped addresses to the address book");

        // Only spawn a task if there are addresses left to add.
        if !changes.is_empty() {
            // # Correctness
            //
            // Spawn address book accesses on a blocking thread,
            // to avoid deadlocks (see #1976).
            //
            // Extend handles duplicate addresses internally.
            let address_book = self.address_book.clone();
            let span = Span::current();
            let extend_address_book =
                move || span.in_scope(|| address_book.lock().unwrap().extend(changes));

            tokio::task::spawn_blocking(extend_address_book)
                .await
                .expect("panic in new peers address book update task");
        }
    }

    /// Returns the next candidate for a connection attempt, if any are available.
    ///
    /// Returns peers in reconnection order, based on
    /// [`AddressBook::reconnection_peers`].
    ///
    /// Skips peers that have recently been active, attempted, or failed.
    ///
    /// ## Correctness
    ///
    /// `AttemptPending` peers will become [`Responded`] if they respond, or
    /// become `Failed` if they time out or provide a bad response.
    ///
    /// Live [`Responded`] peers will stay live if they keep responding, or
    /// become a reconnection candidate if they stop responding.
    ///
    /// ## Security
    ///
    /// Zebra resists distributed denial of service attacks by making sure that
    /// new peer connections are initiated at least
    /// [`MIN_OUTBOUND_PEER_CONNECTION_INTERVAL`][constants::MIN_OUTBOUND_PEER_CONNECTION_INTERVAL]
    /// apart.
    ///
    /// [`Responded`]: crate::PeerAddrState::Responded
    pub async fn next(&mut self) -> Option<MetaAddr> {
        let address_book = self.address_book.clone();
        let span = Span::current();

        // Correctness: To avoid hangs, computation in the critical section should be kept to a minimum.
        let select_candidate = move || {
            span.in_scope(|| -> Option<MetaAddr> {
                let mut book = address_book.lock().unwrap();

                // The clock is only read once the lock is held
                let instant_now = std::time::Instant::now();
                let chrono_now = Utc::now();

                // It's okay to return without sleeping here, because we're returning
                // `None`. We only need to sleep before yielding an address.
                let candidate = book.reconnection_peers(instant_now, chrono_now).next()?;

                // TODO: only mark the peer as AttemptPending when it is actually used (#1976)
                //
                // If the future is dropped before `next` returns, the peer will be marked as AttemptPending,
                // even if its address is not actually used for a connection.
                //
                // We could send a reconnect change to the AddressBookUpdater when the peer is actually used,
                // but channel order is not guaranteed, so we could accidentally re-use the same peer.
                let reconnect_change = MetaAddr::new_reconnect(candidate.addr);
                book.update(reconnect_change)
            })
        };

        // Correctness: Spawn address book accesses on a blocking thread, to avoid deadlocks (see #1976).
        let candidate = tokio::task::spawn_blocking(select_candidate)
            .await
            .expect("panic in next peer address book task")?;

        // Security: rate-limit new outbound peer connections
        sleep_until(self.min_next_handshake).await;
        self.min_next_handshake = Instant::now() + constants::MIN_OUTBOUND_PEER_CONNECTION_INTERVAL;

        Some(candidate)
    }

    /// Records a failure for the peer at `addr` in the address book.
    ///
    /// # Panics
    ///
    /// If the address book lock can't be unwrapped, or if the blocking update
    /// task panics.
    pub async fn report_failed(&mut self, addr: &MetaAddr) {
        let failure = MetaAddr::new_errored(addr.addr, addr.services);

        // # Correctness
        //
        // Spawn address book accesses on a blocking thread,
        // to avoid deadlocks (see #1976).
        let address_book = self.address_book.clone();
        let span = Span::current();
        let record_failure = move || span.in_scope(|| address_book.lock().unwrap().update(failure));

        tokio::task::spawn_blocking(record_failure)
            .await
            .expect("panic in peer failure address book update task");
    }
}

/// Checks new `addrs` before they are added to the address book, and returns
/// the addresses that should be kept.
///
/// `last_seen_limit` is the maximum permitted last seen time, typically
/// [`DateTime32::now`].
///
/// If the data in an address is invalid, this function can:
/// - modify the address data, or
/// - delete the address.
///
/// # Security
///
/// Adjusts untrusted last seen times so they are not in the future. This stops
/// malicious peers keeping all their addresses at the front of the connection
/// queue. Honest peers with future clock skew also get adjusted.
///
/// Rejects all addresses if any calculated times overflow or underflow.
fn validate_addrs(
    addrs: impl IntoIterator<Item = MetaAddr>,
    last_seen_limit: DateTime32,
) -> impl Iterator<Item = MetaAddr> {
    // Note: The address book handles duplicate addresses internally,
    // so we don't need to de-duplicate addresses here.

    // TODO:
    // We should eventually implement these checks in this function:
    // - Zebra should ignore peers that are older than 3 weeks (part of #1865)
    //   - Zebra should count back 3 weeks from the newest peer timestamp sent
    //     by the other peer, to compensate for clock skew
    // - Zebra should limit the number of addresses it uses from a single Addrs
    //   response (#1869)

    // The last seen adjustment needs to look at every address before changing
    // any of them, so the addresses are gathered into a list first.
    let mut checked_addrs = addrs.into_iter().collect::<Vec<MetaAddr>>();

    limit_last_seen_times(&mut checked_addrs, last_seen_limit);

    checked_addrs.into_iter()
}

/// Moves the reported `last_seen` times in `addrs` back, so that none of them
/// are later than `last_seen_limit`.
///
/// If the newest reported time is after the limit, every address is moved back
/// by the same offset. If applying that offset to the oldest reported time
/// would underflow, all the addresses are considered invalid, and `addrs` is
/// cleared instead.
///
/// # Panics
///
/// If any address is missing its untrusted last seen time.
fn limit_last_seen_times(addrs: &mut Vec<MetaAddr>, last_seen_limit: DateTime32) {
    let reported_times = addrs.iter().map(|peer| {
        peer.untrusted_last_seen()
            .expect("unexpected missing last seen: should be provided by deserialization")
    });
    let oldest_seen = reported_times.clone().min().unwrap_or(DateTime32::MIN);
    let newest_seen = reported_times.max().unwrap_or(DateTime32::MAX);

    // No reported time is in the future, so the addresses can be used as-is
    if newest_seen <= last_seen_limit {
        return;
    }

    // If any time is in the future, adjust all times, to compensate for clock skew on honest peers
    let skew = newest_seen
        .checked_duration_since(last_seen_limit)
        .expect("unexpected underflow: just checked newest_seen is greater");

    // An underflow will occur, so reject all gossiped peers
    if oldest_seen.checked_sub(skew).is_none() {
        addrs.clear();
        return;
    }

    // No underflow is possible, so apply the offset to all addresses
    for peer in addrs.iter_mut() {
        let adjusted = peer
            .untrusted_last_seen()
            .expect("unexpected missing last seen: should be provided by deserialization")
            .checked_sub(skew)
            .expect("unexpected underflow: just checked oldest_seen");

        peer.set_untrusted_last_seen(adjusted);
    }
}