zebra/zebra-network/src/peer_set/candidate_set.rs

use std::{cmp::min, sync::Arc};
use chrono::Utc;
use futures::stream::{FuturesUnordered, StreamExt};
use tokio::time::{sleep_until, timeout, Instant};
use tower::{Service, ServiceExt};
use zebra_chain::serialization::DateTime32;
use crate::{constants, types::MetaAddr, AddressBook, BoxError, Request, Response};
#[cfg(test)]
mod tests;
/// The [`CandidateSet`] manages outbound peer connection attempts.
/// Successful connections become peers in the [`PeerSet`].
///
/// The candidate set divides the set of all possible outbound peers into
/// disjoint subsets, using the [`PeerAddrState`]:
///
/// 1. [`Responded`] peers, which we have had an outbound connection to.
/// 2. [`NeverAttemptedGossiped`] peers, which we learned about from other peers
/// but have never connected to.
/// 3. [`NeverAttemptedAlternate`] peers, canonical addresses which we learned
/// from the [`Version`] messages of inbound and outbound connections,
/// but have never connected to.
/// 4. [`Failed`] peers, which failed a connection attempt, or had an error
/// during an outbound connection.
/// 5. [`AttemptPending`] peers, which we've recently queued for a connection.
///
/// Never attempted peers are always available for connection.
///
/// If a peer's attempted, responded, or failure time is recent
/// (within the liveness limit), we avoid reconnecting to it.
/// Otherwise, we assume that it has disconnected or hung,
/// and attempt reconnection.
///
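/// For illustration, here is a minimal sketch of that liveness check. It is a
/// hedged example, not this module's exact API: `MIN_PEER_RECONNECTION_DELAY`
/// stands in for the reconnection delay constant, and `last_attempt`,
/// `last_response`, and `last_failure` stand in for the peer's optional time
/// fields.
///
/// ```rust,ignore
/// let cutoff = Instant::now() - MIN_PEER_RECONNECTION_DELAY;
///
/// // a peer is skipped if any of its times are within the liveness limit
/// let recently_active = last_attempt.map_or(false, |time| time > cutoff)
///     || last_response.map_or(false, |time| time > cutoff)
///     || last_failure.map_or(false, |time| time > cutoff);
///
/// if !recently_active {
///     // the peer is ready for another outbound connection attempt
/// }
/// ```
///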
/// ```ascii,no_run
/// ┌──────────────────┐
/// │ Config / DNS │
/// ┌───────────│ Seed │───────────┐
/// │ │ Addresses │ │
/// │ └──────────────────┘ │
/// │ │ untrusted_last_seen │
/// │ │ is unknown │
/// ▼ │ ▼
/// ┌──────────────────┐ │ ┌──────────────────┐
/// │ Handshake │ │ │ Peer Set │
/// │ Canonical │──────────┼──────────│ Gossiped │
/// │ Addresses │ │ │ Addresses │
/// └──────────────────┘ │ └──────────────────┘
/// untrusted_last_seen │ provides
/// set to now │ untrusted_last_seen
/// ▼
/// Λ if attempted, responded, or failed:
/// ╲ ignore gossiped info
/// ▕ ▏ otherwise, if never attempted:
/// ╲ skip updates to existing fields
/// V
/// ┌───────────────────────────────┼───────────────────────────────┐
/// │ AddressBook │ │
/// │ disjoint `PeerAddrState`s ▼ │
/// │ ┌─────────────┐ ┌─────────────────────────┐ ┌─────────────┐ │
/// │ │ `Responded` │ │`NeverAttemptedGossiped` │ │ `Failed` │ │
/// ┌┼▶│ Peers │ │`NeverAttemptedAlternate`│ │ Peers │◀┼┐
/// ││ │ │ │ Peers │ │ │ ││
/// ││ └─────────────┘ └─────────────────────────┘ └─────────────┘ ││
/// ││ │ │ │ ││
/// ││ #1 oldest_first #2 newest_first #3 oldest_first ││
/// ││ ├──────────────────────┴──────────────────────┘ ││
/// ││ ▼ ││
/// ││ Λ ││
/// ││ ╲ filter by ││
/// ││ ▕ ▏ is_ready_for_connection_attempt ││
/// ││ ╲ to remove recent `Responded`, ││
/// ││ V `AttemptPending`, and `Failed` peers ││
/// ││ │ ││
/// ││ │ try outbound connection, ││
/// ││ ▼ update last_attempt to now() ││
/// ││┌────────────────┐ ││
/// │││`AttemptPending`│ ││
/// │││ Peers │ ││
/// ││└────────────────┘ ││
/// │└────────┼──────────────────────────────────────────────────────┘│
/// │ ▼ │
/// │ Λ │
/// │ ╲ │
/// │ ▕ ▏─────────────────────────────────────────────────────┘
/// │ ╲ connection failed, update last_failure to now()
/// │ V
/// │ │
/// │ │ connection succeeded
/// │ ▼
/// │ ┌────────────┐
/// │ │ send │
/// │ │peer::Client│
/// │ │to Discover │
/// │ └────────────┘
/// │ │
/// │ ▼
/// │┌───────────────────────────────────────┐
/// ││ every time we receive a peer message: │
/// └│ * update state to `Responded` │
/// │ * update last_response to now() │
/// └───────────────────────────────────────┘
/// ```
// TODO:
// * show all possible transitions between Attempt/Responded/Failed,
// except Failed -> Responded is invalid, must go through Attempt
// * for now, seed peers go straight to handshaking and responded,
// but we'll fix that once we add the Seed state
// When we add the Seed state:
// * show that seed peers that transition to other never attempted
// states are already in the address book
pub(crate) struct CandidateSet<S> {
pub(super) address_book: Arc<std::sync::Mutex<AddressBook>>,
pub(super) peer_service: S,
min_next_handshake: Instant,
min_next_crawl: Instant,
}
impl<S> CandidateSet<S>
where
S: Service<Request, Response = Response, Error = BoxError>,
S::Future: Send + 'static,
{
/// Creates a new [`CandidateSet`] that uses `address_book` and `peer_service` to manage candidate peers.
pub fn new(
address_book: Arc<std::sync::Mutex<AddressBook>>,
peer_service: S,
) -> CandidateSet<S> {
CandidateSet {
address_book,
peer_service,
min_next_handshake: Instant::now(),
min_next_crawl: Instant::now(),
}
}
/// Update the peer set from the network, using the default fanout limit.
///
/// See [`update_initial`][Self::update_initial] for details.
pub async fn update(&mut self) -> Result<(), BoxError> {
self.update_timeout(None).await
}
/// Update the peer set from the network, limiting the fanout to
/// `fanout_limit`.
///
/// - Ask a few live [`Responded`] peers to send us more peers.
/// - Process all completed peer responses, adding new peers in the
/// [`NeverAttemptedGossiped`] state.
///
/// ## Correctness
///
/// Pass the initial peer set size as `fanout_limit` during initialization,
/// so that Zebra does not send duplicate requests to the same peer.
///
/// The crawler exits when update returns an error, so it must only return
/// errors on permanent failures.
///
/// The handshaker sets up the peer message receiver so it also sends a
/// [`Responded`] peer address update.
///
/// [`report_failed`][Self::report_failed] puts peers into the [`Failed`] state.
///
/// [`next`][Self::next] puts peers into the [`AttemptPending`] state.
///
/// ## Security
///
/// This call is rate-limited to prevent sending a burst of repeated requests for new peer
/// addresses. Each call will only update the [`CandidateSet`] if more time
/// than [`MIN_PEER_GET_ADDR_INTERVAL`][constants::MIN_PEER_GET_ADDR_INTERVAL] has passed since
/// the last call. Otherwise, the update is skipped.
///
/// [`Responded`]: crate::PeerAddrState::Responded
/// [`NeverAttemptedGossiped`]: crate::PeerAddrState::NeverAttemptedGossiped
/// [`Failed`]: crate::PeerAddrState::Failed
/// [`AttemptPending`]: crate::PeerAddrState::AttemptPending
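///
/// For example, a crate-internal caller could drive the initial crawl and the
/// later periodic crawls like this (a sketch only; `candidates` and
/// `initial_peer_count` are hypothetical names, not part of this module):
///
/// ```rust,ignore
/// // During initialization, bound the fanout by the initial peer set size,
/// // so Zebra does not send duplicate requests to the same peer.
/// candidates.update_initial(initial_peer_count).await?;
///
/// // Later crawls use the default fanout limit, and are rate-limited by
/// // `MIN_PEER_GET_ADDR_INTERVAL`.
/// candidates.update().await?;
/// ```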
pub async fn update_initial(&mut self, fanout_limit: usize) -> Result<(), BoxError> {
self.update_timeout(Some(fanout_limit)).await
}
/// Update the peer set from the network, limiting the fanout to
/// `fanout_limit`, and imposing a timeout on the entire fanout.
///
/// See [`update_initial`][Self::update_initial] for details.
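///
/// The timeout wraps the entire fanout, so the body below follows this
/// pattern (a hedged sketch with a placeholder `fanout_future`):
///
/// ```rust,ignore
/// use tokio::time::timeout;
///
/// match timeout(constants::PEER_GET_ADDR_TIMEOUT, fanout_future).await {
///     // the fanout finished: propagate its result (permanent errors only)
///     Ok(fanout_result) => fanout_result?,
///     // the fanout timed out: log and skip this crawl, because the
///     // crawler exits if `update` returns an error
///     Err(_elapsed) => info!("timeout waiting for peer responses"),
/// }
/// ```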
async fn update_timeout(&mut self, fanout_limit: Option<usize>) -> Result<(), BoxError> {
// SECURITY
//
// Rate limit sending `GetAddr` messages to peers.
if self.min_next_crawl <= Instant::now() {
// CORRECTNESS
//
// Use a timeout to avoid deadlocks when there are no connected
// peers, and:
// - we're waiting on a handshake to complete so there are peers, or
// - another task that handles or adds peers is waiting on this task
// to complete.
if let Ok(fanout_result) = timeout(
constants::PEER_GET_ADDR_TIMEOUT,
self.update_fanout(fanout_limit),
)
.await
{
fanout_result?;
} else {
// update must only return an error for permanent failures
info!("timeout waiting for peer service readiness or peer responses");
}
self.min_next_crawl = Instant::now() + constants::MIN_PEER_GET_ADDR_INTERVAL;
}
Ok(())
}
/// Update the peer set from the network, limiting the fanout to
/// `fanout_limit`.
///
/// See [`update_initial`][Self::update_initial] for details.
///
/// # Correctness
///
/// This function does not have a timeout.
/// Use [`update_timeout`][Self::update_timeout] instead.
async fn update_fanout(&mut self, fanout_limit: Option<usize>) -> Result<(), BoxError> {
// Opportunistically crawl the network on every update call to ensure
// we're actively fetching peers. Continue independently of whether we
// actually receive any peers, but always ask the network for more.
//
// Because requests are load-balanced across existing peers, we can make
// multiple requests concurrently, which will be randomly assigned to
// existing peers, but we don't make too many because update may be
// called while the peer set is already loaded.
let mut responses = FuturesUnordered::new();
let fanout_limit = fanout_limit
.map(|fanout_limit| min(fanout_limit, constants::GET_ADDR_FANOUT))
.unwrap_or(constants::GET_ADDR_FANOUT);
debug!(?fanout_limit, "sending GetPeers requests");
// TODO: launch each fanout in its own task (might require tokio 1.6)
for _ in 0..fanout_limit {
let peer_service = self.peer_service.ready().await?;
responses.push(peer_service.call(Request::Peers));
}
while let Some(rsp) = responses.next().await {
match rsp {
Ok(Response::Peers(addrs)) => {
trace!(
addr_count = ?addrs.len(),
?addrs,
"got response to GetPeers"
);
let addrs = validate_addrs(addrs, DateTime32::now());
self.send_addrs(addrs);
}
Err(e) => {
// since we do a fanout, and new updates are triggered by
// each demand, we can ignore errors in individual responses
trace!(?e, "got error in GetPeers request");
}
Ok(_) => unreachable!("Peers requests always return Peers responses"),
}
}
Ok(())
}
/// Add new `addrs` to the address book.
fn send_addrs(&self, addrs: impl IntoIterator<Item = MetaAddr>) {
let addrs = addrs
.into_iter()
.map(MetaAddr::new_gossiped_change)
.map(|maybe_addr| {
maybe_addr.expect("Received gossiped peers always have services set")
});
// # Correctness
//
// Briefly hold the address book threaded mutex, to extend
// the address list.
//
// Extend handles duplicate addresses internally.
self.address_book.lock().unwrap().extend(addrs);
}
/// Returns the next candidate for a connection attempt, if any are available.
///
/// Returns peers in reconnection order, based on
/// [`AddressBook::reconnection_peers`].
///
/// Skips peers that have recently been active, attempted, or failed.
///
/// ## Correctness
///
/// `AttemptPending` peers will become `Responded` if they respond, or
/// become `Failed` if they time out or provide a bad response.
///
/// Live `Responded` peers will stay live if they keep responding, or
/// become a reconnection candidate if they stop responding.
///
/// ## Security
///
/// Zebra resists distributed denial of service attacks by making sure that
/// new peer connections are initiated at least
/// [`MIN_PEER_CONNECTION_INTERVAL`][constants::MIN_PEER_CONNECTION_INTERVAL] apart.
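///
/// ## Examples
///
/// A minimal sketch of the intended call pattern in an outbound crawler
/// loop. The `candidates` and `connect` names are illustrative, not
/// Zebra's actual crawler API:
///
/// ```ignore
/// while let Some(candidate) = candidates.next().await {
///     // `next` has already slept long enough to keep new outbound
///     // connections at least `MIN_PEER_CONNECTION_INTERVAL` apart.
///     connect(candidate.addr);
/// }
/// ```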
pub async fn next(&mut self) -> Option<MetaAddr> {
// # Correctness
//
// In this critical section, we hold the address mutex, blocking the
// current thread, and all async tasks scheduled on that thread.
//
// To avoid deadlocks, the critical section:
// - must not acquire any other locks
// - must not await any futures
//
// To avoid hangs, any computation in the critical section should
// be kept to a minimum.
let reconnect = {
let mut guard = self.address_book.lock().unwrap();
// Now that we have the lock, get the current time
let instant_now = std::time::Instant::now();
let chrono_now = Utc::now();
// It's okay to return without sleeping here, because we're returning
// `None`. We only need to sleep before yielding an address.
let reconnect = guard.reconnection_peers(instant_now, chrono_now).next()?;
let reconnect = MetaAddr::new_reconnect(&reconnect.addr);
guard.update(reconnect)?
};
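// The address book guard is dropped at the end of the block above, so the
// mutex is not held across the await below.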
// SECURITY: rate-limit new outbound peer connections
sleep_until(self.min_next_handshake).await;
self.min_next_handshake = Instant::now() + constants::MIN_PEER_CONNECTION_INTERVAL;
Some(reconnect)
}
/// Mark `addr` as a failed peer.
pub fn report_failed(&mut self, addr: &MetaAddr) {
let addr = MetaAddr::new_errored(&addr.addr, addr.services);
// # Correctness
//
// Briefly hold the address book threaded mutex, to update the state for
// a single address.
self.address_book.lock().unwrap().update(addr);
}
}
/// Check new `addrs` before adding them to the address book.
///
/// `last_seen_limit` is the maximum permitted last seen time, typically
/// [`DateTime32::now`].
///
/// If the data in an address is invalid, this function can:
/// - modify the address data, or
/// - delete the address.
///
/// # Security
///
/// Adjusts untrusted last seen times so they are not in the future. This stops
/// malicious peers from keeping all their addresses at the front of the
/// connection queue. Honest peers with future clock skew also get adjusted.
///
/// Rejects all addresses if any calculated times overflow or underflow.
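///
/// # Examples
///
/// A minimal sketch of the call pattern used when handling a `Peers`
/// response, where `gossiped_addrs` is an illustrative placeholder for the
/// received [`MetaAddr`]s:
///
/// ```ignore
/// let addrs = validate_addrs(gossiped_addrs, DateTime32::now());
/// ```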
fn validate_addrs(
addrs: impl IntoIterator<Item = MetaAddr>,
last_seen_limit: DateTime32,
) -> impl Iterator<Item = MetaAddr> {
// Note: The address book handles duplicate addresses internally,
// so we don't need to de-duplicate addresses here.
// TODO:
// We should eventually implement these checks in this function:
// - Zebra should ignore peers that are older than 3 weeks (part of #1865)
// - Zebra should count back 3 weeks from the newest peer timestamp sent
// by the other peer, to compensate for clock skew
// - Zebra should limit the number of addresses it uses from a single Addrs
// response (#1869)
let mut addrs: Vec<_> = addrs.into_iter().collect();
limit_last_seen_times(&mut addrs, last_seen_limit);
addrs.into_iter()
}
/// Ensure all reported `last_seen` times are less than or equal to `last_seen_limit`.
///
/// If shifting the `last_seen` times back to the limit would cause an
/// underflow, all the addresses are considered invalid and discarded.
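///
/// # Examples
///
/// A sketch of the intended behaviour, with times shown as seconds for
/// brevity: if `last_seen_limit` is 1000 and the reported times are
/// `[980, 1060]`, the newest time exceeds the limit by 60, so the times are
/// shifted back to `[920, 1000]`. If subtracting 60 from the oldest time
/// would underflow, all the addresses are rejected instead.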
fn limit_last_seen_times(addrs: &mut Vec<MetaAddr>, last_seen_limit: DateTime32) {
let last_seen_times = addrs.iter().map(|meta_addr| {
meta_addr
.untrusted_last_seen()
.expect("unexpected missing last seen: should be provided by deserialization")
});
let oldest_seen = last_seen_times.clone().min().unwrap_or(DateTime32::MIN);
let newest_seen = last_seen_times.max().unwrap_or(DateTime32::MAX);
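// `newest_seen` determines how far all the times must be shifted back, and
// `oldest_seen` is used to check that the shift won't underflow.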
// If any time is in the future, adjust all times to compensate for clock skew on honest peers
if newest_seen > last_seen_limit {
let offset = newest_seen
.checked_duration_since(last_seen_limit)
.expect("unexpected underflow: just checked newest_seen is greater");
// Check for underflow
if oldest_seen.checked_sub(offset).is_some() {
// No underflow is possible, so apply offset to all addresses
for addr in addrs {
let last_seen = addr
.untrusted_last_seen()
.expect("unexpected missing last seen: should be provided by deserialization");
let last_seen = last_seen
.checked_sub(offset)
.expect("unexpected underflow: just checked oldest_seen");
addr.set_untrusted_last_seen(last_seen);
}
} else {
// An underflow will occur, so reject all gossiped peers
addrs.clear();
}
}
}