diff --git a/zebra-network/src/config.rs b/zebra-network/src/config.rs index 2d23304de..dd2707453 100644 --- a/zebra-network/src/config.rs +++ b/zebra-network/src/config.rs @@ -2,6 +2,12 @@ use std::{collections::HashSet, net::SocketAddr, string::String, time::Duration} use zebra_chain::parameters::Network; +use crate::BoxError; + +/// The number of times Zebra will retry each initial peer, before checking if +/// any other initial peers have returned addresses. +const MAX_SINGLE_PEER_RETRIES: usize = 2; + /// Configuration for networking code. #[derive(Clone, Debug, Deserialize, Serialize)] #[serde(deny_unknown_fields, default)] @@ -32,17 +38,22 @@ pub struct Config { } impl Config { - /// Concurrently resolves `peers` into zero or more IP addresses, with a timeout - /// of a few seconds on each DNS request. + /// Concurrently resolves `peers` into zero or more IP addresses, with a + /// timeout of a few seconds on each DNS request. /// - /// If DNS resolution fails or times out for all peers, returns an empty list. - async fn parse_peers(peers: &HashSet) -> HashSet { + /// If DNS resolution fails or times out for all peers, continues retrying + /// until at least one peer is found. + async fn resolve_peers(peers: &HashSet) -> HashSet { use futures::stream::StreamExt; loop { + // We retry each peer individually, as well as retrying if there are + // no peers in the combined list. DNS failures are correlated, so all + // peers can fail DNS, leaving Zebra with a small list of custom IP + // address peers. Individual retries avoid this issue. let peer_addresses = peers .iter() - .map(|s| Config::resolve_host(s)) + .map(|s| Config::resolve_host(s, MAX_SINGLE_PEER_RETRIES)) .collect::>() .concat() .await; @@ -64,28 +75,44 @@ impl Config { /// Get the initial seed peers based on the configured network. pub async fn initial_peers(&self) -> HashSet { match self.network { - Network::Mainnet => Config::parse_peers(&self.initial_mainnet_peers).await, - Network::Testnet => Config::parse_peers(&self.initial_testnet_peers).await, + Network::Mainnet => Config::resolve_peers(&self.initial_mainnet_peers).await, + Network::Testnet => Config::resolve_peers(&self.initial_testnet_peers).await, } } + /// Resolves `host` into zero or more IP addresses, retrying up to + /// `max_retries` times. + /// + /// If DNS continues to fail, returns an empty list of addresses. + async fn resolve_host(host: &str, max_retries: usize) -> HashSet { + for retry_count in 1..=max_retries { + match Config::resolve_host_once(host).await { + Ok(addresses) => return addresses, + Err(_) => tracing::info!(?host, ?retry_count, "Retrying peer DNS resolution"), + }; + tokio::time::sleep(crate::constants::DNS_LOOKUP_TIMEOUT).await; + } + + HashSet::new() + } + /// Resolves `host` into zero or more IP addresses. /// /// If `host` is a DNS name, performs DNS resolution with a timeout of a few seconds. - /// If DNS resolution fails or times out, returns an empty list. - async fn resolve_host(host: &str) -> HashSet { + /// If DNS resolution fails or times out, returns an error. + async fn resolve_host_once(host: &str) -> Result, BoxError> { let fut = tokio::net::lookup_host(host); let fut = tokio::time::timeout(crate::constants::DNS_LOOKUP_TIMEOUT, fut); match fut.await { - Ok(Ok(ips)) => ips.collect(), + Ok(Ok(ips)) => Ok(ips.collect()), Ok(Err(e)) => { tracing::info!(?host, ?e, "DNS error resolving peer IP address"); - HashSet::new() + Err(e.into()) } Err(e) => { tracing::info!(?host, ?e, "DNS timeout resolving peer IP address"); - HashSet::new() + Err(e.into()) } } }