Fix a peer DNS resolution edge case (#1796)

* Retry each peer DNS a few times individually

We retry each peer individually, as well as retrying if there are no
peers in the combined list.

DNS failures are correlated, so all peers can fail DNS, leaving Zebra
with a small list of custom-configured IP address peers.

Individual retries avoid this issue.

* Rename parse_peers to resolve_peers

Co-authored-by: Deirdre Connolly <durumcrustulum@gmail.com>
This commit is contained in:
teor 2021-02-26 09:06:27 +10:00 committed by GitHub
parent 70327dc9f5
commit 2587a4e272
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 39 additions and 12 deletions

View File

@ -2,6 +2,12 @@ use std::{collections::HashSet, net::SocketAddr, string::String, time::Duration}
use zebra_chain::parameters::Network; use zebra_chain::parameters::Network;
use crate::BoxError;
/// The maximum number of DNS resolution attempts Zebra makes for each initial
/// peer, before checking if any other initial peers have returned addresses.
///
/// Passed to `Config::resolve_host` as its `max_retries` argument, where it
/// bounds the total number of lookups for a single peer.
const MAX_SINGLE_PEER_RETRIES: usize = 2;
/// Configuration for networking code. /// Configuration for networking code.
#[derive(Clone, Debug, Deserialize, Serialize)] #[derive(Clone, Debug, Deserialize, Serialize)]
#[serde(deny_unknown_fields, default)] #[serde(deny_unknown_fields, default)]
@ -32,17 +38,22 @@ pub struct Config {
} }
impl Config { impl Config {
/// Concurrently resolves `peers` into zero or more IP addresses, with a timeout /// Concurrently resolves `peers` into zero or more IP addresses, with a
/// of a few seconds on each DNS request. /// timeout of a few seconds on each DNS request.
/// ///
/// If DNS resolution fails or times out for all peers, returns an empty list. /// If DNS resolution fails or times out for all peers, continues retrying
async fn parse_peers(peers: &HashSet<String>) -> HashSet<SocketAddr> { /// until at least one peer is found.
async fn resolve_peers(peers: &HashSet<String>) -> HashSet<SocketAddr> {
use futures::stream::StreamExt; use futures::stream::StreamExt;
loop { loop {
// We retry each peer individually, as well as retrying if there are
// no peers in the combined list. DNS failures are correlated, so all
// peers can fail DNS, leaving Zebra with a small list of custom IP
// address peers. Individual retries avoid this issue.
let peer_addresses = peers let peer_addresses = peers
.iter() .iter()
.map(|s| Config::resolve_host(s)) .map(|s| Config::resolve_host(s, MAX_SINGLE_PEER_RETRIES))
.collect::<futures::stream::FuturesUnordered<_>>() .collect::<futures::stream::FuturesUnordered<_>>()
.concat() .concat()
.await; .await;
@ -64,28 +75,44 @@ impl Config {
/// Get the initial seed peers based on the configured network. /// Get the initial seed peers based on the configured network.
pub async fn initial_peers(&self) -> HashSet<SocketAddr> { pub async fn initial_peers(&self) -> HashSet<SocketAddr> {
match self.network { match self.network {
Network::Mainnet => Config::parse_peers(&self.initial_mainnet_peers).await, Network::Mainnet => Config::resolve_peers(&self.initial_mainnet_peers).await,
Network::Testnet => Config::parse_peers(&self.initial_testnet_peers).await, Network::Testnet => Config::resolve_peers(&self.initial_testnet_peers).await,
} }
} }
/// Resolves `host` into zero or more IP addresses, making up to
/// `max_retries` DNS resolution attempts in total.
///
/// Returns the addresses from the first successful attempt. Waits for
/// `DNS_LOOKUP_TIMEOUT` between failed attempts, but not after the final
/// one, so the caller isn't delayed once there is nothing left to retry.
///
/// If every attempt fails, or `max_retries` is zero, returns an empty set
/// of addresses.
async fn resolve_host(host: &str, max_retries: usize) -> HashSet<SocketAddr> {
    for retry_count in 1..=max_retries {
        match Config::resolve_host_once(host).await {
            Ok(addresses) => return addresses,
            // Out of attempts: don't claim we're retrying, and don't sleep
            // before giving up. `resolve_host_once` has already logged the
            // underlying error.
            Err(_) if retry_count == max_retries => break,
            Err(_) => tracing::info!(?host, ?retry_count, "Retrying peer DNS resolution"),
        }

        // Back off before the next attempt.
        tokio::time::sleep(crate::constants::DNS_LOOKUP_TIMEOUT).await;
    }

    HashSet::new()
}
/// Resolves `host` into zero or more IP addresses. /// Resolves `host` into zero or more IP addresses.
/// ///
/// If `host` is a DNS name, performs DNS resolution with a timeout of a few seconds. /// If `host` is a DNS name, performs DNS resolution with a timeout of a few seconds.
/// If DNS resolution fails or times out, returns an empty list. /// If DNS resolution fails or times out, returns an error.
async fn resolve_host(host: &str) -> HashSet<SocketAddr> { async fn resolve_host_once(host: &str) -> Result<HashSet<SocketAddr>, BoxError> {
let fut = tokio::net::lookup_host(host); let fut = tokio::net::lookup_host(host);
let fut = tokio::time::timeout(crate::constants::DNS_LOOKUP_TIMEOUT, fut); let fut = tokio::time::timeout(crate::constants::DNS_LOOKUP_TIMEOUT, fut);
match fut.await { match fut.await {
Ok(Ok(ips)) => ips.collect(), Ok(Ok(ips)) => Ok(ips.collect()),
Ok(Err(e)) => { Ok(Err(e)) => {
tracing::info!(?host, ?e, "DNS error resolving peer IP address"); tracing::info!(?host, ?e, "DNS error resolving peer IP address");
HashSet::new() Err(e.into())
} }
Err(e) => { Err(e) => {
tracing::info!(?host, ?e, "DNS timeout resolving peer IP address"); tracing::info!(?host, ?e, "DNS timeout resolving peer IP address");
HashSet::new() Err(e.into())
} }
} }
} }