Rate limit initial genesis block download retries, Credit: Equilibrium (#2255)

* implement and test a rate limit in `request_genesis()`
* add `request_genesis_is_rate_limited` test to sync
* add ensure_timeouts constraint for GENESIS_TIMEOUT_RETRY
* Suppress expected warning logs in zebrad tests

Co-authored-by: teor <teor@riseup.net>
This commit is contained in:
Alfredo Garcia 2021-06-09 20:39:51 -03:00 committed by GitHub
parent 53e50821ec
commit 96a1b661f0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 167 additions and 55 deletions

View File

@ -37,6 +37,7 @@ pub fn init() {
EnvFilter::try_new("warn")
.unwrap()
.add_directive("zebra_consensus=error".parse().unwrap())
.add_directive("zebrad=error".parse().unwrap())
});
tracing_subscriber::registry()

View File

@ -21,6 +21,9 @@ use zebra_state as zs;
use crate::{config::ZebradConfig, BoxError};
mod downloads;
#[cfg(test)]
mod tests;
use downloads::{AlwaysHedge, Downloads};
/// Controls the number of peers used for each ObtainTips and ExtendTips request.
@ -130,6 +133,18 @@ pub(super) const BLOCK_VERIFY_TIMEOUT: Duration = Duration::from_secs(180);
/// previous sync runs.
const SYNC_RESTART_DELAY: Duration = Duration::from_secs(61);
/// Controls how long we wait to retry a failed attempt to download
/// and verify the genesis block.
///
/// This timeout gives the crawler time to find better peers.
///
/// ## Security
///
/// If this timeout is removed (or set too low), Zebra will immediately retry
/// to download and verify the genesis block from its peers. This can cause
/// a denial of service on those peers.
const GENESIS_TIMEOUT_RETRY: Duration = Duration::from_secs(5);
/// Helps work around defects in the bitcoin protocol by checking whether
/// the returned hashes actually extend a chain tip.
#[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)]
@ -615,7 +630,8 @@ where
match self.downloads.next().await.expect("downloads is nonempty") {
Ok(hash) => tracing::trace!(?hash, "verified and committed block to state"),
Err(e) => {
tracing::warn!(?e, "could not download or verify genesis block, retrying")
tracing::warn!(?e, "could not download or verify genesis block, retrying");
tokio::time::sleep(GENESIS_TIMEOUT_RETRY).await;
}
}
}
@ -665,57 +681,3 @@ where
);
}
}
#[cfg(test)]
mod test {
use zebra_chain::parameters::NetworkUpgrade;
use super::*;
/// Make sure the timeout values are consistent with each other.
#[test]
fn ensure_timeouts_consistent() {
zebra_test::init();
// This constraint clears the download pipeline during a restart
assert!(
SYNC_RESTART_DELAY.as_secs() > 2 * BLOCK_DOWNLOAD_TIMEOUT.as_secs(),
"Sync restart should allow for pending and buffered requests to complete"
);
// This constraint avoids spurious failures due to block retries timing out.
// We multiply by 2, because the Hedge can wait up to BLOCK_DOWNLOAD_TIMEOUT
// seconds before retrying.
const BLOCK_DOWNLOAD_HEDGE_TIMEOUT: u64 =
2 * BLOCK_DOWNLOAD_RETRY_LIMIT as u64 * BLOCK_DOWNLOAD_TIMEOUT.as_secs();
assert!(
SYNC_RESTART_DELAY.as_secs() > BLOCK_DOWNLOAD_HEDGE_TIMEOUT,
"Sync restart should allow for block downloads to time out on every retry"
);
// This constraint avoids spurious failures due to block download timeouts
assert!(
BLOCK_VERIFY_TIMEOUT.as_secs()
> SYNC_RESTART_DELAY.as_secs()
+ BLOCK_DOWNLOAD_HEDGE_TIMEOUT
+ BLOCK_DOWNLOAD_TIMEOUT.as_secs(),
"Block verify should allow for a block timeout, a sync restart, and some block fetches"
);
// The minimum recommended network speed for Zebra, in bytes per second.
const MIN_NETWORK_SPEED_BYTES_PER_SEC: u64 = 10 * 1024 * 1024 / 8;
// This constraint avoids spurious failures when restarting large checkpoints
assert!(
BLOCK_VERIFY_TIMEOUT.as_secs() > SYNC_RESTART_DELAY.as_secs() + 2 * zebra_consensus::MAX_CHECKPOINT_BYTE_COUNT / MIN_NETWORK_SPEED_BYTES_PER_SEC,
"Block verify should allow for a full checkpoint download, a sync restart, then a full checkpoint re-download"
);
// This constraint avoids spurious failures after checkpointing has finished
assert!(
BLOCK_VERIFY_TIMEOUT.as_secs()
> 2 * NetworkUpgrade::Blossom.target_spacing().num_seconds() as u64,
"Block verify should allow for at least one new block to be generated and distributed"
);
}
}

View File

@ -0,0 +1 @@
mod timing;

View File

@ -0,0 +1,148 @@
use futures::future;
use std::sync::{
atomic::{AtomicU8, Ordering},
Arc,
};
use tokio::{
runtime::Runtime,
time::{timeout, Duration},
};
use super::super::*;
use crate::config::ZebradConfig;
/// Make sure the timeout values are consistent with each other.
#[test]
fn ensure_timeouts_consistent() {
zebra_test::init();
// This constraint clears the download pipeline during a restart
assert!(
SYNC_RESTART_DELAY.as_secs() > 2 * BLOCK_DOWNLOAD_TIMEOUT.as_secs(),
"Sync restart should allow for pending and buffered requests to complete"
);
// This constraint avoids spurious failures due to block retries timing out.
// We multiply by 2, because the Hedge can wait up to BLOCK_DOWNLOAD_TIMEOUT
// seconds before retrying.
const BLOCK_DOWNLOAD_HEDGE_TIMEOUT: u64 =
2 * BLOCK_DOWNLOAD_RETRY_LIMIT as u64 * BLOCK_DOWNLOAD_TIMEOUT.as_secs();
assert!(
SYNC_RESTART_DELAY.as_secs() > BLOCK_DOWNLOAD_HEDGE_TIMEOUT,
"Sync restart should allow for block downloads to time out on every retry"
);
// This constraint avoids spurious failures due to block download timeouts
assert!(
BLOCK_VERIFY_TIMEOUT.as_secs()
> SYNC_RESTART_DELAY.as_secs()
+ BLOCK_DOWNLOAD_HEDGE_TIMEOUT
+ BLOCK_DOWNLOAD_TIMEOUT.as_secs(),
"Block verify should allow for a block timeout, a sync restart, and some block fetches"
);
// The minimum recommended network speed for Zebra, in bytes per second.
const MIN_NETWORK_SPEED_BYTES_PER_SEC: u64 = 10 * 1024 * 1024 / 8;
// This constraint avoids spurious failures when restarting large checkpoints
assert!(
BLOCK_VERIFY_TIMEOUT.as_secs() > SYNC_RESTART_DELAY.as_secs() + 2 * zebra_consensus::MAX_CHECKPOINT_BYTE_COUNT / MIN_NETWORK_SPEED_BYTES_PER_SEC,
"Block verify should allow for a full checkpoint download, a sync restart, then a full checkpoint re-download"
);
// This constraint avoids spurious failures after checkpointing has finished
assert!(
BLOCK_VERIFY_TIMEOUT.as_secs()
> 2 * zebra_chain::parameters::NetworkUpgrade::Blossom
.target_spacing()
.num_seconds() as u64,
"Block verify should allow for at least one new block to be generated and distributed"
);
// This constraint makes genesis retries more likely to succeed
assert!(
GENESIS_TIMEOUT_RETRY.as_secs() > zebra_network::constants::HANDSHAKE_TIMEOUT.as_secs()
&& GENESIS_TIMEOUT_RETRY.as_secs() < BLOCK_DOWNLOAD_TIMEOUT.as_secs(),
"Genesis retries should wait for new peers, but they shouldn't wait too long"
);
}
/// Test that calls to [`ChainSync::request_genesis`] are rate limited.
#[test]
fn request_genesis_is_rate_limited() {
zebra_test::init();
// The number of calls to `request_genesis()` we are going to be testing for
const RETRIES_TO_RUN: u8 = 3;
// create some counters that will be updated inside async blocks
let peer_requests_counter = Arc::new(AtomicU8::new(0));
let peer_requests_counter_in_service = Arc::clone(&peer_requests_counter);
let state_requests_counter = Arc::new(AtomicU8::new(0));
let state_requests_counter_in_service = Arc::clone(&state_requests_counter);
let runtime = Runtime::new().expect("Failed to create Tokio runtime");
let _guard = runtime.enter();
// create a fake peer service that respond with `Error` to `BlocksByHash` or
// panic in any other type of request.
let peer_service = tower::service_fn(move |request| {
match request {
zebra_network::Request::BlocksByHash(_) => {
// Track the call
peer_requests_counter_in_service.fetch_add(1, Ordering::SeqCst);
// Respond with `Error`
future::err("block not found".into())
}
_ => unreachable!("no other request is allowed"),
}
});
// create a state service that respond with `None` to `Depth` or
// panic in any other type of request.
let state_service = tower::service_fn(move |request| {
match request {
zebra_state::Request::Depth(_) => {
// Track the call
state_requests_counter_in_service.fetch_add(1, Ordering::SeqCst);
// Respond with `None`
future::ok(zebra_state::Response::Depth(None))
}
_ => unreachable!("no other request is allowed"),
}
});
// create a verifier service that will always panic as it will never be called
let verifier_service =
tower::service_fn(
move |_| async move { unreachable!("no request to this service is allowed") },
);
// start the sync
let mut chain_sync = ChainSync::new(
&ZebradConfig::default(),
peer_service,
state_service,
verifier_service,
);
// run `request_genesis()` with a timeout of 13 seconds
runtime.block_on(async move {
// allow extra wall clock time for tests on CPU-bound machines
let retries_timeout = (RETRIES_TO_RUN - 1) as u64 * GENESIS_TIMEOUT_RETRY.as_secs()
+ GENESIS_TIMEOUT_RETRY.as_secs() / 2;
let _ = timeout(
Duration::from_secs(retries_timeout),
chain_sync.request_genesis(),
)
.await;
});
let peer_requests_counter = peer_requests_counter.load(Ordering::SeqCst);
assert!(peer_requests_counter >= RETRIES_TO_RUN);
assert!(peer_requests_counter <= RETRIES_TO_RUN * (BLOCK_DOWNLOAD_RETRY_LIMIT as u8) * 2);
assert_eq!(
state_requests_counter.load(Ordering::SeqCst),
RETRIES_TO_RUN
);
}