fix(sync): change default sync config to improve reliability (#4670)

* Decrease the default lookahead limit to 400

* Increase the block verification timeout to 10 minutes

* Halve the default concurrent downloads config

* Try to run the spawned download task before queueing the next download

* Allow verification to be cancelled if the verifier is busy
This commit is contained in:
teor 2022-06-23 04:17:21 +10:00 committed by GitHub
parent 257f017382
commit c75a68e655
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 46 additions and 11 deletions

View File

@ -57,7 +57,7 @@ const BLOCK_DOWNLOAD_RETRY_LIMIT: usize = 3;
/// A lower bound on the user-specified lookahead limit. /// A lower bound on the user-specified lookahead limit.
/// ///
/// Set to the maximum checkpoint interval, so the pipeline holds at least one checkpoint's /// Set to the maximum checkpoint interval, so the pipeline holds around a checkpoint's
/// worth of blocks. /// worth of blocks.
/// ///
/// ## Security /// ## Security
@ -79,7 +79,9 @@ pub const MIN_LOOKAHEAD_LIMIT: usize = zebra_consensus::MAX_CHECKPOINT_HEIGHT_GA
/// The default for the user-specified lookahead limit. /// The default for the user-specified lookahead limit.
/// ///
/// See [`MIN_LOOKAHEAD_LIMIT`] for details. /// See [`MIN_LOOKAHEAD_LIMIT`] for details.
pub const DEFAULT_LOOKAHEAD_LIMIT: usize = zebra_consensus::MAX_CHECKPOINT_HEIGHT_GAP * 5; ///
/// TODO: increase to `MAX_CHECKPOINT_HEIGHT_GAP * 5`, after we implement orchard batching
pub const DEFAULT_LOOKAHEAD_LIMIT: usize = MIN_LOOKAHEAD_LIMIT;
/// The expected maximum number of hashes in an ObtainTips or ExtendTips response. /// The expected maximum number of hashes in an ObtainTips or ExtendTips response.
/// ///
@ -141,7 +143,9 @@ pub(super) const BLOCK_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(15);
/// ///
/// If this timeout is set too low, the syncer will sometimes get stuck in a /// If this timeout is set too low, the syncer will sometimes get stuck in a
/// failure loop. /// failure loop.
pub(super) const BLOCK_VERIFY_TIMEOUT: Duration = Duration::from_secs(6 * 60); ///
/// TODO: reduce to `6 * 60`, after we implement orchard batching?
pub(super) const BLOCK_VERIFY_TIMEOUT: Duration = Duration::from_secs(10 * 60);
/// Controls how long we wait to restart syncing after finishing a sync run. /// Controls how long we wait to restart syncing after finishing a sync run.
/// ///

View File

@ -118,6 +118,15 @@ pub enum BlockDownloadVerifyError {
#[error("block download & verification was cancelled during download: {hash:?}")] #[error("block download & verification was cancelled during download: {hash:?}")]
CancelledDuringDownload { hash: block::Hash }, CancelledDuringDownload { hash: block::Hash },
#[error(
"block download & verification was cancelled while waiting for the verifier service: \
to become ready: {height:?} {hash:?}"
)]
CancelledAwaitingVerifierReadiness {
height: block::Height,
hash: block::Hash,
},
#[error( #[error(
"block download & verification was cancelled during verification: {height:?} {hash:?}" "block download & verification was cancelled during verification: {height:?} {hash:?}"
)] )]
@ -282,6 +291,7 @@ where
let task = tokio::spawn( let task = tokio::spawn(
async move { async move {
// Download the block.
// Prefer the cancel handle if both are ready. // Prefer the cancel handle if both are ready.
let rsp = tokio::select! { let rsp = tokio::select! {
biased; biased;
@ -393,12 +403,24 @@ where
Err(BlockDownloadVerifyError::BehindTipHeightLimit { height: block_height, hash })?; Err(BlockDownloadVerifyError::BehindTipHeightLimit { height: block_height, hash })?;
} }
// Wait for the verifier service to be ready.
let readiness = verifier.ready();
// Prefer the cancel handle if both are ready.
let verifier = tokio::select! {
biased;
_ = &mut cancel_rx => {
trace!("task cancelled waiting for verifier service readiness");
metrics::counter!("sync.cancelled.verify.ready.count", 1);
return Err(BlockDownloadVerifyError::CancelledAwaitingVerifierReadiness { height: block_height, hash })
}
verifier = readiness => verifier,
};
// Verify the block.
let rsp = verifier let rsp = verifier
.ready()
.await
.map_err(|error| BlockDownloadVerifyError::VerifierServiceError { error })? .map_err(|error| BlockDownloadVerifyError::VerifierServiceError { error })?
.call(block); .call(block);
// Prefer the cancel handle if both are ready.
let verification = tokio::select! { let verification = tokio::select! {
biased; biased;
_ = &mut cancel_rx => { _ = &mut cancel_rx => {
@ -408,6 +430,7 @@ where
} }
verification = rsp => verification, verification = rsp => verification,
}; };
if verification.is_ok() { if verification.is_ok() {
metrics::counter!("sync.verified.block.count", 1); metrics::counter!("sync.verified.block.count", 1);
} }
@ -425,6 +448,9 @@ where
.map_err(move |e| (e, hash)), .map_err(move |e| (e, hash)),
); );
// Try to start the spawned task before queueing the next block request
tokio::task::yield_now().await;
self.pending.push(task); self.pending.push(task);
assert!( assert!(
self.cancel_handles.insert(hash, cancel_tx).is_none(), self.cancel_handles.insert(hash, cancel_tx).is_none(),

View File

@ -166,7 +166,7 @@ impl Default for MetricsSection {
#[derive(Clone, Debug, Deserialize, Serialize)] #[derive(Clone, Debug, Deserialize, Serialize)]
#[serde(deny_unknown_fields, default)] #[serde(deny_unknown_fields, default)]
pub struct SyncSection { pub struct SyncSection {
/// The maximum number of concurrent block requests during sync. /// The maximum number of concurrent block download requests during sync.
/// ///
/// This is set to a low value by default, to avoid task and /// This is set to a low value by default, to avoid task and
/// network contention. Increasing this value may improve /// network contention. Increasing this value may improve
@ -178,22 +178,27 @@ pub struct SyncSection {
/// download before waiting for queued verifications to complete. /// download before waiting for queued verifications to complete.
/// ///
/// Increasing this limit increases the buffer size, so it reduces /// Increasing this limit increases the buffer size, so it reduces
/// the impact of an individual block request failing. The block /// the impact of an individual block request failing. However, it
/// size limit is 2MB, so in theory, this could represent multiple /// also increases memory and CPU usage if block validation stalls,
/// or there are some large blocks in the pipeline.
///
/// The block size limit is 2MB, so in theory, this could represent multiple
/// gigabytes of data, if we downloaded arbitrary blocks. However, /// gigabytes of data, if we downloaded arbitrary blocks. However,
/// because we randomly load balance outbound requests, and separate /// because we randomly load balance outbound requests, and separate
/// block download from obtaining block hashes, an adversary would /// block download from obtaining block hashes, an adversary would
/// have to control a significant fraction of our peers to lead us /// have to control a significant fraction of our peers to lead us
/// astray. /// astray.
/// ///
/// This value is clamped to an implementation-defined lower bound. /// For reliable checkpoint syncing, Zebra enforces a
/// [`MIN_LOOKAHEAD_LIMIT`](sync::MIN_LOOKAHEAD_LIMIT).
pub lookahead_limit: usize, pub lookahead_limit: usize,
} }
impl Default for SyncSection { impl Default for SyncSection {
fn default() -> Self { fn default() -> Self {
Self { Self {
max_concurrent_block_requests: 50, // TODO: increase to 50, after we implement orchard batching
max_concurrent_block_requests: 25,
lookahead_limit: sync::DEFAULT_LOOKAHEAD_LIMIT, lookahead_limit: sync::DEFAULT_LOOKAHEAD_LIMIT,
} }
} }