fix(sync): change default sync config to improve reliability (#4670)
* Decrease the default lookahead limit to 400 * Increase the block verification timeout to 10 minutes * Halve the default concurrent downloads config * Try to run the spawned download task before queueing the next download * Allow verification to be cancelled if the verifier is busy
This commit is contained in:
parent
257f017382
commit
c75a68e655
|
@ -57,7 +57,7 @@ const BLOCK_DOWNLOAD_RETRY_LIMIT: usize = 3;
|
||||||
|
|
||||||
/// A lower bound on the user-specified lookahead limit.
|
/// A lower bound on the user-specified lookahead limit.
|
||||||
///
|
///
|
||||||
/// Set to the maximum checkpoint interval, so the pipeline holds at least one checkpoint's
|
/// Set to the maximum checkpoint interval, so the pipeline holds around a checkpoint's
|
||||||
/// worth of blocks.
|
/// worth of blocks.
|
||||||
///
|
///
|
||||||
/// ## Security
|
/// ## Security
|
||||||
|
@ -79,7 +79,9 @@ pub const MIN_LOOKAHEAD_LIMIT: usize = zebra_consensus::MAX_CHECKPOINT_HEIGHT_GA
|
||||||
/// The default for the user-specified lookahead limit.
|
/// The default for the user-specified lookahead limit.
|
||||||
///
|
///
|
||||||
/// See [`MIN_LOOKAHEAD_LIMIT`] for details.
|
/// See [`MIN_LOOKAHEAD_LIMIT`] for details.
|
||||||
pub const DEFAULT_LOOKAHEAD_LIMIT: usize = zebra_consensus::MAX_CHECKPOINT_HEIGHT_GAP * 5;
|
///
|
||||||
|
/// TODO: increase to `MAX_CHECKPOINT_HEIGHT_GAP * 5`, after we implement orchard batching
|
||||||
|
pub const DEFAULT_LOOKAHEAD_LIMIT: usize = MIN_LOOKAHEAD_LIMIT;
|
||||||
|
|
||||||
/// The expected maximum number of hashes in an ObtainTips or ExtendTips response.
|
/// The expected maximum number of hashes in an ObtainTips or ExtendTips response.
|
||||||
///
|
///
|
||||||
|
@ -141,7 +143,9 @@ pub(super) const BLOCK_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(15);
|
||||||
///
|
///
|
||||||
/// If this timeout is set too low, the syncer will sometimes get stuck in a
|
/// If this timeout is set too low, the syncer will sometimes get stuck in a
|
||||||
/// failure loop.
|
/// failure loop.
|
||||||
pub(super) const BLOCK_VERIFY_TIMEOUT: Duration = Duration::from_secs(6 * 60);
|
///
|
||||||
|
/// TODO: reduce to `6 * 60`, after we implement orchard batching?
|
||||||
|
pub(super) const BLOCK_VERIFY_TIMEOUT: Duration = Duration::from_secs(10 * 60);
|
||||||
|
|
||||||
/// Controls how long we wait to restart syncing after finishing a sync run.
|
/// Controls how long we wait to restart syncing after finishing a sync run.
|
||||||
///
|
///
|
||||||
|
|
|
@ -118,6 +118,15 @@ pub enum BlockDownloadVerifyError {
|
||||||
#[error("block download & verification was cancelled during download: {hash:?}")]
|
#[error("block download & verification was cancelled during download: {hash:?}")]
|
||||||
CancelledDuringDownload { hash: block::Hash },
|
CancelledDuringDownload { hash: block::Hash },
|
||||||
|
|
||||||
|
#[error(
|
||||||
|
"block download & verification was cancelled while waiting for the verifier service: \
|
||||||
|
to become ready: {height:?} {hash:?}"
|
||||||
|
)]
|
||||||
|
CancelledAwaitingVerifierReadiness {
|
||||||
|
height: block::Height,
|
||||||
|
hash: block::Hash,
|
||||||
|
},
|
||||||
|
|
||||||
#[error(
|
#[error(
|
||||||
"block download & verification was cancelled during verification: {height:?} {hash:?}"
|
"block download & verification was cancelled during verification: {height:?} {hash:?}"
|
||||||
)]
|
)]
|
||||||
|
@ -282,6 +291,7 @@ where
|
||||||
|
|
||||||
let task = tokio::spawn(
|
let task = tokio::spawn(
|
||||||
async move {
|
async move {
|
||||||
|
// Download the block.
|
||||||
// Prefer the cancel handle if both are ready.
|
// Prefer the cancel handle if both are ready.
|
||||||
let rsp = tokio::select! {
|
let rsp = tokio::select! {
|
||||||
biased;
|
biased;
|
||||||
|
@ -393,12 +403,24 @@ where
|
||||||
Err(BlockDownloadVerifyError::BehindTipHeightLimit { height: block_height, hash })?;
|
Err(BlockDownloadVerifyError::BehindTipHeightLimit { height: block_height, hash })?;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Wait for the verifier service to be ready.
|
||||||
|
let readiness = verifier.ready();
|
||||||
|
// Prefer the cancel handle if both are ready.
|
||||||
|
let verifier = tokio::select! {
|
||||||
|
biased;
|
||||||
|
_ = &mut cancel_rx => {
|
||||||
|
trace!("task cancelled waiting for verifier service readiness");
|
||||||
|
metrics::counter!("sync.cancelled.verify.ready.count", 1);
|
||||||
|
return Err(BlockDownloadVerifyError::CancelledAwaitingVerifierReadiness { height: block_height, hash })
|
||||||
|
}
|
||||||
|
verifier = readiness => verifier,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Verify the block.
|
||||||
let rsp = verifier
|
let rsp = verifier
|
||||||
.ready()
|
|
||||||
.await
|
|
||||||
.map_err(|error| BlockDownloadVerifyError::VerifierServiceError { error })?
|
.map_err(|error| BlockDownloadVerifyError::VerifierServiceError { error })?
|
||||||
.call(block);
|
.call(block);
|
||||||
// Prefer the cancel handle if both are ready.
|
|
||||||
let verification = tokio::select! {
|
let verification = tokio::select! {
|
||||||
biased;
|
biased;
|
||||||
_ = &mut cancel_rx => {
|
_ = &mut cancel_rx => {
|
||||||
|
@ -408,6 +430,7 @@ where
|
||||||
}
|
}
|
||||||
verification = rsp => verification,
|
verification = rsp => verification,
|
||||||
};
|
};
|
||||||
|
|
||||||
if verification.is_ok() {
|
if verification.is_ok() {
|
||||||
metrics::counter!("sync.verified.block.count", 1);
|
metrics::counter!("sync.verified.block.count", 1);
|
||||||
}
|
}
|
||||||
|
@ -425,6 +448,9 @@ where
|
||||||
.map_err(move |e| (e, hash)),
|
.map_err(move |e| (e, hash)),
|
||||||
);
|
);
|
||||||
|
|
||||||
|
// Try to start the spawned task before queueing the next block request
|
||||||
|
tokio::task::yield_now().await;
|
||||||
|
|
||||||
self.pending.push(task);
|
self.pending.push(task);
|
||||||
assert!(
|
assert!(
|
||||||
self.cancel_handles.insert(hash, cancel_tx).is_none(),
|
self.cancel_handles.insert(hash, cancel_tx).is_none(),
|
||||||
|
|
|
@ -166,7 +166,7 @@ impl Default for MetricsSection {
|
||||||
#[derive(Clone, Debug, Deserialize, Serialize)]
|
#[derive(Clone, Debug, Deserialize, Serialize)]
|
||||||
#[serde(deny_unknown_fields, default)]
|
#[serde(deny_unknown_fields, default)]
|
||||||
pub struct SyncSection {
|
pub struct SyncSection {
|
||||||
/// The maximum number of concurrent block requests during sync.
|
/// The maximum number of concurrent block download requests during sync.
|
||||||
///
|
///
|
||||||
/// This is set to a low value by default, to avoid task and
|
/// This is set to a low value by default, to avoid task and
|
||||||
/// network contention. Increasing this value may improve
|
/// network contention. Increasing this value may improve
|
||||||
|
@ -178,22 +178,27 @@ pub struct SyncSection {
|
||||||
/// download before waiting for queued verifications to complete.
|
/// download before waiting for queued verifications to complete.
|
||||||
///
|
///
|
||||||
/// Increasing this limit increases the buffer size, so it reduces
|
/// Increasing this limit increases the buffer size, so it reduces
|
||||||
/// the impact of an individual block request failing. The block
|
/// the impact of an individual block request failing. However, it
|
||||||
/// size limit is 2MB, so in theory, this could represent multiple
|
/// also increases memory and CPU usage if block validation stalls,
|
||||||
|
/// or there are some large blocks in the pipeline.
|
||||||
|
///
|
||||||
|
/// The block size limit is 2MB, so in theory, this could represent multiple
|
||||||
/// gigabytes of data, if we downloaded arbitrary blocks. However,
|
/// gigabytes of data, if we downloaded arbitrary blocks. However,
|
||||||
/// because we randomly load balance outbound requests, and separate
|
/// because we randomly load balance outbound requests, and separate
|
||||||
/// block download from obtaining block hashes, an adversary would
|
/// block download from obtaining block hashes, an adversary would
|
||||||
/// have to control a significant fraction of our peers to lead us
|
/// have to control a significant fraction of our peers to lead us
|
||||||
/// astray.
|
/// astray.
|
||||||
///
|
///
|
||||||
/// This value is clamped to an implementation-defined lower bound.
|
/// For reliable checkpoint syncing, Zebra enforces a
|
||||||
|
/// [`MIN_LOOKAHEAD_LIMIT`](sync::MIN_LOOKAHEAD_LIMIT).
|
||||||
pub lookahead_limit: usize,
|
pub lookahead_limit: usize,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for SyncSection {
|
impl Default for SyncSection {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
Self {
|
Self {
|
||||||
max_concurrent_block_requests: 50,
|
// TODO: increase to 50, after we implement orchard batching
|
||||||
|
max_concurrent_block_requests: 25,
|
||||||
lookahead_limit: sync::DEFAULT_LOOKAHEAD_LIMIT,
|
lookahead_limit: sync::DEFAULT_LOOKAHEAD_LIMIT,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue