Add some additional sync correctness constraints

And adjust the sync restart delay as a consequence.
This commit is contained in:
teor 2021-01-13 19:10:48 +10:00 committed by Deirdre Connolly
parent cef0a492d8
commit 3699bbdae6
4 changed files with 61 additions and 14 deletions

View File

@ -88,6 +88,14 @@ pub const MAX_QUEUED_BLOCKS_PER_HEIGHT: usize = 4;
/// bugs. So the most efficient gap is slightly less than 500 blocks.
pub const MAX_CHECKPOINT_HEIGHT_GAP: usize = 400;
/// We limit the memory usage and download contention for each checkpoint,
/// based on the cumulative size of the serialized blocks in the chain.
///
/// Deserialized blocks (in memory) are slightly larger than serialized blocks
/// (on the network or disk). But they should be within a constant factor of the
/// serialized size.
pub const MAX_CHECKPOINT_BYTE_COUNT: u64 = 32 * 1024 * 1024;
/// A checkpointing block verifier.
///
/// Verifies blocks using a supplied list of checkpoints. There must be at

View File

@ -53,6 +53,7 @@ mod transaction;
pub mod chain;
pub mod error;
pub use checkpoint::MAX_CHECKPOINT_BYTE_COUNT;
pub use checkpoint::MAX_CHECKPOINT_HEIGHT_GAP;
pub use config::Config;

View File

@ -24,12 +24,6 @@ use std::os::unix::process::ExitStatusExt;
mod args;
/// We limit the memory usage for each checkpoint, based on the cumulative size of
/// the serialized blocks in the chain. Deserialized blocks are slightly larger
/// than serialized blocks, but they should be within a constant factor of the
/// serialized size.
const MAX_CHECKPOINT_BYTE_COUNT: u64 = 32 * 1024 * 1024;
/// Initialise tracing using its defaults.
fn init_tracing() {
tracing_subscriber::Registry::default()
@ -135,7 +129,7 @@ fn main() -> Result<()> {
// check if checkpoint
if height == block::Height(0)
|| cumulative_bytes >= MAX_CHECKPOINT_BYTE_COUNT
|| cumulative_bytes >= zebra_consensus::MAX_CHECKPOINT_BYTE_COUNT
|| height_gap.0 >= zebra_consensus::MAX_CHECKPOINT_HEIGHT_GAP as u32
{
// print to output

View File

@ -88,16 +88,22 @@ const BLOCK_VERIFY_TIMEOUT: Duration = Duration::from_secs(180);
/// Controls how long we wait to restart syncing after finishing a sync run.
///
/// This timeout should be long enough to:
/// This delay should be long enough to:
/// - allow zcashd peers to process pending requests. If the node only has a
/// few peers, we want to clear as much peer state as possible. In
/// particular, zcashd sends "next block range" hints, based on zcashd's
/// internal model of our sync progress. But we want to discard these hints,
/// so they don't get confused with ObtainTips and ExtendTips responses.
///
/// This timeout is particularly important on instances with slow or unreliable
/// This delay is particularly important on instances with slow or unreliable
/// networks, and on testnet, which has a small number of slow peers.
const SYNC_RESTART_TIMEOUT: Duration = Duration::from_secs(45);
///
/// ## Correctness
///
/// If this delay is removed (or set too low), the syncer will
/// sometimes get stuck in a failure loop, due to leftover downloads from
/// previous sync runs.
const SYNC_RESTART_DELAY: Duration = Duration::from_secs(61);
type BoxError = Box<dyn std::error::Error + Send + Sync + 'static>;
@ -178,7 +184,7 @@ where
AlwaysHedge,
20,
0.95,
2 * SYNC_RESTART_TIMEOUT,
2 * SYNC_RESTART_DELAY,
);
// We apply a timeout to the verifier to avoid hangs due to missing earlier blocks.
let verifier = Timeout::new(verifier, BLOCK_VERIFY_TIMEOUT);
@ -204,11 +210,11 @@ where
'sync: loop {
if started_once {
tracing::info!(timeout = ?SYNC_RESTART_TIMEOUT, "waiting to restart sync");
tracing::info!(timeout = ?SYNC_RESTART_DELAY, "waiting to restart sync");
self.prospective_tips = HashSet::new();
self.downloads.cancel_all();
self.update_metrics();
sleep(SYNC_RESTART_TIMEOUT).await;
sleep(SYNC_RESTART_DELAY).await;
} else {
started_once = true;
}
@ -613,6 +619,8 @@ where
#[cfg(test)]
mod test {
use zebra_chain::parameters::NetworkUpgrade;
use super::*;
/// Make sure the timeout values are consistent with each other.
@ -620,9 +628,45 @@ mod test {
fn ensure_timeouts_consistent() {
zebra_test::init();
// This constraint clears the download pipeline during a restart
assert!(
BLOCK_DOWNLOAD_TIMEOUT.as_secs() * 2 < SYNC_RESTART_TIMEOUT.as_secs(),
SYNC_RESTART_DELAY.as_secs() > 2 * BLOCK_DOWNLOAD_TIMEOUT.as_secs(),
"Sync restart should allow for pending and buffered requests to complete"
);
// This constraint avoids spurious failures due to block retries timing out.
// We multiply by 2, because the Hedge can wait up to BLOCK_DOWNLOAD_TIMEOUT
// seconds before retrying.
const BLOCK_DOWNLOAD_HEDGE_TIMEOUT: u64 =
2 * BLOCK_DOWNLOAD_RETRY_LIMIT as u64 * BLOCK_DOWNLOAD_TIMEOUT.as_secs();
assert!(
SYNC_RESTART_DELAY.as_secs() > BLOCK_DOWNLOAD_HEDGE_TIMEOUT,
"Sync restart should allow for block downloads to time out on every retry"
);
// This constraint avoids spurious failures due to block download timeouts
assert!(
BLOCK_VERIFY_TIMEOUT.as_secs()
> SYNC_RESTART_DELAY.as_secs()
+ BLOCK_DOWNLOAD_HEDGE_TIMEOUT
+ BLOCK_DOWNLOAD_TIMEOUT.as_secs(),
"Block verify should allow for a block timeout, a sync restart, and some block fetches"
);
// The minimum recommended network speed for Zebra, in bytes per second.
const MIN_NETWORK_SPEED_BYTES_PER_SEC: u64 = 10 * 1024 * 1024 / 8;
// This constraint avoids spurious failures when restarting large checkpoints
assert!(
BLOCK_VERIFY_TIMEOUT.as_secs() > SYNC_RESTART_DELAY.as_secs() + 2 * zebra_consensus::MAX_CHECKPOINT_BYTE_COUNT / MIN_NETWORK_SPEED_BYTES_PER_SEC,
"Block verify should allow for a full checkpoint download, a sync restart, then a full checkpoint re-download"
);
// This constraint avoids spurious failures after checkpointing has finished
assert!(
BLOCK_VERIFY_TIMEOUT.as_secs()
> 2 * NetworkUpgrade::Blossom.target_spacing().num_seconds() as u64,
"Block verify should allow for at least one new block to be generated and distributed"
);
}
}