Add some additional sync correctness constraints

And adjust the sync restart delay as a consequence.
2021-01-13 19:10:48 +10:00 · 2021-01-13 19:10:48 +10:00 · 3699bbdae6
parent cef0a492d8
commit 3699bbdae6
4 changed files with 61 additions and 14 deletions
--- a/zebra-consensus/src/checkpoint.rs
+++ b/zebra-consensus/src/checkpoint.rs
@ -88,6 +88,14 @@ pub const MAX_QUEUED_BLOCKS_PER_HEIGHT: usize = 4;
 /// bugs. So the most efficient gap is slightly less than 500 blocks.
 pub const MAX_CHECKPOINT_HEIGHT_GAP: usize = 400;

+/// We limit the memory usage and download contention for each checkpoint,
+/// based on the cumulative size of the serialized blocks in the chain.
+///
+/// Deserialized blocks (in memory) are slightly larger than serialized blocks
+/// (on the network or disk). But they should be within a constant factor of the
+/// serialized size.
+pub const MAX_CHECKPOINT_BYTE_COUNT: u64 = 32 * 1024 * 1024;
+
 /// A checkpointing block verifier.
 ///
 /// Verifies blocks using a supplied list of checkpoints. There must be at
--- a/zebra-consensus/src/lib.rs
+++ b/zebra-consensus/src/lib.rs
@ -53,6 +53,7 @@ mod transaction;
 pub mod chain;
 pub mod error;

+pub use checkpoint::MAX_CHECKPOINT_BYTE_COUNT;
 pub use checkpoint::MAX_CHECKPOINT_HEIGHT_GAP;
 pub use config::Config;

--- a/zebra-utils/src/bin/zebra-checkpoints/main.rs
+++ b/zebra-utils/src/bin/zebra-checkpoints/main.rs
@ -24,12 +24,6 @@ use std::os::unix::process::ExitStatusExt;

 mod args;

-/// We limit the memory usage for each checkpoint, based on the cumulative size of
-/// the serialized blocks in the chain. Deserialized blocks are slightly larger
-/// than serialized blocks, but they should be within a constant factor of the
-/// serialized size.
-const MAX_CHECKPOINT_BYTE_COUNT: u64 = 32 * 1024 * 1024;
-
 /// Initialise tracing using its defaults.
 fn init_tracing() {
    tracing_subscriber::Registry::default()
@ -135,7 +129,7 @@ fn main() -> Result<()> {

        // check if checkpoint
        if height == block::Height(0)
-            || cumulative_bytes >= MAX_CHECKPOINT_BYTE_COUNT
+            || cumulative_bytes >= zebra_consensus::MAX_CHECKPOINT_BYTE_COUNT
            || height_gap.0 >= zebra_consensus::MAX_CHECKPOINT_HEIGHT_GAP as u32
        {
            // print to output
--- a/zebrad/src/components/sync.rs
+++ b/zebrad/src/components/sync.rs
@ -88,16 +88,22 @@ const BLOCK_VERIFY_TIMEOUT: Duration = Duration::from_secs(180);

 /// Controls how long we wait to restart syncing after finishing a sync run.
 ///
-/// This timeout should be long enough to:
+/// This delay should be long enough to:
 ///   - allow zcashd peers to process pending requests. If the node only has a
 ///     few peers, we want to clear as much peer state as possible. In
 ///     particular, zcashd sends "next block range" hints, based on zcashd's
 ///     internal model of our sync progress. But we want to discard these hints,
 ///     so they don't get confused with ObtainTips and ExtendTips responses.
 ///
-/// This timeout is particularly important on instances with slow or unreliable
+/// This delay is particularly important on instances with slow or unreliable
 /// networks, and on testnet, which has a small number of slow peers.
-const SYNC_RESTART_TIMEOUT: Duration = Duration::from_secs(45);
+///
+/// ## Correctness
+///
+/// If this delay is removed (or set too low), the syncer will
+/// sometimes get stuck in a failure loop, due to leftover downloads from
+/// previous sync runs.
+const SYNC_RESTART_DELAY: Duration = Duration::from_secs(61);

 type BoxError = Box<dyn std::error::Error + Send + Sync + 'static>;

@ -178,7 +184,7 @@ where
            AlwaysHedge,
            20,
            0.95,
-            2 * SYNC_RESTART_TIMEOUT,
+            2 * SYNC_RESTART_DELAY,
        );
        // We apply a timeout to the verifier to avoid hangs due to missing earlier blocks.
        let verifier = Timeout::new(verifier, BLOCK_VERIFY_TIMEOUT);
@ -204,11 +210,11 @@ where

        'sync: loop {
            if started_once {
-                tracing::info!(timeout = ?SYNC_RESTART_TIMEOUT, "waiting to restart sync");
+                tracing::info!(timeout = ?SYNC_RESTART_DELAY, "waiting to restart sync");
                self.prospective_tips = HashSet::new();
                self.downloads.cancel_all();
                self.update_metrics();
-                sleep(SYNC_RESTART_TIMEOUT).await;
+                sleep(SYNC_RESTART_DELAY).await;
            } else {
                started_once = true;
            }
@ -613,6 +619,8 @@ where

 #[cfg(test)]
 mod test {
+    use zebra_chain::parameters::NetworkUpgrade;
+
    use super::*;

    /// Make sure the timeout values are consistent with each other.
@ -620,9 +628,45 @@ mod test {
    fn ensure_timeouts_consistent() {
        zebra_test::init();

+        // This constraint clears the download pipeline during a restart
        assert!(
-            BLOCK_DOWNLOAD_TIMEOUT.as_secs() * 2 < SYNC_RESTART_TIMEOUT.as_secs(),
+            SYNC_RESTART_DELAY.as_secs() > 2 * BLOCK_DOWNLOAD_TIMEOUT.as_secs(),
            "Sync restart should allow for pending and buffered requests to complete"
        );
+
+        // This constraint avoids spurious failures due to block retries timing out.
+        // We multiply by 2, because the Hedge can wait up to BLOCK_DOWNLOAD_TIMEOUT
+        // seconds before retrying.
+        const BLOCK_DOWNLOAD_HEDGE_TIMEOUT: u64 =
+            2 * BLOCK_DOWNLOAD_RETRY_LIMIT as u64 * BLOCK_DOWNLOAD_TIMEOUT.as_secs();
+        assert!(
+            SYNC_RESTART_DELAY.as_secs() > BLOCK_DOWNLOAD_HEDGE_TIMEOUT,
+            "Sync restart should allow for block downloads to time out on every retry"
+        );
+
+        // This constraint avoids spurious failures due to block download timeouts
+        assert!(
+            BLOCK_VERIFY_TIMEOUT.as_secs()
+                > SYNC_RESTART_DELAY.as_secs()
+                    + BLOCK_DOWNLOAD_HEDGE_TIMEOUT
+                    + BLOCK_DOWNLOAD_TIMEOUT.as_secs(),
+            "Block verify should allow for a block timeout, a sync restart, and some block fetches"
+        );
+
+        // The minimum recommended network speed for Zebra, in bytes per second.
+        const MIN_NETWORK_SPEED_BYTES_PER_SEC: u64 = 10 * 1024 * 1024 / 8;
+
+        // This constraint avoids spurious failures when restarting large checkpoints
+        assert!(
+            BLOCK_VERIFY_TIMEOUT.as_secs() > SYNC_RESTART_DELAY.as_secs() + 2 * zebra_consensus::MAX_CHECKPOINT_BYTE_COUNT / MIN_NETWORK_SPEED_BYTES_PER_SEC,
+            "Block verify should allow for a full checkpoint download, a sync restart, then a full checkpoint re-download"
+        );
+
+        // This constraint avoids spurious failures after checkpointing has finished
+        assert!(
+            BLOCK_VERIFY_TIMEOUT.as_secs()
+                > 2 * NetworkUpgrade::Blossom.target_spacing().num_seconds() as u64,
+            "Block verify should allow for at least one new block to be generated and distributed"
+        );
    }
 }