Rate limit initial genesis block download retries, Credit: Equilibrium (#2255)

* implement and test a rate limit in `request_genesis()` * add `request_genesis_is_rate_limited` test to sync * add ensure_timeouts constraint for GENESIS_TIMEOUT_RETRY * Suppress expected warning logs in zebrad tests Co-authored-by: teor <teor@riseup.net>
2021-06-09 20:39:51 -03:00 · 2021-06-09 20:39:51 -03:00 · 96a1b661f0
parent 53e50821ec
commit 96a1b661f0
4 changed files with 167 additions and 55 deletions
--- a/zebra-test/src/lib.rs
+++ b/zebra-test/src/lib.rs
@ -37,6 +37,7 @@ pub fn init() {
            EnvFilter::try_new("warn")
                .unwrap()
                .add_directive("zebra_consensus=error".parse().unwrap())
+                .add_directive("zebrad=error".parse().unwrap())
        });

        tracing_subscriber::registry()
--- a/zebrad/src/components/sync.rs
+++ b/zebrad/src/components/sync.rs
@ -21,6 +21,9 @@ use zebra_state as zs;
 use crate::{config::ZebradConfig, BoxError};

 mod downloads;
+#[cfg(test)]
+mod tests;
+
 use downloads::{AlwaysHedge, Downloads};

 /// Controls the number of peers used for each ObtainTips and ExtendTips request.
@ -130,6 +133,18 @@ pub(super) const BLOCK_VERIFY_TIMEOUT: Duration = Duration::from_secs(180);
 /// previous sync runs.
 const SYNC_RESTART_DELAY: Duration = Duration::from_secs(61);

+/// Controls how long we wait to retry a failed attempt to download
+/// and verify the genesis block.
+///
+/// This timeout gives the crawler time to find better peers.
+///
+/// ## Security
+///
+/// If this timeout is removed (or set too low), Zebra will immediately retry
+/// to download and verify the genesis block from its peers. This can cause
+/// a denial of service on those peers.
+const GENESIS_TIMEOUT_RETRY: Duration = Duration::from_secs(5);
+
 /// Helps work around defects in the bitcoin protocol by checking whether
 /// the returned hashes actually extend a chain tip.
 #[derive(Copy, Clone, Debug, Hash, PartialEq, Eq)]
@ -615,7 +630,8 @@ where
            match self.downloads.next().await.expect("downloads is nonempty") {
                Ok(hash) => tracing::trace!(?hash, "verified and committed block to state"),
                Err(e) => {
-                    tracing::warn!(?e, "could not download or verify genesis block, retrying")
+                    tracing::warn!(?e, "could not download or verify genesis block, retrying");
+                    tokio::time::sleep(GENESIS_TIMEOUT_RETRY).await;
                }
            }
        }
@ -665,57 +681,3 @@ where
        );
    }
 }
-
-#[cfg(test)]
-mod test {
-    use zebra_chain::parameters::NetworkUpgrade;
-
-    use super::*;
-
-    /// Make sure the timeout values are consistent with each other.
-    #[test]
-    fn ensure_timeouts_consistent() {
-        zebra_test::init();
-
-        // This constraint clears the download pipeline during a restart
-        assert!(
-            SYNC_RESTART_DELAY.as_secs() > 2 * BLOCK_DOWNLOAD_TIMEOUT.as_secs(),
-            "Sync restart should allow for pending and buffered requests to complete"
-        );
-
-        // This constraint avoids spurious failures due to block retries timing out.
-        // We multiply by 2, because the Hedge can wait up to BLOCK_DOWNLOAD_TIMEOUT
-        // seconds before retrying.
-        const BLOCK_DOWNLOAD_HEDGE_TIMEOUT: u64 =
-            2 * BLOCK_DOWNLOAD_RETRY_LIMIT as u64 * BLOCK_DOWNLOAD_TIMEOUT.as_secs();
-        assert!(
-            SYNC_RESTART_DELAY.as_secs() > BLOCK_DOWNLOAD_HEDGE_TIMEOUT,
-            "Sync restart should allow for block downloads to time out on every retry"
-        );
-
-        // This constraint avoids spurious failures due to block download timeouts
-        assert!(
-            BLOCK_VERIFY_TIMEOUT.as_secs()
-                > SYNC_RESTART_DELAY.as_secs()
-                    + BLOCK_DOWNLOAD_HEDGE_TIMEOUT
-                    + BLOCK_DOWNLOAD_TIMEOUT.as_secs(),
-            "Block verify should allow for a block timeout, a sync restart, and some block fetches"
-        );
-
-        // The minimum recommended network speed for Zebra, in bytes per second.
-        const MIN_NETWORK_SPEED_BYTES_PER_SEC: u64 = 10 * 1024 * 1024 / 8;
-
-        // This constraint avoids spurious failures when restarting large checkpoints
-        assert!(
-            BLOCK_VERIFY_TIMEOUT.as_secs() > SYNC_RESTART_DELAY.as_secs() + 2 * zebra_consensus::MAX_CHECKPOINT_BYTE_COUNT / MIN_NETWORK_SPEED_BYTES_PER_SEC,
-            "Block verify should allow for a full checkpoint download, a sync restart, then a full checkpoint re-download"
-        );
-
-        // This constraint avoids spurious failures after checkpointing has finished
-        assert!(
-            BLOCK_VERIFY_TIMEOUT.as_secs()
-                > 2 * NetworkUpgrade::Blossom.target_spacing().num_seconds() as u64,
-            "Block verify should allow for at least one new block to be generated and distributed"
-        );
-    }
-}
--- a/zebrad/src/components/sync/tests.rs
+++ b/zebrad/src/components/sync/tests.rs
@ -0,0 +1 @@
+mod timing;
--- a/zebrad/src/components/sync/tests/timing.rs
+++ b/zebrad/src/components/sync/tests/timing.rs
@ -0,0 +1,148 @@
+use futures::future;
+use std::sync::{
+    atomic::{AtomicU8, Ordering},
+    Arc,
+};
+use tokio::{
+    runtime::Runtime,
+    time::{timeout, Duration},
+};
+
+use super::super::*;
+use crate::config::ZebradConfig;
+
+/// Make sure the timeout values are consistent with each other.
+#[test]
+fn ensure_timeouts_consistent() {
+    zebra_test::init();
+
+    // This constraint clears the download pipeline during a restart
+    assert!(
+        SYNC_RESTART_DELAY.as_secs() > 2 * BLOCK_DOWNLOAD_TIMEOUT.as_secs(),
+        "Sync restart should allow for pending and buffered requests to complete"
+    );
+
+    // This constraint avoids spurious failures due to block retries timing out.
+    // We multiply by 2, because the Hedge can wait up to BLOCK_DOWNLOAD_TIMEOUT
+    // seconds before retrying.
+    const BLOCK_DOWNLOAD_HEDGE_TIMEOUT: u64 =
+        2 * BLOCK_DOWNLOAD_RETRY_LIMIT as u64 * BLOCK_DOWNLOAD_TIMEOUT.as_secs();
+    assert!(
+        SYNC_RESTART_DELAY.as_secs() > BLOCK_DOWNLOAD_HEDGE_TIMEOUT,
+        "Sync restart should allow for block downloads to time out on every retry"
+    );
+
+    // This constraint avoids spurious failures due to block download timeouts
+    assert!(
+        BLOCK_VERIFY_TIMEOUT.as_secs()
+            > SYNC_RESTART_DELAY.as_secs()
+                + BLOCK_DOWNLOAD_HEDGE_TIMEOUT
+                + BLOCK_DOWNLOAD_TIMEOUT.as_secs(),
+        "Block verify should allow for a block timeout, a sync restart, and some block fetches"
+    );
+
+    // The minimum recommended network speed for Zebra, in bytes per second.
+    const MIN_NETWORK_SPEED_BYTES_PER_SEC: u64 = 10 * 1024 * 1024 / 8;
+
+    // This constraint avoids spurious failures when restarting large checkpoints
+    assert!(
+       BLOCK_VERIFY_TIMEOUT.as_secs() > SYNC_RESTART_DELAY.as_secs() + 2 * zebra_consensus::MAX_CHECKPOINT_BYTE_COUNT / MIN_NETWORK_SPEED_BYTES_PER_SEC,
+       "Block verify should allow for a full checkpoint download, a sync restart, then a full checkpoint re-download"
+    );
+
+    // This constraint avoids spurious failures after checkpointing has finished
+    assert!(
+        BLOCK_VERIFY_TIMEOUT.as_secs()
+            > 2 * zebra_chain::parameters::NetworkUpgrade::Blossom
+                .target_spacing()
+                .num_seconds() as u64,
+        "Block verify should allow for at least one new block to be generated and distributed"
+    );
+
+    // This constraint makes genesis retries more likely to succeed
+    assert!(
+        GENESIS_TIMEOUT_RETRY.as_secs() > zebra_network::constants::HANDSHAKE_TIMEOUT.as_secs()
+            && GENESIS_TIMEOUT_RETRY.as_secs() < BLOCK_DOWNLOAD_TIMEOUT.as_secs(),
+        "Genesis retries should wait for new peers, but they shouldn't wait too long"
+    );
+}
+
+/// Test that calls to [`ChainSync::request_genesis`] are rate limited.
+#[test]
+fn request_genesis_is_rate_limited() {
+    zebra_test::init();
+
+    // The number of calls to `request_genesis()` we are going to be testing for
+    const RETRIES_TO_RUN: u8 = 3;
+
+    // create some counters that will be updated inside async blocks
+    let peer_requests_counter = Arc::new(AtomicU8::new(0));
+    let peer_requests_counter_in_service = Arc::clone(&peer_requests_counter);
+    let state_requests_counter = Arc::new(AtomicU8::new(0));
+    let state_requests_counter_in_service = Arc::clone(&state_requests_counter);
+
+    let runtime = Runtime::new().expect("Failed to create Tokio runtime");
+    let _guard = runtime.enter();
+
+    // create a fake peer service that respond with `Error` to `BlocksByHash` or
+    // panic in any other type of request.
+    let peer_service = tower::service_fn(move |request| {
+        match request {
+            zebra_network::Request::BlocksByHash(_) => {
+                // Track the call
+                peer_requests_counter_in_service.fetch_add(1, Ordering::SeqCst);
+                // Respond with `Error`
+                future::err("block not found".into())
+            }
+            _ => unreachable!("no other request is allowed"),
+        }
+    });
+
+    // create a state service that respond with `None` to `Depth` or
+    // panic in any other type of request.
+    let state_service = tower::service_fn(move |request| {
+        match request {
+            zebra_state::Request::Depth(_) => {
+                // Track the call
+                state_requests_counter_in_service.fetch_add(1, Ordering::SeqCst);
+                // Respond with `None`
+                future::ok(zebra_state::Response::Depth(None))
+            }
+            _ => unreachable!("no other request is allowed"),
+        }
+    });
+
+    // create a verifier service that will always panic as it will never be called
+    let verifier_service =
+        tower::service_fn(
+            move |_| async move { unreachable!("no request to this service is allowed") },
+        );
+
+    // start the sync
+    let mut chain_sync = ChainSync::new(
+        &ZebradConfig::default(),
+        peer_service,
+        state_service,
+        verifier_service,
+    );
+
+    // run `request_genesis()` with a timeout of 13 seconds
+    runtime.block_on(async move {
+        // allow extra wall clock time for tests on CPU-bound machines
+        let retries_timeout = (RETRIES_TO_RUN - 1) as u64 * GENESIS_TIMEOUT_RETRY.as_secs()
+            + GENESIS_TIMEOUT_RETRY.as_secs() / 2;
+        let _ = timeout(
+            Duration::from_secs(retries_timeout),
+            chain_sync.request_genesis(),
+        )
+        .await;
+    });
+
+    let peer_requests_counter = peer_requests_counter.load(Ordering::SeqCst);
+    assert!(peer_requests_counter >= RETRIES_TO_RUN);
+    assert!(peer_requests_counter <= RETRIES_TO_RUN * (BLOCK_DOWNLOAD_RETRY_LIMIT as u8) * 2);
+    assert_eq!(
+        state_requests_counter.load(Ordering::SeqCst),
+        RETRIES_TO_RUN
+    );
+}