fix(net): Fix potential network hangs, and reduce code complexity (#7859)

* Refactor return type of poll_discover() * Simplify poll_ready() by removing preselected peers * Fix peer set readiness check * Pass task context correctly to background tasks * Make poll_discover() return Pending * Make poll_inventory() return Pending * Make poll_unready() return Poll::Pending * Simplify with futures::ready!() and ? * When there are no peers, wake on newly ready peers, or new peers, in that order * Preserve the original waker when there are no unready peers * Fix polling docs and remove unnecessary code * Make sure we're ignoring Poll::Pending not Result * Make panic checking method names clearer * Fix connection Client task wakeups and error handling * Cleanup connection panic handling and add wakeup docs * Fix connection client task wakeups to prevent hangs * Simplify error and pending handling * Clarify inventory set behaviour * Define peer set poll_* methods so they return Ok if they do something * Clarify documentation Co-authored-by: Arya <aryasolhi@gmail.com> * Fix test that depended on preselected peers * Check ready peers for errors before sending requests to them * Fix a hanging test by not waiting for irrelevant actions * Only remove cancel handles when they are required * fix incorrect panic on termination setting * Clarify method comments Co-authored-by: Arya <aryasolhi@gmail.com> --------- Co-authored-by: Arya <aryasolhi@gmail.com>
2023-11-17 05:53:24 +10:00 · 2023-11-17 05:53:24 +10:00 · d689e7344b
parent a22c8d5f42
commit d689e7344b
12 changed files with 593 additions and 272 deletions
--- a/zebra-chain/src/diagnostic/task.rs
+++ b/zebra-chain/src/diagnostic/task.rs
@ -9,7 +9,7 @@ pub mod future;
 pub mod thread;

 /// A trait that checks a task's return value for panics.
-pub trait CheckForPanics {
+pub trait CheckForPanics: Sized {
    /// The output type, after removing panics from `Self`.
    type Output;

@ -20,11 +20,34 @@ pub trait CheckForPanics {
    ///
    /// If `self` contains a panic payload or an unexpected termination.
    #[track_caller]
-    fn check_for_panics(self) -> Self::Output;
+    fn panic_if_task_has_finished(self) -> Self::Output {
+        self.check_for_panics_with(true)
+    }
+
+    /// Check if `self` contains a panic payload, then panic.
+    /// Otherwise, return the non-panic part of `self`.
+    ///
+    /// # Panics
+    ///
+    /// If `self` contains a panic payload.
+    #[track_caller]
+    fn panic_if_task_has_panicked(self) -> Self::Output {
+        self.check_for_panics_with(false)
+    }
+
+    /// Check if `self` contains a panic payload, then panic. Also panics if
+    /// `panic_on_unexpected_termination` is true, and `self` is an unexpected termination.
+    /// Otherwise, return the non-panic part of `self`.
+    ///
+    /// # Panics
+    ///
+    /// If `self` contains a panic payload, or if we're panicking on unexpected terminations.
+    #[track_caller]
+    fn check_for_panics_with(self, panic_on_unexpected_termination: bool) -> Self::Output;
 }

 /// A trait that waits for a task to finish, then handles panics and cancellations.
-pub trait WaitForPanics {
+pub trait WaitForPanics: Sized {
    /// The underlying task output, after removing panics and unwrapping termination results.
    type Output;

@ -43,5 +66,44 @@ pub trait WaitForPanics {
    ///
    /// If `self` contains an expected termination, and we're shutting down anyway.
    #[track_caller]
-    fn wait_for_panics(self) -> Self::Output;
+    fn wait_and_panic_on_unexpected_termination(self) -> Self::Output {
+        self.wait_for_panics_with(true)
+    }
+
+    /// Waits for `self` to finish, then check if its output is:
+    /// - a panic payload: resume that panic,
+    /// - a task termination: hang waiting for shutdown.
+    ///
+    /// Otherwise, returns the task return value of `self`.
+    ///
+    /// # Panics
+    ///
+    /// If `self` contains a panic payload.
+    ///
+    /// # Hangs
+    ///
+    /// If `self` contains a task termination.
+    #[track_caller]
+    fn wait_for_panics(self) -> Self::Output {
+        self.wait_for_panics_with(false)
+    }
+
+    /// Waits for `self` to finish, then check if its output is:
+    /// - a panic payload: resume that panic,
+    /// - an unexpected termination:
+    ///   - if `panic_on_unexpected_termination` is true, panic with that error,
+    ///   - otherwise, hang waiting for shutdown,
+    /// - an expected termination: hang waiting for shutdown.
+    ///
+    /// Otherwise, returns the task return value of `self`.
+    ///
+    /// # Panics
+    ///
+    /// If `self` contains a panic payload, or if we're panicking on unexpected terminations.
+    ///
+    /// # Hangs
+    ///
+    /// If `self` contains an expected or ignored termination, and we're shutting down anyway.
+    #[track_caller]
+    fn wait_for_panics_with(self, panic_on_unexpected_termination: bool) -> Self::Output;
 }
--- a/zebra-chain/src/diagnostic/task/future.rs
+++ b/zebra-chain/src/diagnostic/task/future.rs
@ -18,15 +18,18 @@ impl<T> CheckForPanics for Result<T, JoinError> {
    type Output = Result<T, JoinError>;

    /// Returns the task result if the task finished normally.
-    /// Otherwise, resumes any panics, logs unexpected errors, and ignores any expected errors.
+    /// Otherwise, resumes any panics, and ignores any expected errors.
+    /// Handles unexpected errors based on `panic_on_unexpected_termination`.
    ///
    /// If the task finished normally, returns `Some(T)`.
    /// If the task was cancelled, returns `None`.
    #[track_caller]
-    fn check_for_panics(self) -> Self::Output {
+    fn check_for_panics_with(self, panic_on_unexpected_termination: bool) -> Self::Output {
        match self {
            Ok(task_output) => Ok(task_output),
-            Err(join_error) => Err(join_error.check_for_panics()),
+            Err(join_error) => {
+                Err(join_error.check_for_panics_with(panic_on_unexpected_termination))
+            }
        }
    }
 }
@ -38,22 +41,26 @@ impl CheckForPanics for JoinError {
    /// Resume any panics and panic on unexpected task cancellations.
    /// Always returns [`JoinError::Cancelled`](JoinError::is_cancelled).
    #[track_caller]
-    fn check_for_panics(self) -> Self::Output {
+    fn check_for_panics_with(self, panic_on_unexpected_termination: bool) -> Self::Output {
        match self.try_into_panic() {
            Ok(panic_payload) => panic::resume_unwind(panic_payload),

            // We could ignore this error, but then we'd have to change the return type.
-            Err(task_cancelled) if is_shutting_down() => {
-                debug!(
-                    ?task_cancelled,
-                    "ignoring cancelled task because Zebra is shutting down"
-                );
-
-                task_cancelled
-            }
-
            Err(task_cancelled) => {
-                panic!("task cancelled during normal Zebra operation: {task_cancelled:?}");
+                if !panic_on_unexpected_termination {
+                    debug!(?task_cancelled, "ignoring expected task termination");
+
+                    task_cancelled
+                } else if is_shutting_down() {
+                    debug!(
+                        ?task_cancelled,
+                        "ignoring task termination because Zebra is shutting down"
+                    );
+
+                    task_cancelled
+                } else {
+                    panic!("task unexpectedly exited with: {:?}", task_cancelled)
+                }
            }
        }
    }
@ -67,7 +74,9 @@ where

    /// Returns a future which waits for `self` to finish, then checks if its output is:
    /// - a panic payload: resume that panic,
-    /// - an unexpected termination: panic with that error,
+    /// - an unexpected termination:
+    ///   - if `panic_on_unexpected_termination` is true, panic with that error,
+    ///   - otherwise, hang waiting for shutdown,
    /// - an expected termination: hang waiting for shutdown.
    ///
    /// Otherwise, returns the task return value of `self`.
@ -79,11 +88,15 @@ where
    /// # Hangs
    ///
    /// If `self` contains an expected termination, and we're shutting down anyway.
+    /// If we're ignoring terminations because `panic_on_unexpected_termination` is `false`.
    /// Futures hang by returning `Pending` and not setting a waker, so this uses minimal resources.
    #[track_caller]
-    fn wait_for_panics(self) -> Self::Output {
+    fn wait_for_panics_with(self, panic_on_unexpected_termination: bool) -> Self::Output {
        async move {
-            match self.await.check_for_panics() {
+            match self
+                .await
+                .check_for_panics_with(panic_on_unexpected_termination)
+            {
                Ok(task_output) => task_output,
                Err(_expected_cancel_error) => future::pending().await,
            }
--- a/zebra-chain/src/diagnostic/task/thread.rs
+++ b/zebra-chain/src/diagnostic/task/thread.rs
@ -8,19 +8,44 @@ use std::{
    thread::{self, JoinHandle},
 };

+use crate::shutdown::is_shutting_down;
+
 use super::{CheckForPanics, WaitForPanics};

-impl<T> CheckForPanics for thread::Result<T> {
+impl<T> CheckForPanics for thread::Result<T>
+where
+    T: std::fmt::Debug,
+{
    type Output = T;

-    /// Panics if the thread panicked.
+    /// # Panics
+    ///
+    /// - if the thread panicked.
+    /// - if the thread is cancelled, `panic_on_unexpected_termination` is true, and
+    ///   Zebra is not shutting down.
    ///
    /// Threads can't be cancelled except by using a panic, so there are no thread errors here.
+    /// `panic_on_unexpected_termination` is
    #[track_caller]
-    fn check_for_panics(self) -> Self::Output {
+    fn check_for_panics_with(self, panic_on_unexpected_termination: bool) -> Self::Output {
        match self {
            // The value returned by the thread when it finished.
-            Ok(thread_output) => thread_output,
+            Ok(thread_output) => {
+                if !panic_on_unexpected_termination {
+                    debug!(?thread_output, "ignoring expected thread exit");
+
+                    thread_output
+                } else if is_shutting_down() {
+                    debug!(
+                        ?thread_output,
+                        "ignoring thread exit because Zebra is shutting down"
+                    );
+
+                    thread_output
+                } else {
+                    panic!("thread unexpectedly exited with: {:?}", thread_output)
+                }
+            }

            // A thread error is always a panic.
            Err(panic_payload) => panic::resume_unwind(panic_payload),
@ -28,17 +53,24 @@ impl<T> CheckForPanics for thread::Result<T> {
    }
 }

-impl<T> WaitForPanics for JoinHandle<T> {
+impl<T> WaitForPanics for JoinHandle<T>
+where
+    T: std::fmt::Debug,
+{
    type Output = T;

    /// Waits for the thread to finish, then panics if the thread panicked.
    #[track_caller]
-    fn wait_for_panics(self) -> Self::Output {
-        self.join().check_for_panics()
+    fn wait_for_panics_with(self, panic_on_unexpected_termination: bool) -> Self::Output {
+        self.join()
+            .check_for_panics_with(panic_on_unexpected_termination)
    }
 }

-impl<T> WaitForPanics for Arc<JoinHandle<T>> {
+impl<T> WaitForPanics for Arc<JoinHandle<T>>
+where
+    T: std::fmt::Debug,
+{
    type Output = Option<T>;

    /// If this is the final `Arc`, waits for the thread to finish, then panics if the thread
@ -46,7 +78,7 @@ impl<T> WaitForPanics for Arc<JoinHandle<T>> {
    ///
    /// If this is not the final `Arc`, drops the handle and immediately returns `None`.
    #[track_caller]
-    fn wait_for_panics(self) -> Self::Output {
+    fn wait_for_panics_with(self, panic_on_unexpected_termination: bool) -> Self::Output {
        // If we are the last Arc with a reference to this handle,
        // we can wait for it and propagate any panics.
        //
@ -56,14 +88,17 @@ impl<T> WaitForPanics for Arc<JoinHandle<T>> {
        // This is more readable as an expanded statement.
        #[allow(clippy::manual_map)]
        if let Some(handle) = Arc::into_inner(self) {
-            Some(handle.wait_for_panics())
+            Some(handle.wait_for_panics_with(panic_on_unexpected_termination))
        } else {
            None
        }
    }
 }

-impl<T> CheckForPanics for &mut Option<Arc<JoinHandle<T>>> {
+impl<T> CheckForPanics for &mut Option<Arc<JoinHandle<T>>>
+where
+    T: std::fmt::Debug,
+{
    type Output = Option<T>;

    /// If this is the final `Arc`, checks if the thread has finished, then panics if the thread
@ -71,14 +106,14 @@ impl<T> CheckForPanics for &mut Option<Arc<JoinHandle<T>>> {
    ///
    /// If the thread has not finished, or this is not the final `Arc`, returns `None`.
    #[track_caller]
-    fn check_for_panics(self) -> Self::Output {
+    fn check_for_panics_with(self, panic_on_unexpected_termination: bool) -> Self::Output {
        let handle = self.take()?;

        if handle.is_finished() {
            // This is the same as calling `self.wait_for_panics()`, but we can't do that,
            // because we've taken `self`.
            #[allow(clippy::manual_map)]
-            return handle.wait_for_panics();
+            return handle.wait_for_panics_with(panic_on_unexpected_termination);
        }

        *self = Some(handle);
@ -87,7 +122,10 @@ impl<T> CheckForPanics for &mut Option<Arc<JoinHandle<T>>> {
    }
 }

-impl<T> WaitForPanics for &mut Option<Arc<JoinHandle<T>>> {
+impl<T> WaitForPanics for &mut Option<Arc<JoinHandle<T>>>
+where
+    T: std::fmt::Debug,
+{
    type Output = Option<T>;

    /// If this is the final `Arc`, waits for the thread to finish, then panics if the thread
@ -95,10 +133,13 @@ impl<T> WaitForPanics for &mut Option<Arc<JoinHandle<T>>> {
    ///
    /// If this is not the final `Arc`, drops the handle and returns `None`.
    #[track_caller]
-    fn wait_for_panics(self) -> Self::Output {
+    fn wait_for_panics_with(self, panic_on_unexpected_termination: bool) -> Self::Output {
        // This is more readable as an expanded statement.
        #[allow(clippy::manual_map)]
-        if let Some(output) = self.take()?.wait_for_panics() {
+        if let Some(output) = self
+            .take()?
+            .wait_for_panics_with(panic_on_unexpected_termination)
+        {
            Some(output)
        } else {
            // Some other task has a reference, so we should give up ours to let them use it.
--- a/zebra-network/src/peer/client.rs
+++ b/zebra-network/src/peer/client.rs
@ -18,6 +18,8 @@ use futures::{
 use tokio::{sync::broadcast, task::JoinHandle};
 use tower::Service;

+use zebra_chain::diagnostic::task::CheckForPanics;
+
 use crate::{
    peer::{
        error::{AlreadyErrored, ErrorSlot, PeerError, SharedPeerError},
@ -421,8 +423,15 @@ impl MissingInventoryCollector {

 impl Client {
    /// Check if this connection's heartbeat task has exited.
+    ///
+    /// Returns an error if the heartbeat task exited. Otherwise, schedules the client task for
+    /// wakeup when the heartbeat task finishes, or the channel closes, and returns `Pending`.
+    ///
+    /// # Panics
+    ///
+    /// If the heartbeat task panicked.
    #[allow(clippy::unwrap_in_result)]
-    fn check_heartbeat(&mut self, cx: &mut Context<'_>) -> Result<(), SharedPeerError> {
+    fn poll_heartbeat(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), SharedPeerError>> {
        let is_canceled = self
            .shutdown_tx
            .as_mut()
@ -430,17 +439,19 @@ impl Client {
            .poll_canceled(cx)
            .is_ready();

-        if is_canceled {
-            return self.set_task_exited_error(
-                "heartbeat",
-                PeerError::HeartbeatTaskExited("Task was cancelled".to_string()),
-            );
-        }
-
-        match self.heartbeat_task.poll_unpin(cx) {
+        let result = match self.heartbeat_task.poll_unpin(cx) {
            Poll::Pending => {
-                // Heartbeat task is still running.
-                Ok(())
+                // The heartbeat task returns `Pending` while it continues to run.
+                // But if it has dropped its receiver, it is shutting down, and we should also shut down.
+                if is_canceled {
+                    self.set_task_exited_error(
+                        "heartbeat",
+                        PeerError::HeartbeatTaskExited("Task was cancelled".to_string()),
+                    )
+                } else {
+                    // Heartbeat task is still running.
+                    return Poll::Pending;
+                }
            }
            Poll::Ready(Ok(Ok(_))) => {
                // Heartbeat task stopped unexpectedly, without panic or error.
@ -459,6 +470,9 @@ impl Client {
                )
            }
            Poll::Ready(Err(error)) => {
+                // Heartbeat task panicked.
+                let error = error.panic_if_task_has_panicked();
+
                // Heartbeat task was cancelled.
                if error.is_cancelled() {
                    self.set_task_exited_error(
@ -466,11 +480,7 @@ impl Client {
                        PeerError::HeartbeatTaskExited("Task was cancelled".to_string()),
                    )
                }
-                // Heartbeat task stopped with panic.
-                else if error.is_panic() {
-                    panic!("heartbeat task has panicked: {error}");
-                }
-                // Heartbeat task stopped with error.
+                // Heartbeat task stopped with another kind of task error.
                else {
                    self.set_task_exited_error(
                        "heartbeat",
@ -478,25 +488,48 @@ impl Client {
                    )
                }
            }
-        }
+        };
+
+        Poll::Ready(result)
    }

-    /// Check if the connection's task has exited.
-    fn check_connection(&mut self, context: &mut Context<'_>) -> Result<(), SharedPeerError> {
-        match self.connection_task.poll_unpin(context) {
-            Poll::Pending => {
-                // Connection task is still running.
-                Ok(())
-            }
-            Poll::Ready(Ok(())) => {
+    /// Check if the connection's request/response task has exited.
+    ///
+    /// Returns an error if the connection task exited. Otherwise, schedules the client task for
+    /// wakeup when the connection task finishes, and returns `Pending`.
+    ///
+    /// # Panics
+    ///
+    /// If the connection task panicked.
+    fn poll_connection(&mut self, context: &mut Context<'_>) -> Poll<Result<(), SharedPeerError>> {
+        // Return `Pending` if the connection task is still running.
+        let result = match ready!(self.connection_task.poll_unpin(context)) {
+            Ok(()) => {
                // Connection task stopped unexpectedly, without panicking.
                self.set_task_exited_error("connection", PeerError::ConnectionTaskExited)
            }
-            Poll::Ready(Err(error)) => {
-                // Connection task stopped unexpectedly with a panic.
-                panic!("connection task has panicked: {error}");
+            Err(error) => {
+                // Connection task panicked.
+                let error = error.panic_if_task_has_panicked();
+
+                // Connection task was cancelled.
+                if error.is_cancelled() {
+                    self.set_task_exited_error(
+                        "connection",
+                        PeerError::HeartbeatTaskExited("Task was cancelled".to_string()),
+                    )
+                }
+                // Connection task stopped with another kind of task error.
+                else {
+                    self.set_task_exited_error(
+                        "connection",
+                        PeerError::HeartbeatTaskExited(error.to_string()),
+                    )
+                }
            }
-        }
+        };
+
+        Poll::Ready(result)
    }

    /// Properly update the error slot after a background task has unexpectedly stopped.
@ -522,6 +555,9 @@ impl Client {
    }

    /// Poll for space in the shared request sender channel.
+    ///
+    /// Returns an error if the sender channel is closed. If there is no space in the channel,
+    /// returns `Pending`, and schedules the task for wakeup when there is space available.
    fn poll_request(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), SharedPeerError>> {
        let server_result = ready!(self.server_tx.poll_ready(cx));
        if server_result.is_err() {
@ -536,6 +572,33 @@ impl Client {
        }
    }

+    /// Poll for space in the shared request sender channel, and for errors in the connection tasks.
+    ///
+    /// Returns an error if the sender channel is closed, or the heartbeat or connection tasks have
+    /// terminated. If there is no space in the channel, returns `Pending`, and schedules the task
+    /// for wakeup when there is space available, or one of the tasks terminates.
+    fn poll_client(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), SharedPeerError>> {
+        // # Correctness
+        //
+        // The current task must be scheduled for wakeup every time we return
+        // `Poll::Pending`.
+        //
+        // `poll_heartbeat()` and `poll_connection()` schedule the client task for wakeup
+        // if either task exits, or if the heartbeat task drops the cancel handle.
+        //
+        //`ready!` returns `Poll::Pending` when `server_tx` is unready, and
+        // schedules this task for wakeup.
+        //
+        // It's ok to exit early and skip wakeups when there is an error, because the connection
+        // and its tasks are shut down immediately on error.
+
+        let _heartbeat_pending: Poll<()> = self.poll_heartbeat(cx)?;
+        let _connection_pending: Poll<()> = self.poll_connection(cx)?;
+
+        // We're only pending if the sender channel is full.
+        self.poll_request(cx)
+    }
+
    /// Shut down the resources held by the client half of this peer connection.
    ///
    /// Stops further requests to the remote peer, and stops the heartbeat task.
@ -566,20 +629,16 @@ impl Service<Request> for Client {
        // The current task must be scheduled for wakeup every time we return
        // `Poll::Pending`.
        //
-        // `check_heartbeat` and `check_connection` schedule the client task for wakeup
-        // if either task exits, or if the heartbeat task drops the cancel handle.
+        // `poll_client()` schedules the client task for wakeup if the sender channel has space,
+        // either connection task exits, or if the heartbeat task drops the cancel handle.
+
+        // Check all the tasks and channels.
        //
        //`ready!` returns `Poll::Pending` when `server_tx` is unready, and
        // schedules this task for wakeup.
+        let result = ready!(self.poll_client(cx));

-        let mut result = self
-            .check_heartbeat(cx)
-            .and_then(|()| self.check_connection(cx));
-
-        if result.is_ok() {
-            result = ready!(self.poll_request(cx));
-        }
-
+        // Shut down the client and connection if there is an error.
        if let Err(error) = result {
            self.shutdown();

--- a/zebra-network/src/peer/error.rs
+++ b/zebra-network/src/peer/error.rs
@ -82,6 +82,10 @@ pub enum PeerError {
    #[error("Internal services over capacity")]
    Overloaded,

+    /// There are no ready remote peers.
+    #[error("No ready peers available")]
+    NoReadyPeers,
+
    /// This peer request's caused an internal service timeout, so the connection was dropped
    /// to shed load or prevent attacks.
    #[error("Internal services timed out")]
@ -147,6 +151,7 @@ impl PeerError {
            PeerError::Serialization(inner) => format!("Serialization({inner})").into(),
            PeerError::DuplicateHandshake => "DuplicateHandshake".into(),
            PeerError::Overloaded => "Overloaded".into(),
+            PeerError::NoReadyPeers => "NoReadyPeers".into(),
            PeerError::InboundTimeout => "InboundTimeout".into(),
            PeerError::ServiceShutdown => "ServiceShutdown".into(),
            PeerError::NotFoundResponse(_) => "NotFoundResponse".into(),
--- a/zebra-network/src/peer/load_tracked_client.rs
+++ b/zebra-network/src/peer/load_tracked_client.rs
@ -20,6 +20,7 @@ use crate::{
 /// A client service wrapper that keeps track of its load.
 ///
 /// It also keeps track of the peer's reported protocol version.
+#[derive(Debug)]
 pub struct LoadTrackedClient {
    /// A service representing a connected peer, wrapped in a load tracker.
    service: PeakEwma<Client>,
--- a/zebra-network/src/peer_set/inventory_registry.rs
+++ b/zebra-network/src/peer_set/inventory_registry.rs
@ -275,22 +275,30 @@ impl InventoryRegistry {
        self.current.iter().chain(self.prev.iter())
    }

-    /// Returns a future that polls once for new registry updates.
+    /// Returns a future that waits for new registry updates.
    #[allow(dead_code)]
    pub fn update(&mut self) -> Update {
        Update::new(self)
    }

-    /// Drive periodic inventory tasks
+    /// Drive periodic inventory tasks.
    ///
-    /// # Details
+    /// Rotates the inventory HashMaps on every timer tick.
+    /// Drains the inv_stream channel and registers all advertised inventory.
    ///
-    /// - rotates HashMaps based on interval events
-    /// - drains the inv_stream channel and registers all advertised inventory
-    pub fn poll_inventory(&mut self, cx: &mut Context<'_>) -> Result<(), BoxError> {
+    /// Returns an error if the inventory channel is closed.
+    ///
+    /// Otherwise, returns `Ok` if it performed at least one update or rotation, or `Poll::Pending`
+    /// if there was no inventory change. Always registers a wakeup for the next inventory update
+    /// or rotation, even when it returns `Ok`.
+    pub fn poll_inventory(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), BoxError>> {
+        let mut result = Poll::Pending;
+
        // # Correctness
        //
        // Registers the current task for wakeup when the timer next becomes ready.
+        // (But doesn't return, because we also want to register the task for wakeup when more
+        // inventory arrives.)
        //
        // # Security
        //
@ -302,6 +310,7 @@ impl InventoryRegistry {
        // two interval ticks are delayed.
        if Pin::new(&mut self.interval).poll_next(cx).is_ready() {
            self.rotate();
+            result = Poll::Ready(Ok(()));
        }

        // This module uses a broadcast channel instead of an mpsc channel, even
@ -320,24 +329,38 @@ impl InventoryRegistry {
        // rather than propagating it through the peer set's Service::poll_ready
        // implementation, where reporting a failure means reporting a permanent
        // failure of the peer set.
-        while let Poll::Ready(channel_result) = self.inv_stream.next().poll_unpin(cx) {
+
+        // Returns Pending if all messages are processed, but the channel could get more.
+        loop {
+            let channel_result = self.inv_stream.next().poll_unpin(cx);
+
            match channel_result {
-                Some(Ok(change)) => self.register(change),
-                Some(Err(BroadcastStreamRecvError::Lagged(count))) => {
+                Poll::Ready(Some(Ok(change))) => {
+                    self.register(change);
+                    result = Poll::Ready(Ok(()));
+                }
+                Poll::Ready(Some(Err(BroadcastStreamRecvError::Lagged(count)))) => {
+                    // This isn't a fatal inventory error, it's expected behaviour when Zebra is
+                    // under load from peers.
                    metrics::counter!("pool.inventory.dropped", 1);
                    metrics::counter!("pool.inventory.dropped.messages", count);

-                    // If this message happens a lot, we should improve inventory registry performance,
-                    // or poll the registry or peer set in a separate task.
+                    // If this message happens a lot, we should improve inventory registry
+                    // performance, or poll the registry or peer set in a separate task.
                    info!(count, "dropped lagged inventory advertisements");
                }
-                // This indicates all senders, including the one in the handshaker,
-                // have been dropped, which really is a permanent failure.
-                None => return Err(broadcast::error::RecvError::Closed.into()),
+                Poll::Ready(None) => {
+                    // If the channel is empty and returns None, all senders, including the one in
+                    // the handshaker, have been dropped, which really is a permanent failure.
+                    result = Poll::Ready(Err(broadcast::error::RecvError::Closed.into()));
+                }
+                Poll::Pending => {
+                    break;
+                }
            }
        }

-        Ok(())
+        result
    }

    /// Record the given inventory `change` for the peer `addr`.
--- a/zebra-network/src/peer_set/inventory_registry/tests/prop.rs
+++ b/zebra-network/src/peer_set/inventory_registry/tests/prop.rs
@ -1,7 +1,11 @@
 //! Randomised property tests for the inventory registry.

-use std::collections::HashSet;
+use std::{
+    collections::HashSet,
+    task::{Context, Poll},
+};

+use futures::task::noop_waker;
 use proptest::prelude::*;

 use crate::{
@ -81,10 +85,12 @@ async fn inv_registry_inbound_wrapper_with(
        forwarded_msg.expect("unexpected forwarded error result"),
    );

-    inv_registry
-        .update()
-        .await
-        .expect("unexpected dropped registry sender channel");
+    // We don't actually care if the registry takes any action here.
+    let waker = noop_waker();
+    let mut cx = Context::from_waker(&waker);
+    let _poll_pending_or_ok: Poll<()> = inv_registry
+        .poll_inventory(&mut cx)
+        .map(|result| result.expect("unexpected error polling inventory"));

    let test_peer = test_peer
        .get_transient_addr()
--- a/zebra-network/src/peer_set/inventory_registry/update.rs
+++ b/zebra-network/src/peer_set/inventory_registry/update.rs
@ -19,8 +19,11 @@ pub struct Update<'a> {
 impl Unpin for Update<'_> {}

 impl<'a> Update<'a> {
-    #[allow(dead_code)]
-    pub(super) fn new(registry: &'a mut InventoryRegistry) -> Self {
+    /// Returns a new future that returns when the next inventory update or rotation has been
+    /// completed by `registry`.
+    ///
+    /// See [`InventoryRegistry::poll_inventory()`] for details.
+    pub fn new(registry: &'a mut InventoryRegistry) -> Self {
        Self { registry }
    }
 }
@ -28,9 +31,10 @@ impl<'a> Update<'a> {
 impl Future for Update<'_> {
    type Output = Result<(), BoxError>;

+    /// A future that returns when the next inventory update or rotation has been completed.
+    ///
+    /// See [`InventoryRegistry::poll_inventory()`] for details.
    fn poll(mut self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Self::Output> {
-        // TODO: should the future wait until new changes arrive?
-        //       or for the rotation timer?
-        Poll::Ready(self.registry.poll_inventory(cx))
+        self.registry.poll_inventory(cx)
    }
 }
--- a/zebra-network/src/peer_set/set.rs
+++ b/zebra-network/src/peer_set/set.rs
@ -97,7 +97,6 @@ use std::{
    collections::{HashMap, HashSet},
    convert,
    fmt::Debug,
-    future::Future,
    marker::PhantomData,
    net::IpAddr,
    pin::Pin,
@ -107,14 +106,15 @@ use std::{

 use futures::{
    channel::{mpsc, oneshot},
-    future::{FutureExt, TryFutureExt},
+    future::{Future, FutureExt, TryFutureExt},
    prelude::*,
    stream::FuturesUnordered,
+    task::noop_waker,
 };
 use itertools::Itertools;
 use num_integer::div_ceil;
 use tokio::{
-    sync::{broadcast, oneshot::error::TryRecvError, watch},
+    sync::{broadcast, watch},
    task::JoinHandle,
 };
 use tower::{
@ -191,17 +191,6 @@ where

    // Request Routing
    //
-    /// A preselected ready service.
-    ///
-    /// # Correctness
-    ///
-    /// If this is `Some(addr)`, `addr` must be a key for a peer in `ready_services`.
-    /// If that peer is removed from `ready_services`, we must set the preselected peer to `None`.
-    ///
-    /// This is handled by [`PeerSet::take_ready_service`] and
-    /// [`PeerSet::disconnect_from_outdated_peers`].
-    preselected_p2c_peer: Option<D::Key>,
-
    /// Stores gossiped inventory hashes from connected peers.
    ///
    /// Used to route inventory requests to peers that are likely to have it.
@ -265,7 +254,11 @@ where
    C: ChainTip,
 {
    fn drop(&mut self) {
-        self.shut_down_tasks_and_channels()
+        // We don't have access to the current task (if any), so we just drop everything we can.
+        let waker = noop_waker();
+        let mut cx = Context::from_waker(&waker);
+
+        self.shut_down_tasks_and_channels(&mut cx);
    }
 }

@ -310,7 +303,6 @@ where
            // Ready peers
            ready_services: HashMap::new(),
            // Request Routing
-            preselected_p2c_peer: None,
            inventory_registry: InventoryRegistry::new(inv_stream),

            // Busy peers
@ -335,52 +327,51 @@ where

    /// Check background task handles to make sure they're still running.
    ///
+    /// Never returns `Ok`.
+    ///
    /// If any background task exits, shuts down all other background tasks,
-    /// and returns an error.
-    fn poll_background_errors(&mut self, cx: &mut Context) -> Result<(), BoxError> {
-        if let Some(result) = self.receive_tasks_if_needed() {
-            return result;
-        }
+    /// and returns an error. Otherwise, returns `Pending`, and registers a wakeup for
+    /// receiving the background tasks, or the background tasks exiting.
+    fn poll_background_errors(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), BoxError>> {
+        futures::ready!(self.receive_tasks_if_needed(cx))?;

-        match Pin::new(&mut self.guards).poll_next(cx) {
-            // All background tasks are still running.
-            Poll::Pending => Ok(()),
-
-            Poll::Ready(Some(res)) => {
+        // Return Pending if all background tasks are still running.
+        match futures::ready!(Pin::new(&mut self.guards).poll_next(cx)) {
+            Some(res) => {
                info!(
                    background_tasks = %self.guards.len(),
                    "a peer set background task exited, shutting down other peer set tasks"
                );

-                self.shut_down_tasks_and_channels();
+                self.shut_down_tasks_and_channels(cx);

-                // Flatten the join result and inner result,
-                // then turn Ok() task exits into errors.
+                // Flatten the join result and inner result, and return any errors.
                res.map_err(Into::into)
                    // TODO: replace with Result::flatten when it stabilises (#70142)
-                    .and_then(convert::identity)
-                    .and(Err("a peer set background task exited".into()))
+                    .and_then(convert::identity)?;
+
+                // Turn Ok() task exits into errors.
+                Poll::Ready(Err("a peer set background task exited".into()))
            }

-            Poll::Ready(None) => {
-                self.shut_down_tasks_and_channels();
-                Err("all peer set background tasks have exited".into())
+            None => {
+                self.shut_down_tasks_and_channels(cx);
+                Poll::Ready(Err("all peer set background tasks have exited".into()))
            }
        }
    }

-    /// Receive background tasks, if they've been sent on the channel,
-    /// but not consumed yet.
+    /// Receive background tasks, if they've been sent on the channel, but not consumed yet.
    ///
-    /// Returns a result representing the current task state,
-    /// or `None` if the background tasks should be polled to check their state.
-    fn receive_tasks_if_needed(&mut self) -> Option<Result<(), BoxError>> {
+    /// Returns a result representing the current task state, or `Poll::Pending` if the background
+    /// tasks should be polled again to check their state.
+    fn receive_tasks_if_needed(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), BoxError>> {
        if self.guards.is_empty() {
-            match self.handle_rx.try_recv() {
-                // The tasks haven't been sent yet.
-                Err(TryRecvError::Empty) => Some(Ok(())),
+            // Return Pending if the tasks have not been sent yet.
+            let handles = futures::ready!(Pin::new(&mut self.handle_rx).poll(cx));

-                // The tasks have been sent, but not consumed.
+            match handles {
+                // The tasks have been sent, but not consumed yet.
                Ok(handles) => {
                    // Currently, the peer set treats an empty background task set as an error.
                    //
@ -393,21 +384,16 @@ where

                    self.guards.extend(handles);

-                    None
+                    Poll::Ready(Ok(()))
                }

-                // The tasks have been sent and consumed, but then they exited.
-                //
-                // Correctness: the peer set must receive at least one task.
-                //
-                // TODO: refactor `handle_rx` and `guards` into an enum
-                //       for the background task state: Waiting/Running/Shutdown.
-                Err(TryRecvError::Closed) => {
-                    Some(Err("all peer set background tasks have exited".into()))
-                }
+                // The sender was dropped without sending the tasks.
+                Err(_) => Poll::Ready(Err(
+                    "sender did not send peer background tasks before it was dropped".into(),
+                )),
            }
        } else {
-            None
+            Poll::Ready(Ok(()))
        }
    }

@ -415,9 +401,8 @@ where
    /// - services by dropping the service lists
    /// - background tasks via their join handles or cancel handles
    /// - channels by closing the channel
-    fn shut_down_tasks_and_channels(&mut self) {
+    fn shut_down_tasks_and_channels(&mut self, cx: &mut Context<'_>) {
        // Drop services and cancel their background tasks.
-        self.preselected_p2c_peer = None;
        self.ready_services = HashMap::new();

        for (_peer_key, handle) in self.cancel_handles.drain() {
@ -429,9 +414,9 @@ where
        // so we don't add more peers to a shut down peer set.
        self.demand_signal.close_channel();

-        // Shut down background tasks.
+        // Shut down background tasks, ignoring pending polls.
        self.handle_rx.close();
-        self.receive_tasks_if_needed();
+        let _ = self.receive_tasks_if_needed(cx);
        for guard in self.guards.iter() {
            guard.abort();
        }
@ -441,25 +426,63 @@ where
    ///
    /// Move newly ready services to the ready list if they are for peers with supported protocol
    /// versions, otherwise they are dropped. Also drop failed services.
-    fn poll_unready(&mut self, cx: &mut Context<'_>) {
+    ///
+    /// Never returns an error.
+    ///
+    /// Returns `Ok(Some(())` if at least one peer became ready, `Poll::Pending` if there are
+    /// unready peers, but none became ready, and `Ok(None)` if the unready peers were empty.
+    ///
+    /// If there are any remaining unready peers, registers a wakeup for the next time one becomes
+    /// ready. If there are no unready peers, doesn't register any wakeups. (Since wakeups come
+    /// from peers, there needs to be at least one peer to register a wakeup.)
+    fn poll_unready(&mut self, cx: &mut Context<'_>) -> Poll<Result<Option<()>, BoxError>> {
+        let mut result = Poll::Pending;
+
+        // # Correctness
+        //
+        // `poll_next()` must always be called, because `self.unready_services` could have been
+        // empty before the call to `self.poll_ready()`.
+        //
+        // > When new futures are added, `poll_next` must be called in order to begin receiving
+        // > wake-ups for new futures.
+        //
+        // <https://docs.rs/futures/latest/futures/stream/futures_unordered/struct.FuturesUnordered.html>
+        //
+        // Returns Pending if we've finished processing the unready service changes,
+        // but there are still some unready services.
        loop {
-            match Pin::new(&mut self.unready_services).poll_next(cx) {
-                // No unready service changes, or empty unready services
-                Poll::Pending | Poll::Ready(None) => return,
+            // No ready peers left, but there are some unready peers pending.
+            let Poll::Ready(ready_peer) = Pin::new(&mut self.unready_services).poll_next(cx) else {
+                break;
+            };
+
+            match ready_peer {
+                // No unready peers in the list.
+                None => {
+                    // If we've finished processing the unready service changes, and there are no
+                    // unready services left, it doesn't make sense to return Pending, because
+                    // their stream is terminated. But when we add more unready peers and call
+                    // `poll_next()`, its termination status will be reset, and it will receive
+                    // wakeups again.
+                    if result.is_pending() {
+                        result = Poll::Ready(Ok(None));
+                    }
+
+                    break;
+                }

                // Unready -> Ready
-                Poll::Ready(Some(Ok((key, svc)))) => {
+                Some(Ok((key, svc))) => {
                    trace!(?key, "service became ready");
-                    let cancel = self.cancel_handles.remove(&key);
-                    assert!(cancel.is_some(), "missing cancel handle");

-                    if svc.remote_version() >= self.minimum_peer_version.current() {
-                        self.ready_services.insert(key, svc);
-                    }
+                    self.push_ready(true, key, svc);
+
+                    // Return Ok if at least one peer became ready.
+                    result = Poll::Ready(Ok(Some(())));
                }

                // Unready -> Canceled
-                Poll::Ready(Some(Err((key, UnreadyError::Canceled)))) => {
+                Some(Err((key, UnreadyError::Canceled))) => {
                    // A service be canceled because we've connected to the same service twice.
                    // In that case, there is a cancel handle for the peer address,
                    // but it belongs to the service for the newer connection.
@ -469,7 +492,7 @@ where
                        "service was canceled, dropping service"
                    );
                }
-                Poll::Ready(Some(Err((key, UnreadyError::CancelHandleDropped(_))))) => {
+                Some(Err((key, UnreadyError::CancelHandleDropped(_)))) => {
                    // Similarly, services with dropped cancel handes can have duplicates.
                    trace!(
                        ?key,
@ -479,7 +502,7 @@ where
                }

                // Unready -> Errored
-                Poll::Ready(Some(Err((key, UnreadyError::Inner(error))))) => {
+                Some(Err((key, UnreadyError::Inner(error)))) => {
                    debug!(%error, "service failed while unready, dropping service");

                    let cancel = self.cancel_handles.remove(&key);
@ -487,6 +510,57 @@ where
                }
            }
        }
+
+        result
+    }
+
+    /// Checks previously ready peer services for errors.
+    ///
+    /// The only way these peer `Client`s can become unready is when we send them a request,
+    /// because the peer set has exclusive access to send requests to each peer. (If an inbound
+    /// request is in progress, it will be handled, then our request will be sent by the connection
+    /// task.)
+    ///
+    /// Returns `Poll::Ready` if there are some ready peers, and `Poll::Pending` if there are no
+    /// ready peers. Registers a wakeup if any peer has failed due to a disconnection, hang, or protocol error.
+    ///
+    /// # Panics
+    ///
+    /// If any peers somehow became unready without being sent a request. This indicates a bug in the peer set, where requests
+    /// are sent to peers without putting them in `unready_peers`.
+    fn poll_ready_peer_errors(&mut self, cx: &mut Context<'_>) -> Poll<()> {
+        let mut previous = HashMap::new();
+        std::mem::swap(&mut previous, &mut self.ready_services);
+
+        // TODO: consider only checking some peers each poll (for performance reasons),
+        //       but make sure we eventually check all of them.
+        for (key, mut svc) in previous.drain() {
+            let Poll::Ready(peer_readiness) = Pin::new(&mut svc).poll_ready(cx) else {
+                unreachable!(
+                    "unexpected unready peer: peers must be put into the unready_peers list \
+                     after sending them a request"
+                );
+            };
+
+            match peer_readiness {
+                // Still ready, add it back to the list.
+                Ok(()) => self.push_ready(false, key, svc),
+
+                // Ready -> Errored
+                Err(error) => {
+                    debug!(%error, "service failed while ready, dropping service");
+
+                    // Ready services can just be dropped, they don't need any cleanup.
+                    std::mem::drop(svc);
+                }
+            }
+        }
+
+        if self.ready_services.is_empty() {
+            Poll::Pending
+        } else {
+            Poll::Ready(())
+        }
    }

    /// Returns the number of peer connections Zebra already has with
@ -509,17 +583,35 @@ where
        self.ready_services.contains_key(&addr) || self.cancel_handles.contains_key(&addr)
    }

-    /// Checks for newly inserted or removed services.
+    /// Processes the entire list of newly inserted or removed services.
    ///
    /// Puts inserted services in the unready list.
    /// Drops removed services, after cancelling any pending requests.
+    ///
+    /// If the peer connector channel is closed, returns an error.
+    ///
+    /// Otherwise, returns `Ok` if it discovered at least one peer, or `Poll::Pending` if it didn't
+    /// discover any peers. Always registers a wakeup for new peers, even when it returns `Ok`.
    fn poll_discover(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), BoxError>> {
-        use futures::ready;
+        // Return pending if there are no peers in the list.
+        let mut result = Poll::Pending;
+
        loop {
-            match ready!(Pin::new(&mut self.discover).poll_discover(cx))
+            // If we've emptied the list, finish looping, otherwise process the new peer.
+            let Poll::Ready(discovered) = Pin::new(&mut self.discover).poll_discover(cx) else {
+                break;
+            };
+
+            // If the change channel has a permanent error, return that error.
+            let change = discovered
                .ok_or("discovery stream closed")?
-                .map_err(Into::into)?
-            {
+                .map_err(Into::into)?;
+
+            // Otherwise we have successfully processed a peer.
+            result = Poll::Ready(Ok(()));
+
+            // Process each change.
+            match change {
                Change::Remove(key) => {
                    trace!(?key, "got Change::Remove from Discover");
                    self.remove(&key);
@ -552,32 +644,22 @@ where
                }
            }
        }
+
+        result
    }

    /// Checks if the minimum peer version has changed, and disconnects from outdated peers.
    fn disconnect_from_outdated_peers(&mut self) {
        if let Some(minimum_version) = self.minimum_peer_version.changed() {
-            self.ready_services.retain(|address, peer| {
-                if peer.remote_version() >= minimum_version {
-                    true
-                } else {
-                    if self.preselected_p2c_peer == Some(*address) {
-                        self.preselected_p2c_peer = None;
-                    }
-
-                    false
-                }
-            });
+            // It is ok to drop ready services, they don't need anything cancelled.
+            self.ready_services
+                .retain(|_address, peer| peer.remote_version() >= minimum_version);
        }
    }

-    /// Takes a ready service by key, invalidating `preselected_p2c_peer` if needed.
+    /// Takes a ready service by key.
    fn take_ready_service(&mut self, key: &D::Key) -> Option<D::Service> {
        if let Some(svc) = self.ready_services.remove(key) {
-            if Some(*key) == self.preselected_p2c_peer {
-                self.preselected_p2c_peer = None;
-            }
-
            assert!(
                !self.cancel_handles.contains_key(key),
                "cancel handles are only used for unready service work"
@ -605,6 +687,25 @@ where
        }
    }

+    /// Adds a ready service to the ready list if it's for a peer with a supported version.
+    /// If `was_unready` is true, also removes the peer's cancel handle.
+    ///
+    /// If the service is for a connection to an outdated peer, the service is dropped.
+    fn push_ready(&mut self, was_unready: bool, key: D::Key, svc: D::Service) {
+        let cancel = self.cancel_handles.remove(&key);
+        assert_eq!(
+            cancel.is_some(),
+            was_unready,
+            "missing or unexpected cancel handle"
+        );
+
+        if svc.remote_version() >= self.minimum_peer_version.current() {
+            self.ready_services.insert(key, svc);
+        } else {
+            std::mem::drop(svc);
+        }
+    }
+
    /// Adds a busy service to the unready list if it's for a peer with a supported version,
    /// and adds a cancel handle for the service's current request.
    ///
@ -631,7 +732,7 @@ where
    }

    /// Performs P2C on `self.ready_services` to randomly select a less-loaded ready service.
-    fn preselect_p2c_peer(&self) -> Option<D::Key> {
+    fn select_ready_p2c_peer(&self) -> Option<D::Key> {
        self.select_p2c_peer_from_list(&self.ready_services.keys().copied().collect())
    }

@ -647,8 +748,7 @@ where
                    .expect("just checked there is one service"),
            ),
            len => {
-                // If there are only 2 peers, randomise their order.
-                // Otherwise, choose 2 random peers in a random order.
+                // Choose 2 random peers, then return the least loaded of those 2 peers.
                let (a, b) = {
                    let idxs = rand::seq::index::sample(&mut rand::thread_rng(), len, 2);
                    let a = idxs.index(0);
@ -708,19 +808,32 @@ where

    /// Routes a request using P2C load-balancing.
    fn route_p2c(&mut self, req: Request) -> <Self as tower::Service<Request>>::Future {
-        let preselected_key = self
-            .preselected_p2c_peer
-            .expect("ready peer service must have a preselected peer");
+        if let Some(p2c_key) = self.select_ready_p2c_peer() {
+            tracing::trace!(?p2c_key, "routing based on p2c");

-        tracing::trace!(?preselected_key, "routing based on p2c");
+            let mut svc = self
+                .take_ready_service(&p2c_key)
+                .expect("selected peer must be ready");

-        let mut svc = self
-            .take_ready_service(&preselected_key)
-            .expect("ready peer set must have preselected a ready peer");
+            let fut = svc.call(req);
+            self.push_unready(p2c_key, svc);

-        let fut = svc.call(req);
-        self.push_unready(preselected_key, svc);
-        fut.map_err(Into::into).boxed()
+            return fut.map_err(Into::into).boxed();
+        }
+
+        async move {
+            // Let other tasks run, so a retry request might get different ready peers.
+            tokio::task::yield_now().await;
+
+            // # Security
+            //
+            // Avoid routing requests to peers that are missing inventory.
+            // If we kept trying doomed requests, peers that are missing our requested inventory
+            // could take up a large amount of our bandwidth and retry limits.
+            Err(SharedPeerError::from(PeerError::NoReadyPeers))
+        }
+        .map_err(Into::into)
+        .boxed()
    }

    /// Tries to route a request to a ready peer that advertised that inventory,
@ -995,78 +1108,71 @@ where
        Pin<Box<dyn Future<Output = Result<Self::Response, Self::Error>> + Send + 'static>>;

    fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
-        self.poll_background_errors(cx)?;
+        // Update service and peer statuses.
+        //
+        // # Correctness
+        //
+        // All of the futures that receive a context from this method can wake the peer set buffer
+        // task. If there are no ready peers, and no new peers, network requests will pause until:
+        // - an unready peer becomes ready, or
+        // - a new peer arrives.

-        // Update peer statuses
-        let _ = self.poll_discover(cx)?;
+        // Check for new peers, and register a task wakeup when the next new peers arrive. New peers
+        // can be infrequent if our connection slots are full, or we're connected to all
+        // available/useful peers.
+        let _poll_pending_or_ready: Poll<()> = self.poll_discover(cx)?;
+
+        // These tasks don't provide new peers or newly ready peers.
+        let _poll_pending: Poll<()> = self.poll_background_errors(cx)?;
+        let _poll_pending_or_ready: Poll<()> = self.inventory_registry.poll_inventory(cx)?;
+
+        // Check for newly ready peers, including newly added peers (which are added as unready).
+        // So it needs to run after `poll_discover()`. Registers a wakeup if there are any unready
+        // peers.
+        //
+        // Each connected peer should become ready within a few minutes, or timeout, close the
+        // connection, and release its connection slot.
+        //
+        // TODO: drop peers that overload us with inbound messages and never become ready (#7822)
+        let _poll_pending_or_ready: Poll<Option<()>> = self.poll_unready(cx)?;
+
+        // Cleanup and metrics.
+
+        // Only checks the versions of ready peers, so it needs to run after `poll_unready()`.
        self.disconnect_from_outdated_peers();
-        self.inventory_registry.poll_inventory(cx)?;
-        self.poll_unready(cx);

+        // These metrics should run last, to report the most up-to-date information.
        self.log_peer_set_size();
        self.update_metrics();

-        loop {
-            // Re-check that the pre-selected service is ready, in case
-            // something has happened since (e.g., it failed, peer closed
-            // connection, ...)
-            if let Some(key) = self.preselected_p2c_peer {
-                trace!(preselected_key = ?key);
-                let mut service = self
-                    .take_ready_service(&key)
-                    .expect("preselected peer must be in the ready list");
-                match service.poll_ready(cx) {
-                    Poll::Ready(Ok(())) => {
-                        trace!("preselected service is still ready, keeping it selected");
-                        self.preselected_p2c_peer = Some(key);
-                        self.ready_services.insert(key, service);
-                        return Poll::Ready(Ok(()));
-                    }
-                    Poll::Pending => {
-                        trace!("preselected service is no longer ready, moving to unready list");
-                        self.push_unready(key, service);
-                    }
-                    Poll::Ready(Err(error)) => {
-                        trace!(%error, "preselected service failed, dropping it");
-                        std::mem::drop(service);
-                    }
-                }
-            }
+        // Check for failures in ready peers, removing newly errored or disconnected peers.
+        // So it needs to run after `poll_unready()`.
+        let ready_peers: Poll<()> = self.poll_ready_peer_errors(cx);

-            trace!("preselected service was not ready, preselecting another ready service");
-            self.preselected_p2c_peer = self.preselect_p2c_peer();
-            self.update_metrics();
+        if ready_peers.is_pending() {
+            // # Correctness
+            //
+            // If the channel is full, drop the demand signal rather than waiting. If we waited
+            // here, the crawler could deadlock sending a request to fetch more peers, because it
+            // also empties the channel.
+            trace!("no ready services, sending demand signal");
+            let _ = self.demand_signal.try_send(MorePeers);

-            if self.preselected_p2c_peer.is_none() {
-                // CORRECTNESS
-                //
-                // If the channel is full, drop the demand signal rather than waiting.
-                // If we waited here, the crawler could deadlock sending a request to
-                // fetch more peers, because it also empties the channel.
-                trace!("no ready services, sending demand signal");
-                let _ = self.demand_signal.try_send(MorePeers);
-
-                // CORRECTNESS
-                //
-                // The current task must be scheduled for wakeup every time we
-                // return `Poll::Pending`.
-                //
-                // As long as there are unready or new peers, this task will run,
-                // because:
-                // - `poll_discover` schedules this task for wakeup when new
-                //   peers arrive.
-                // - if there are unready peers, `poll_unready` schedules this
-                //   task for wakeup when peer services become ready.
-                // - if the preselected peer is not ready, `service.poll_ready`
-                //   schedules this task for wakeup when that service becomes
-                //   ready.
-                //
-                // To avoid peers blocking on a full background error channel:
-                // - if no background tasks have exited since the last poll,
-                //   `poll_background_errors` schedules this task for wakeup when
-                //   the next task exits.
-                return Poll::Pending;
-            }
+            // # Correctness
+            //
+            // The current task must be scheduled for wakeup every time we return `Poll::Pending`.
+            //
+            // As long as there are unready or new peers, this task will run, because:
+            // - `poll_discover` schedules this task for wakeup when new peers arrive.
+            // - if there are unready peers, `poll_unready` or `poll_ready_peers` schedule this
+            //   task for wakeup when peer services become ready.
+            //
+            // To avoid peers blocking on a full peer status/error channel:
+            // - `poll_background_errors` schedules this task for wakeup when the peer status
+            //   update task exits.
+            Poll::Pending
+        } else {
+            Poll::Ready(Ok(()))
        }
    }

--- a/zebra-network/src/peer_set/set/tests/vectors.rs
+++ b/zebra-network/src/peer_set/set/tests/vectors.rs
@ -10,7 +10,6 @@ use zebra_chain::{
    parameters::{Network, NetworkUpgrade},
 };

-use super::{PeerSetBuilder, PeerVersions};
 use crate::{
    constants::DEFAULT_MAX_CONNS_PER_IP,
    peer::{ClientRequest, MinimumPeerVersion},
@ -19,6 +18,8 @@ use crate::{
    Request, SharedPeerError,
 };

+use super::{PeerSetBuilder, PeerVersions};
+
 #[test]
 fn peer_set_ready_single_connection() {
    // We are going to use just one peer version in this test
--- a/zebra-state/src/service/finalized_state/disk_format/upgrade.rs
+++ b/zebra-state/src/service/finalized_state/disk_format/upgrade.rs
@ -883,7 +883,7 @@ impl DbFormatChangeThreadHandle {
    ///
    /// This method should be called regularly, so that panics are detected as soon as possible.
    pub fn check_for_panics(&mut self) {
-        self.update_task.check_for_panics();
+        self.update_task.panic_if_task_has_panicked();
    }

    /// Wait for the spawned thread to finish. If it exited with a panic, resume that panic.