fix(network): Reconnect with peers after brief network interruption (#7853)

* Fixes a bug where Zebra won't reconnect to peers after a brief loss of network connectivity
* Only dial on a timer crawl when there's a new address or zero active outbound connections
2023-10-27 02:13:16 -04:00 · 2023-10-27 02:13:16 -04:00 · 5367ccbc5c
parent 0a3790b73e
commit 5367ccbc5c
1 changed file with 6 additions and 4 deletions
--- a/zebra-network/src/peer_set/initialize.rs
+++ b/zebra-network/src/peer_set/initialize.rs
@@ -896,7 +896,7 @@ where
                            // There weren't any peers, so try to get more peers.
                            debug!("demand for peers but no available candidates");

-                            crawl(candidates, demand_tx).await?;
+                            crawl(candidates, demand_tx, false).await?;

                            Ok(DemandCrawlFinished)
                        }
@@ -910,6 +910,7 @@ where
            Ok(TimerCrawl { tick }) => {
                let candidates = candidates.clone();
                let demand_tx = demand_tx.clone();
+                let should_always_dial = active_outbound_connections.update_count() == 0;

                let crawl_handle = tokio::spawn(
                    async move {
@@ -918,7 +919,7 @@ where
                            "crawling for more peers in response to the crawl timer"
                        );

-                        crawl(candidates, demand_tx).await?;
+                        crawl(candidates, demand_tx, should_always_dial).await?;

                        Ok(TimerCrawlFinished)
                    }
@@ -957,11 +958,12 @@ where
 }

 /// Try to get more peers using `candidates`, then queue a connection attempt using `demand_tx`.
-/// If there were no new peers, the connection attempt is skipped.
+/// If there were no new peers and `should_always_dial` is false, the connection attempt is skipped.
 #[instrument(skip(candidates, demand_tx))]
 async fn crawl<S>(
    candidates: Arc<futures::lock::Mutex<CandidateSet<S>>>,
    mut demand_tx: futures::channel::mpsc::Sender<MorePeers>,
+    should_always_dial: bool,
 ) -> Result<(), BoxError>
 where
    S: Service<Request, Response = Response, Error = BoxError> + Send + Sync + 'static,
@@ -976,7 +978,7 @@ where
        result
    };
    let more_peers = match result {
-        Ok(more_peers) => more_peers,
+        Ok(more_peers) => more_peers.or_else(|| should_always_dial.then_some(MorePeers)),
        Err(e) => {
            info!(
                ?e,