change(state): Write finalized blocks to the state in a separate thread, to avoid network and RPC hangs (#5134)

* Add a new block commit task and channels, that don't do anything yet

* Add last_block_hash_sent to the state service, to avoid database accesses

* Update last_block_hash_sent regardless of commit errors

* Rename a field to StateService.max_queued_finalized_height

* Commit finalized blocks to the state in a separate task

* Check for panics in the block write task

* Wait for the block commit task in tests, and check for errors

* Always run a proptest that sleeps once

* Add extra debugging to state shutdowns

* Work around a RocksDB shutdown bug

* Close the finalized block channel when we're finished with it

* Only reset state queue once per error

* Update some TODOs

* Add a module doc comment

* Drop channels and check for closed channels in the block commit task

* Close state channels and tasks on drop

* Remove some duplicate fields across StateService and ReadStateService

* Try tweaking the shutdown steps

* Update and clarify some comments

* Clarify another comment

* Don't try to cancel RocksDB background work on drop

* Fix up some comments

* Remove some duplicate code

* Remove redundant workarounds for shutdown issues

* Remove a redundant channel close in the block commit task

* Remove a mistaken `!force` shutdown condition

* Remove duplicate force-shutdown code and explain it better

* Improve RPC error logging

* Wait for chain tip updates in the RPC tests

* Wait 2 seconds for chain tip updates before skipping them

* Remove an unnecessary block_in_place()

* Fix some test error messages that were changed by earlier fixes

* Expand some comments, fix typos

Co-authored-by: Marek <mail@marek.onl>

* Actually drop children of failed blocks

* Explain why we drop descendants of failed blocks

* Clarify a comment

* Wait for chain tip updates in a failing test on macOS

* Clean duplicate finalized blocks when the non-finalized state activates

* Send an error when receiving a duplicate finalized block

* Update checkpoint block behaviour, document its consensus rule

* Wait for chain tip changes in inbound_block_height_lookahead_limit test

* Wait for the genesis block to commit in the fake peer set mempool tests

* Disable unreliable mempool verification check in the send transaction test

* Appease rustfmt

* Use clear_finalized_block_queue() everywhere that blocks are dropped

* Document how Finalized and NonFinalized clones are different

* Use the same check as commit_finalized() for finalized block heights

Co-authored-by: Marek <mail@marek.onl>

Co-authored-by: Marek <mail@marek.onl>
teor 2022-09-29 02:09:56 +10:00 committed by GitHub
parent 55e5a13fc8
commit 343c5e68d4
25 changed files with 929 additions and 284 deletions
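
The change at the heart of this commit: block commits now happen on a dedicated writer thread, so the async state service never blocks a tokio worker thread on database writes. Below is a minimal sketch of that pattern under simplified assumptions; the types and names (`QueuedBlock`, `spawn_block_writer`, `queue_block`) are illustrative placeholders, not Zebra's actual code.

```rust
// Sketch only: an async service queues blocks on an unbounded channel and
// returns a oneshot receiver for the result, while a dedicated OS thread
// drains the channel and commits each block.
use tokio::sync::{mpsc, oneshot};

type BlockHash = [u8; 32];
type CommitResult = Result<BlockHash, String>;
type QueuedBlock = (Vec<u8>, oneshot::Sender<CommitResult>);

/// Spawns the writer thread. It exits when every sender has been dropped.
fn spawn_block_writer(
    mut block_rx: mpsc::UnboundedReceiver<QueuedBlock>,
) -> std::thread::JoinHandle<()> {
    std::thread::spawn(move || {
        // blocking_recv() parks this OS thread, not a tokio worker thread.
        while let Some((block_bytes, rsp_tx)) = block_rx.blocking_recv() {
            // Commit `block_bytes` to the database here (elided in this sketch).
            let result: CommitResult = Ok([0u8; 32]);
            // The caller may have given up on this block, so ignore send errors.
            let _ = rsp_tx.send(result);
        }
    })
}

/// Queues a block without blocking, and returns a receiver for the commit result.
fn queue_block(
    block_tx: &mpsc::UnboundedSender<QueuedBlock>,
    block_bytes: Vec<u8>,
) -> oneshot::Receiver<CommitResult> {
    let (rsp_tx, rsp_rx) = oneshot::channel();
    // Sending on an unbounded channel never blocks or awaits.
    let _ = block_tx.send((block_bytes, rsp_tx));
    rsp_rx
}
```

Dropping the channel senders is also how the service tells the writer thread to exit, which is why the `Drop` implementations in the diff below take and drop their senders before waiting on the thread handle.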

View File

@ -872,7 +872,7 @@ where
hashes
.iter()
.map(|(tx_loc, tx_id)| {
// TODO: downgrade to debug, because there's nothing the user can do
// Check that the returned transactions are in chain order.
assert!(
*tx_loc > last_tx_location,
"Transactions were not in chain order:\n\
@ -931,7 +931,7 @@ where
let satoshis = u64::from(utxo_data.3.value);
let output_location = *utxo_data.2;
// TODO: downgrade to debug, because there's nothing the user can do
// Check that the returned UTXOs are in chain order.
assert!(
output_location > last_output_location,
"UTXOs were not in chain order:\n\
@ -1272,17 +1272,19 @@ impl GetRawTransaction {
/// Check if provided height range is valid for address indexes.
fn check_height_range(start: Height, end: Height, chain_height: Height) -> Result<()> {
if start == Height(0) || end == Height(0) {
return Err(Error::invalid_params(
"Start and end are expected to be greater than zero",
));
return Err(Error::invalid_params(format!(
"start {start:?} and end {end:?} must both be greater than zero"
)));
}
if end < start {
return Err(Error::invalid_params(
"End value is expected to be greater than or equal to start",
));
if start > end {
return Err(Error::invalid_params(format!(
"start {start:?} must be less than or equal to end {end:?}"
)));
}
if start > chain_height || end > chain_height {
return Err(Error::invalid_params("Start or end is outside chain range"));
return Err(Error::invalid_params(format!(
"start {start:?} and end {end:?} must both be less than or equal to the chain tip {chain_height:?}"
)));
}
Ok(())

View File

@ -395,7 +395,7 @@ async fn rpc_getaddresstxids_invalid_arguments() {
.unwrap_err();
assert_eq!(
error.message,
"End value is expected to be greater than or equal to start".to_string()
"start Height(2) must be less than or equal to end Height(1)".to_string()
);
// call the method with start equal zero
@ -411,7 +411,7 @@ async fn rpc_getaddresstxids_invalid_arguments() {
.unwrap_err();
assert_eq!(
error.message,
"Start and end are expected to be greater than zero".to_string()
"start Height(0) and end Height(1) must both be greater than zero".to_string()
);
// call the method outside the chain tip height
@ -427,7 +427,7 @@ async fn rpc_getaddresstxids_invalid_arguments() {
.unwrap_err();
assert_eq!(
error.message,
"Start or end is outside chain range".to_string()
"start Height(1) and end Height(11) must both be less than or equal to the chain tip Height(10)".to_string()
);
mempool.expect_no_requests().await;

View File

@ -17,6 +17,8 @@ use crate::{
/// Mocks computation done during semantic validation
pub trait Prepare {
/// Runs block semantic validation computation, and returns the result.
/// Test-only method.
fn prepare(self) -> PreparedBlock;
}

View File

@ -16,7 +16,8 @@
extern crate tracing;
#[cfg(any(test, feature = "proptest-impl"))]
mod arbitrary;
pub mod arbitrary;
mod config;
pub mod constants;
mod error;
@ -39,7 +40,7 @@ pub use service::{
#[cfg(any(test, feature = "proptest-impl"))]
pub use service::{
arbitrary::populated_state,
arbitrary::{populated_state, CHAIN_TIP_UPDATE_WAIT_LIMIT},
chain_tip::{ChainTipBlock, ChainTipSender},
init_test, init_test_services,
};

View File

@ -381,20 +381,44 @@ pub enum Request {
/// documentation for details.
CommitBlock(PreparedBlock),
/// Commit a finalized block to the state, skipping all validation.
/// Commit a checkpointed block to the state, skipping most block validation.
///
/// This is exposed for use in checkpointing, which produces finalized
/// blocks. It is the caller's responsibility to ensure that the block is
/// valid and final. This request can be made out-of-order; the state service
/// will queue it until its parent is ready.
/// semantically valid and final. This request can be made out-of-order;
/// the state service will queue it until its parent is ready.
///
/// Returns [`Response::Committed`] with the hash of the newly committed
/// block, or an error.
///
/// This request cannot be cancelled once submitted; dropping the response
/// future will have no effect on whether it is eventually processed.
/// Duplicate requests should not be made, because it is the caller's
/// responsibility to ensure that each block is valid and final.
/// Duplicate requests will replace the older duplicate, and return an error
/// in its response future.
///
/// # Note
///
/// Finalized and non-finalized blocks are an internal Zebra implementation detail.
/// There is no difference between these blocks on the network, or in Zebra's
/// network or syncer implementations.
///
/// # Consensus
///
/// Checkpointing is allowed under the Zcash "social consensus" rules.
/// Zebra checkpoints both settled network upgrades, and blocks past the rollback limit.
/// (By the time a Zebra release is tagged, its final checkpoint is typically hours or days old.)
///
/// > A network upgrade is settled on a given network when there is a social consensus
/// > that it has activated with a given activation block hash. A full validator that
/// > potentially risks Mainnet funds or displays Mainnet transaction information to a user
/// > MUST do so only for a block chain that includes the activation block of the most
/// > recent settled network upgrade, with the corresponding activation block hash.
/// > ...
/// > A full validator MAY impose a limit on the number of blocks it will “roll back”
/// > when switching from one best valid block chain to another that is not a descendent.
/// > For `zcashd` and `zebra` this limit is 100 blocks.
///
/// <https://zips.z.cash/protocol/protocol.pdf#blockchain>
///
/// # Correctness
///
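
As a usage illustration only (not part of this commit), a caller might drive this request through the tower `Service` interface roughly as sketched below; the helper function and its simplified bounds are assumptions, while `Request::CommitFinalizedBlock`, `Response::Committed`, `FinalizedBlock`, and `BoxError` appear elsewhere in this diff.

```rust
// Sketch only: submit a checkpointed block and wait for its commit result.
use tower::{Service, ServiceExt};
use zebra_state::{BoxError, FinalizedBlock, Request, Response};

async fn commit_checkpointed_block<S>(
    state: &mut S,
    finalized: FinalizedBlock,
) -> Result<zebra_chain::block::Hash, BoxError>
where
    S: Service<Request, Response = Response, Error = BoxError>,
{
    // The state queues the block until its parent has been committed,
    // so these requests can be made out of order.
    let response = state
        .ready()
        .await?
        .call(Request::CommitFinalizedBlock(finalized))
        .await?;

    match response {
        Response::Committed(hash) => Ok(hash),
        // Any other variant would be a state service bug for this request type.
        _ => Err("unexpected response to CommitFinalizedBlock".into()),
    }
}
```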

View File

@ -19,6 +19,7 @@ use std::{
convert,
future::Future,
pin::Pin,
sync::{Arc, Mutex},
task::{Context, Poll},
time::{Duration, Instant},
};
@ -65,6 +66,7 @@ mod non_finalized_state;
mod pending_utxos;
mod queued_blocks;
pub(crate) mod read;
mod write;
#[cfg(any(test, feature = "proptest-impl"))]
pub mod arbitrary;
@ -74,7 +76,7 @@ mod tests;
pub use finalized_state::{OutputIndex, OutputLocation, TransactionLocation};
use self::queued_blocks::QueuedFinalized;
use self::queued_blocks::{QueuedFinalized, QueuedNonFinalized};
/// A read-write service for Zebra's cached blockchain state.
///
@ -126,6 +128,43 @@ pub(crate) struct StateService {
// and block write task share ownership of the database.
pub(crate) disk: FinalizedState,
/// A channel to send blocks to the `block_write_task`,
/// so they can be written to the [`NonFinalizedState`].
//
// TODO: actually send blocks on this channel
non_finalized_block_write_sender:
Option<tokio::sync::mpsc::UnboundedSender<QueuedNonFinalized>>,
/// A channel to send blocks to the `block_write_task`,
/// so they can be written to the [`FinalizedState`].
///
/// This sender is dropped after the state has finished sending all the checkpointed blocks,
/// and the lowest non-finalized block arrives.
finalized_block_write_sender: Option<tokio::sync::mpsc::UnboundedSender<QueuedFinalized>>,
/// The [`block::Hash`] of the most recent block sent on
/// `finalized_block_write_sender` or `non_finalized_block_write_sender`.
///
/// On startup, this is:
/// - the finalized tip, if there are stored blocks, or
/// - the genesis block's parent hash, if the database is empty.
///
/// If `invalid_block_reset_receiver` gets a reset, this is:
/// - the hash of the last valid committed block (the parent of the invalid block).
//
// TODO:
// - turn this into an IndexMap containing recent non-finalized block hashes and heights
// (they are all potential tips)
// - remove block hashes once their heights are strictly less than the finalized tip
last_block_hash_sent: block::Hash,
/// If an invalid block is sent on `finalized_block_write_sender`
/// or `non_finalized_block_write_sender`,
/// this channel gets the [`block::Hash`] of the valid tip.
//
// TODO: add tests for finalized and non-finalized resets (#2654)
invalid_block_reset_receiver: tokio::sync::mpsc::UnboundedReceiver<block::Hash>,
// Pending UTXO Request Tracking
//
/// The set of outpoints with pending requests for their associated transparent::Output.
@ -134,15 +173,19 @@ pub(crate) struct StateService {
/// Instant tracking the last time `pending_utxos` was pruned.
last_prune: Instant,
// Concurrently Readable State
// Updating Concurrently Readable State
//
/// A sender channel used to update the current best chain tip for
/// [`LatestChainTip`] and [`ChainTipChange`].
chain_tip_sender: ChainTipSender,
//
// TODO: remove this copy of the chain tip sender, and get rid of the mutex in the block write task
chain_tip_sender: Arc<Mutex<ChainTipSender>>,
/// A sender channel used to update the recent non-finalized state for the [`ReadStateService`].
non_finalized_state_sender: watch::Sender<NonFinalizedState>,
// Concurrently Readable State
//
/// A cloneable [`ReadStateService`], used to answer concurrent read requests.
///
/// TODO: move users of read [`Request`]s to [`ReadStateService`], and remove `read_service`.
@ -154,7 +197,9 @@ pub(crate) struct StateService {
///
/// Set to `f64::NAN` if `queued_finalized_blocks` is empty, because grafana shows NaNs
/// as a break in the graph.
max_queued_height: f64,
//
// TODO: add a similar metric for `queued_non_finalized_blocks`
max_queued_finalized_height: f64,
}
/// A read-only service for accessing Zebra's cached blockchain state.
@ -177,7 +222,7 @@ pub struct ReadStateService {
// Shared Concurrently Readable State
//
/// A watch channel for a recent [`NonFinalizedState`].
/// A watch channel with a cached copy of the [`NonFinalizedState`].
///
/// This state is only updated between requests,
/// so it might include some block data that is also on `disk`.
@ -191,6 +236,63 @@ pub struct ReadStateService {
/// This chain is updated concurrently with requests,
/// so it might include some block data that is also in `best_mem`.
db: ZebraDb,
/// A shared handle to a task that writes blocks to the [`NonFinalizedState`] or [`FinalizedState`],
/// once the queues have received all their parent blocks.
///
/// Used to check for panics when writing blocks.
block_write_task: Option<Arc<std::thread::JoinHandle<()>>>,
}
impl Drop for StateService {
fn drop(&mut self) {
// The state service owns the state, tasks, and channels,
// so dropping it should shut down everything.
// Close the channels (non-blocking)
// This makes the block write thread exit the next time it checks the channels.
// We want to do this here so we get any errors or panics from the block write task before it shuts down.
self.invalid_block_reset_receiver.close();
std::mem::drop(self.finalized_block_write_sender.take());
std::mem::drop(self.non_finalized_block_write_sender.take());
self.clear_finalized_block_queue("dropping the state: dropped unused queued block");
// Then drop self.read_service, which checks the block write task for panics,
// and tries to shut down the database.
}
}
impl Drop for ReadStateService {
fn drop(&mut self) {
// The read state service shares the state,
// so dropping it should check if we can shut down.
if let Some(block_write_task) = self.block_write_task.take() {
if let Ok(block_write_task_handle) = Arc::try_unwrap(block_write_task) {
// We're the last database user, so we can tell it to shut down (blocking):
// - flushes the database to disk, and
// - drops the database, which cleans up any database tasks correctly.
self.db.shutdown(true);
// We are the last state with a reference to this thread, so we can
// wait until the block write task finishes, then check for panics (blocking).
// (We'd also like to abort the thread, but std::thread::JoinHandle can't do that.)
info!("waiting for the block write task to finish");
if let Err(thread_panic) = block_write_task_handle.join() {
std::panic::resume_unwind(thread_panic);
} else {
info!("shutting down the state without waiting for the block write task");
}
}
} else {
// Even if we're not the last database user, try shutting it down.
//
// TODO: rename this to try_shutdown()?
self.db.shutdown(false);
}
}
}
impl StateService {
@ -205,12 +307,12 @@ impl StateService {
) -> (Self, ReadStateService, LatestChainTip, ChainTipChange) {
let timer = CodeTimer::start();
let disk = FinalizedState::new(&config, network);
let finalized_state = FinalizedState::new(&config, network);
timer.finish(module_path!(), line!(), "opening finalized state database");
let timer = CodeTimer::start();
let initial_tip = disk
.db()
let initial_tip = finalized_state
.db
.tip_block()
.map(FinalizedBlock::from)
.map(ChainTipBlock::from);
@ -219,26 +321,56 @@ impl StateService {
let timer = CodeTimer::start();
let (chain_tip_sender, latest_chain_tip, chain_tip_change) =
ChainTipSender::new(initial_tip, network);
let chain_tip_sender = Arc::new(Mutex::new(chain_tip_sender));
let mem = NonFinalizedState::new(network);
let non_finalized_state = NonFinalizedState::new(network);
let (read_service, non_finalized_state_sender) = ReadStateService::new(&disk);
// Security: The number of blocks in these channels is limited by
// the syncer and inbound lookahead limits.
let (non_finalized_block_write_sender, non_finalized_block_write_receiver) =
tokio::sync::mpsc::unbounded_channel();
let (finalized_block_write_sender, finalized_block_write_receiver) =
tokio::sync::mpsc::unbounded_channel();
let (invalid_block_reset_sender, invalid_block_reset_receiver) =
tokio::sync::mpsc::unbounded_channel();
let finalized_state_for_writing = finalized_state.clone();
let chain_tip_sender_for_writing = chain_tip_sender.clone();
let block_write_task = std::thread::spawn(move || {
write::write_blocks_from_channels(
finalized_block_write_receiver,
non_finalized_block_write_receiver,
finalized_state_for_writing,
invalid_block_reset_sender,
chain_tip_sender_for_writing,
)
});
let block_write_task = Arc::new(block_write_task);
let (read_service, non_finalized_state_sender) =
ReadStateService::new(&finalized_state, block_write_task);
let queued_non_finalized_blocks = QueuedBlocks::default();
let pending_utxos = PendingUtxos::default();
let last_block_hash_sent = finalized_state.db.finalized_tip_hash();
let state = Self {
network,
queued_non_finalized_blocks,
queued_finalized_blocks: HashMap::new(),
mem,
disk,
mem: non_finalized_state,
disk: finalized_state,
non_finalized_block_write_sender: Some(non_finalized_block_write_sender),
finalized_block_write_sender: Some(finalized_block_write_sender),
last_block_hash_sent,
invalid_block_reset_receiver,
pending_utxos,
last_prune: Instant::now(),
chain_tip_sender,
non_finalized_state_sender,
read_service: read_service.clone(),
max_queued_height: f64::NAN,
max_queued_finalized_height: f64::NAN,
};
timer.finish(module_path!(), line!(), "initializing state service");
@ -256,7 +388,7 @@ impl StateService {
state.network,
MAX_LEGACY_CHAIN_BLOCKS,
) {
let legacy_db_path = state.disk.path().to_path_buf();
let legacy_db_path = state.read_service.db.path().to_path_buf();
panic!(
"Cached state contains a legacy chain.\n\
An outdated Zebra version did not know about a recent network upgrade,\n\
@ -275,75 +407,147 @@ impl StateService {
}
/// Queue a finalized block for verification and storage in the finalized state.
///
/// Returns a channel receiver that provides the result of the block commit.
fn queue_and_commit_finalized(
&mut self,
finalized: FinalizedBlock,
) -> oneshot::Receiver<Result<block::Hash, BoxError>> {
// # Correctness & Performance
//
// This method must not block, access the database, or perform CPU-intensive tasks,
// because it is called directly from the tokio executor's Future threads.
let queued_prev_hash = finalized.block.header.previous_block_hash;
let queued_height = finalized.height;
let (rsp_tx, rsp_rx) = oneshot::channel();
let queued = (finalized, rsp_tx);
// TODO: move this code into the state block commit task:
// - queue_and_commit_finalized()'s commit_finalized() call becomes a send to the block commit channel
// - run commit_finalized() in the state block commit task
// - run the metrics update in queue_and_commit_finalized() in the block commit task
// - run the set_finalized_tip() in this function in the state block commit task
// - move all that code to the inner service
let tip_block = self
.drain_queue_and_commit_finalized((finalized, rsp_tx))
.map(ChainTipBlock::from);
self.chain_tip_sender.set_finalized_tip(tip_block);
rsp_rx
}
/// Queue a finalized block to be committed to the state.
///
/// After queueing a finalized block, this method checks whether the newly
/// queued block (and any of its descendants) can be committed to the state.
///
/// Returns the highest finalized tip block committed from the queue,
/// or `None` if no blocks were committed in this call.
/// (Use `tip_block` to get the finalized tip, regardless of when it was committed.)
pub fn drain_queue_and_commit_finalized(
&mut self,
queued: QueuedFinalized,
) -> Option<FinalizedBlock> {
let mut highest_queue_commit = None;
let prev_hash = queued.0.block.header.previous_block_hash;
let height = queued.0.height;
self.queued_finalized_blocks.insert(prev_hash, queued);
while let Some(queued_block) = self
.queued_finalized_blocks
.remove(&self.disk.db().finalized_tip_hash())
{
if let Ok(finalized) = self.disk.commit_finalized(queued_block) {
highest_queue_commit = Some(finalized);
} else {
// the last block in the queue failed, so we can't commit the next block
break;
if self.finalized_block_write_sender.is_some() {
// We're still committing finalized blocks
if let Some(duplicate_queued) = self
.queued_finalized_blocks
.insert(queued_prev_hash, queued)
{
Self::send_finalized_block_error(
duplicate_queued,
"dropping older finalized block: got newer duplicate block",
);
}
self.drain_queue_and_commit_finalized();
} else {
// We've finished committing finalized blocks, so drop any repeated queued blocks,
// and return an error.
//
// TODO: track the latest sent height, and drop any blocks under that height
// every time we send some blocks (like QueuedNonFinalizedBlocks)
Self::send_finalized_block_error(
queued,
"already finished committing finalized blocks: dropped duplicate block, \
block is already committed to the state",
);
self.clear_finalized_block_queue(
"already finished committing finalized blocks: dropped duplicate block, \
block is already committed to the state",
);
}
if self.queued_finalized_blocks.is_empty() {
self.max_queued_height = f64::NAN;
} else if self.max_queued_height.is_nan() || self.max_queued_height < height.0 as f64 {
self.max_queued_finalized_height = f64::NAN;
} else if self.max_queued_finalized_height.is_nan()
|| self.max_queued_finalized_height < queued_height.0 as f64
{
// if there are still blocks in the queue, then either:
// - the new block was lower than the old maximum, and there was a gap before it,
// so the maximum is still the same (and we skip this code), or
// - the new block is higher than the old maximum, and there is at least one gap
// between the finalized tip and the new maximum
self.max_queued_height = height.0 as f64;
self.max_queued_finalized_height = queued_height.0 as f64;
}
metrics::gauge!("state.checkpoint.queued.max.height", self.max_queued_height);
metrics::gauge!(
"state.checkpoint.queued.max.height",
self.max_queued_finalized_height
);
metrics::gauge!(
"state.checkpoint.queued.block.count",
self.queued_finalized_blocks.len() as f64,
);
highest_queue_commit
rsp_rx
}
/// Finds queued finalized blocks to be committed to the state in order,
/// removes them from the queue, and sends them to the block commit task.
///
/// After queueing a finalized block, this method checks whether the newly
/// queued block (and any of its descendants) can be committed to the state.
///
/// Returns an error if the block commit channel has been closed.
pub fn drain_queue_and_commit_finalized(&mut self) {
use tokio::sync::mpsc::error::{SendError, TryRecvError};
// # Correctness & Performance
//
// This method must not block, access the database, or perform CPU-intensive tasks,
// because it is called directly from the tokio executor's Future threads.
// If a block failed, we need to start again from a valid tip.
match self.invalid_block_reset_receiver.try_recv() {
Ok(reset_tip_hash) => self.last_block_hash_sent = reset_tip_hash,
Err(TryRecvError::Disconnected) => {
info!("Block commit task closed the block reset channel. Is Zebra shutting down?");
return;
}
// There are no errors, so we can just use the last block hash we sent
Err(TryRecvError::Empty) => {}
}
while let Some(queued_block) = self
.queued_finalized_blocks
.remove(&self.last_block_hash_sent)
{
self.last_block_hash_sent = queued_block.0.hash;
// If we've finished sending finalized blocks, ignore any repeated blocks.
// (Blocks can be repeated after a syncer reset.)
if let Some(finalized_block_write_sender) = &self.finalized_block_write_sender {
let send_result = finalized_block_write_sender.send(queued_block);
// If the receiver is closed, we can't send any more blocks.
if let Err(SendError(queued)) = send_result {
// If Zebra is shutting down, drop blocks and return an error.
Self::send_finalized_block_error(
queued,
"block commit task exited. Is Zebra shutting down?",
);
self.clear_finalized_block_queue(
"block commit task exited. Is Zebra shutting down?",
);
};
}
}
}
/// Drops all queued finalized blocks, and sends an error on their result channels.
fn clear_finalized_block_queue(&mut self, error: impl Into<BoxError> + Clone) {
for (_hash, queued) in self.queued_finalized_blocks.drain() {
Self::send_finalized_block_error(queued, error.clone());
}
}
/// Send an error on a `QueuedFinalized` block's result channel, and drop the block
fn send_finalized_block_error(queued: QueuedFinalized, error: impl Into<BoxError>) {
let (finalized, rsp_tx) = queued;
// The block sender might have already given up on this block,
// so ignore any channel send errors.
let _ = rsp_tx.send(Err(error.into()));
std::mem::drop(finalized);
}
/// Queue a non finalized block for verification and check if any queued
@ -362,7 +566,7 @@ impl StateService {
let parent_hash = prepared.block.header.previous_block_hash;
if self.mem.any_chain_contains(&prepared.hash)
|| self.disk.db().hash(prepared.height).is_some()
|| self.read_service.db.hash(prepared.height).is_some()
{
let (rsp_tx, rsp_rx) = oneshot::channel();
let _ = rsp_tx.send(Err("block is already committed to the state".into()));
@ -386,6 +590,31 @@ impl StateService {
rsp_rx
};
// We've finished sending finalized blocks when:
// - we've sent the finalized block for the last checkpoint, and
// - it has been successfully written to disk.
//
// We detect the last checkpoint by looking for non-finalized blocks
// that are children of the last block we sent.
//
// TODO: configure the state with the last checkpoint hash instead?
if self.finalized_block_write_sender.is_some()
&& self
.queued_non_finalized_blocks
.has_queued_children(self.last_block_hash_sent)
&& self.read_service.db.finalized_tip_hash() == self.last_block_hash_sent
{
// Tell the block write task to stop committing finalized blocks,
// and move on to committing non-finalized blocks.
std::mem::drop(self.finalized_block_write_sender.take());
// We've finished committing finalized blocks, so drop any repeated queued blocks.
self.clear_finalized_block_queue(
"already finished committing finalized blocks: dropped duplicate block, \
block is already committed to the state",
);
}
// TODO: avoid a temporary verification failure that can happen
// if the first non-finalized block arrives before the last finalized block is committed
// (#5125)
@ -411,7 +640,7 @@ impl StateService {
);
}
let finalized_tip_height = self.disk.db().finalized_tip_height().expect(
let finalized_tip_height = self.read_service.db.finalized_tip_height().expect(
"Finalized state must have at least one block before committing non-finalized state",
);
self.queued_non_finalized_blocks
@ -447,6 +676,9 @@ impl StateService {
/// non-finalized state is empty.
///
/// [1]: non_finalized_state::Chain
//
// TODO: remove this clippy allow when we remove self.chain_tip_sender
#[allow(clippy::unwrap_in_result)]
#[instrument(level = "debug", skip(self))]
fn update_latest_chain_channels(&mut self) -> Option<block::Height> {
let best_chain = self.mem.best_chain();
@ -459,7 +691,10 @@ impl StateService {
// If the final receiver was just dropped, ignore the error.
let _ = self.non_finalized_state_sender.send(self.mem.clone());
self.chain_tip_sender.set_best_non_finalized_tip(tip_block);
self.chain_tip_sender
.lock()
.expect("unexpected panic in block commit task or state")
.set_best_non_finalized_tip(tip_block);
tip_block_height
}
@ -471,10 +706,10 @@ impl StateService {
self.check_contextual_validity(&prepared)?;
let parent_hash = prepared.block.header.previous_block_hash;
if self.disk.db().finalized_tip_hash() == parent_hash {
self.mem.commit_new_chain(prepared, self.disk.db())?;
if self.disk.db.finalized_tip_hash() == parent_hash {
self.mem.commit_new_chain(prepared, &self.disk.db)?;
} else {
self.mem.commit_block(prepared, self.disk.db())?;
self.mem.commit_block(prepared, &self.disk.db)?;
}
Ok(())
@ -482,7 +717,7 @@ impl StateService {
/// Returns `true` if `hash` is a valid previous block hash for new non-finalized blocks.
fn can_fork_chain_at(&self, hash: &block::Hash) -> bool {
self.mem.any_chain_contains(hash) || &self.disk.db().finalized_tip_hash() == hash
self.mem.any_chain_contains(hash) || &self.read_service.db.finalized_tip_hash() == hash
}
/// Attempt to validate and commit all queued blocks whose parents have
@ -547,25 +782,25 @@ impl StateService {
check::block_is_valid_for_recent_chain(
prepared,
self.network,
self.disk.db().finalized_tip_height(),
self.disk.db.finalized_tip_height(),
relevant_chain,
)?;
check::nullifier::no_duplicates_in_finalized_chain(prepared, self.disk.db())?;
check::nullifier::no_duplicates_in_finalized_chain(prepared, &self.disk.db)?;
Ok(())
}
/// Return the tip of the current best chain.
pub fn best_tip(&self) -> Option<(block::Height, block::Hash)> {
self.mem.best_tip().or_else(|| self.disk.db().tip())
self.mem.best_tip().or_else(|| self.read_service.db.tip())
}
/// Return the height for the block at `hash` in any chain.
pub fn any_height_by_hash(&self, hash: block::Hash) -> Option<block::Height> {
self.mem
.any_height_by_hash(hash)
.or_else(|| self.disk.db().height(hash))
.or_else(|| self.read_service.db.height(hash))
}
/// Return an iterator over the relevant chain of the block identified by
@ -593,18 +828,23 @@ impl StateService {
}
impl ReadStateService {
/// Creates a new read-only state service, using the provided finalized state.
/// Creates a new read-only state service, using the provided finalized state and
/// block write task handle.
///
/// Returns the newly created service,
/// and a watch channel for updating the shared recent non-finalized chain.
pub(crate) fn new(disk: &FinalizedState) -> (Self, watch::Sender<NonFinalizedState>) {
pub(crate) fn new(
finalized_state: &FinalizedState,
block_write_task: Arc<std::thread::JoinHandle<()>>,
) -> (Self, watch::Sender<NonFinalizedState>) {
let (non_finalized_state_sender, non_finalized_state_receiver) =
watch::channel(NonFinalizedState::new(disk.network()));
watch::channel(NonFinalizedState::new(finalized_state.network()));
let read_service = Self {
network: disk.network(),
db: disk.db().clone(),
network: finalized_state.network(),
db: finalized_state.db.clone(),
non_finalized_state_receiver: WatchReceiver::new(non_finalized_state_receiver),
block_write_task: Some(block_write_task),
};
tracing::info!("created new read-only state service");
@ -619,7 +859,11 @@ impl Service<Request> for StateService {
type Future =
Pin<Box<dyn Future<Output = Result<Self::Response, Self::Error>> + Send + 'static>>;
fn poll_ready(&mut self, _: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
fn poll_ready(&mut self, cx: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
// Check for panics in the block write task
let poll = self.read_service.poll_ready(cx);
// Prune outdated UTXO requests
let now = Instant::now();
if self.last_prune + Self::PRUNE_INTERVAL < now {
@ -646,7 +890,7 @@ impl Service<Request> for StateService {
}
}
Poll::Ready(Ok(()))
poll
}
#[instrument(name = "state", skip(self, req))]
@ -679,6 +923,10 @@ impl Service<Request> for StateService {
span.in_scope(|| self.queue_and_commit_non_finalized(prepared))
});
// TODO:
// - check for panics in the block write task here,
// as well as in poll_ready()
// The work is all done, the future just waits on a channel for the result
timer.finish(module_path!(), line!(), "CommitBlock");
@ -700,7 +948,7 @@ impl Service<Request> for StateService {
}
// Uses queued_finalized_blocks and pending_utxos in the StateService.
// Accesses shared writeable state in the StateService and ZebraDb.
// Accesses shared writeable state in the StateService.
Request::CommitFinalizedBlock(finalized) => {
let timer = CodeTimer::start();
@ -716,14 +964,13 @@ impl Service<Request> for StateService {
// # Performance
//
// Allow other async tasks to make progress while blocks are being verified
// and written to disk.
//
// See the note in `CommitBlock` for more details.
let span = Span::current();
let rsp_rx = tokio::task::block_in_place(move || {
span.in_scope(|| self.queue_and_commit_finalized(finalized))
});
// This method doesn't block, access the database, or perform CPU-intensive tasks,
// so we can run it directly in the tokio executor's Future threads.
let rsp_rx = self.queue_and_commit_finalized(finalized);
// TODO:
// - check for panics in the block write task here,
// as well as in poll_ready()
// The work is all done, the future just waits on a channel for the result
timer.finish(module_path!(), line!(), "CommitFinalizedBlock");
@ -847,6 +1094,27 @@ impl Service<ReadRequest> for ReadStateService {
Pin<Box<dyn Future<Output = Result<Self::Response, Self::Error>> + Send + 'static>>;
fn poll_ready(&mut self, _: &mut Context<'_>) -> Poll<Result<(), Self::Error>> {
// Check for panics in the block write task
let block_write_task = self.block_write_task.take();
if let Some(block_write_task) = block_write_task {
if block_write_task.is_finished() {
match Arc::try_unwrap(block_write_task) {
// We are the last state with a reference to this task, so we can propagate any panics
Ok(block_write_task_handle) => {
if let Err(thread_panic) = block_write_task_handle.join() {
std::panic::resume_unwind(thread_panic);
}
}
// We're not the last state, so we need to put it back
Err(arc_block_write_task) => self.block_write_task = Some(arc_block_write_task),
}
} else {
// It hasn't finished, so we need to put it back
self.block_write_task = Some(block_write_task);
}
}
Poll::Ready(Ok(()))
}

View File

@ -1,6 +1,6 @@
//! Arbitrary data generation and test setup for Zebra's state.
use std::sync::Arc;
use std::{sync::Arc, time::Duration};
use futures::{stream::FuturesUnordered, StreamExt};
use proptest::{
@ -9,11 +9,12 @@ use proptest::{
strategy::{NewTree, ValueTree},
test_runner::TestRunner,
};
use tokio::time::timeout;
use tower::{buffer::Buffer, util::BoxService, Service, ServiceExt};
use zebra_chain::{
block::Block,
fmt::SummaryDebug,
fmt::{humantime_seconds, SummaryDebug},
history_tree::HistoryTree,
parameters::{Network, NetworkUpgrade},
LedgerState,
@ -27,6 +28,9 @@ use crate::{
pub use zebra_chain::block::arbitrary::MAX_PARTIAL_CHAIN_BLOCKS;
/// How long we wait for chain tip updates before skipping them.
pub const CHAIN_TIP_UPDATE_WAIT_LIMIT: Duration = Duration::from_secs(2);
#[derive(Debug)]
pub struct PreparedChainTree {
chain: Arc<SummaryDebug<Vec<PreparedBlock>>>,
@ -197,7 +201,7 @@ pub async fn populated_state(
.into_iter()
.map(|block| Request::CommitFinalizedBlock(block.into()));
let (state, read_state, latest_chain_tip, chain_tip_change) =
let (state, read_state, latest_chain_tip, mut chain_tip_change) =
StateService::new(Config::ephemeral(), network);
let mut state = Buffer::new(BoxService::new(state), 1);
@ -209,7 +213,24 @@ pub async fn populated_state(
}
while let Some(rsp) = responses.next().await {
rsp.expect("blocks should commit just fine");
// Wait for the block result and the chain tip update,
// which both happen in a separate thread from this one.
rsp.expect("unexpected block commit failure");
// Wait for the chain tip update
if let Err(timeout_error) = timeout(
CHAIN_TIP_UPDATE_WAIT_LIMIT,
chain_tip_change.wait_for_tip_change(),
)
.await
.map(|change_result| change_result.expect("unexpected chain tip update failure"))
{
info!(
timeout = ?humantime_seconds(CHAIN_TIP_UPDATE_WAIT_LIMIT),
?timeout_error,
"timeout waiting for chain tip change after committing block"
);
}
}
(state, read_state, latest_chain_tip, chain_tip_change)

View File

@ -49,7 +49,7 @@ impl Iter<'_> {
IterState::Finished => unreachable!(),
};
if let Some(block) = service.disk.db().block(hash_or_height) {
if let Some(block) = service.read_service.db.block(hash_or_height) {
let height = block
.coinbase_height()
.expect("valid blocks have a coinbase height");

View File

@ -17,7 +17,6 @@
use std::{
io::{stderr, stdout, Write},
path::Path,
sync::Arc,
};
@ -46,8 +45,11 @@ pub(super) use zebra_db::ZebraDb;
/// The finalized part of the chain state, stored in the db.
///
/// `rocksdb` allows concurrent writes through a shared reference,
/// so finalized state instances are cloneable. When the final clone is dropped,
/// the database is closed.
/// so clones of the finalized state represent the same database instance.
/// When the final clone is dropped, the database is closed.
///
/// This is different from `NonFinalizedState::clone()`,
/// which returns an independent copy of the chains.
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct FinalizedState {
// Configuration
@ -72,7 +74,7 @@ pub struct FinalizedState {
/// `rocksdb` allows reads and writes via a shared reference,
/// so this database object can be freely cloned.
/// The last instance that is dropped will close the underlying database.
db: ZebraDb,
pub db: ZebraDb,
}
impl FinalizedState {
@ -134,29 +136,19 @@ impl FinalizedState {
self.network
}
/// Returns the `Path` where the files used by this database are located.
pub fn path(&self) -> &Path {
self.db.path()
}
/// Returns a reference to the inner database instance.
pub(crate) fn db(&self) -> &ZebraDb {
&self.db
}
/// Commit a finalized block to the state.
///
/// It's the caller's responsibility to ensure that blocks are committed in
/// order.
pub fn commit_finalized(
&mut self,
queued_block: QueuedFinalized,
) -> Result<FinalizedBlock, ()> {
let (finalized, rsp_tx) = queued_block;
ordered_block: QueuedFinalized,
) -> Result<FinalizedBlock, BoxError> {
let (finalized, rsp_tx) = ordered_block;
let result =
self.commit_finalized_direct(finalized.clone().into(), "CommitFinalized request");
let block_result = if result.is_ok() {
if result.is_ok() {
metrics::counter!("state.checkpoint.finalized.block.count", 1);
metrics::gauge!(
"state.checkpoint.finalized.block.height",
@ -171,21 +163,23 @@ impl FinalizedState {
finalized.height.0 as f64,
);
metrics::counter!("zcash.chain.verified.block.total", 1);
Ok(finalized)
} else {
metrics::counter!("state.checkpoint.error.block.count", 1);
metrics::gauge!(
"state.checkpoint.error.block.height",
finalized.height.0 as f64,
);
Err(())
};
let _ = rsp_tx.send(result.map_err(Into::into));
// Some io errors can't be cloned, so we format them instead.
let owned_result = result
.as_ref()
.map(|_hash| finalized)
.map_err(|error| format!("{:?}", error).into());
block_result
let _ = rsp_tx.send(result);
owned_result
}
/// Immediately commit a `finalized` block to the finalized state.

View File

@ -14,7 +14,7 @@ impl Deref for FinalizedState {
type Target = ZebraDb;
fn deref(&self) -> &Self::Target {
self.db()
&self.db
}
}

View File

@ -643,23 +643,49 @@ impl DiskDb {
/// It should only be used in debugging or test code, immediately before a manual shutdown.
///
/// TODO: make private after the stop height check has moved to the syncer (#3442)
/// move shutting down the database to a blocking thread (#2188),
/// and remove `force` and the manual flush
/// move shutting down the database to a blocking thread (#2188)
pub(crate) fn shutdown(&mut self, force: bool) {
// Prevent a race condition where another thread clones the Arc,
// right after we've checked we're the only holder of the Arc.
// # Correctness
//
// There is still a small race window after the guard is dropped,
// but if the race happens, it will only cause database errors during shutdown.
let clone_prevention_guard = Arc::get_mut(&mut self.db);
// If we're the only owner of the shared database instance,
// then there are no other threads that can increase the strong or weak count.
//
// ## Implementation Requirements
//
// This function and all functions that it calls should avoid cloning the shared database
// instance. If they do, they must drop it before:
// - shutting down database threads, or
// - deleting database files.
let shared_database_owners = Arc::strong_count(&self.db) + Arc::weak_count(&self.db);
if clone_prevention_guard.is_none() && !force {
debug!(
"dropping cloned DiskDb, \
but keeping shared database until the last reference is dropped",
);
if shared_database_owners > 1 {
let path = self.path();
return;
let mut ephemeral_note = "";
if force {
if self.ephemeral {
ephemeral_note = " and removing ephemeral files";
}
info!(
?path,
"forcing shutdown{} of a state database with multiple active instances",
ephemeral_note,
);
} else {
if self.ephemeral {
ephemeral_note = " and files";
}
debug!(
?path,
"dropping DiskDb clone, \
but keeping shared database instance{} until the last reference is dropped",
ephemeral_note,
);
return;
}
}
self.assert_default_cf_is_empty();
@ -670,17 +696,29 @@ impl DiskDb {
// - the database flushes regularly anyway
// - Zebra commits each block in a database transaction, any incomplete blocks get rolled back
// - ephemeral files are placed in the os temp dir and should be cleaned up automatically eventually
info!("flushing database to disk");
self.db.flush().expect("flush is successful");
let path = self.path();
info!(?path, "flushing database to disk");
self.db
.flush()
.expect("unexpected failure flushing SST data to disk");
self.db
.flush_wal(true)
.expect("unexpected failure flushing WAL data to disk");
// But we should call `cancel_all_background_work` before Zebra exits.
// If we don't, we see these kinds of errors:
// We'd like to call `cancel_all_background_work()` before Zebra exits,
// but when we call it, we get memory, thread, or C++ errors when the process exits.
// (This seems to be a bug in RocksDB: cancel_all_background_work() should wait until
// all the threads have cleaned up.)
//
// We see these kinds of errors:
// ```
// pthread lock: Invalid argument
// pure virtual method called
// terminate called without an active exception
// pthread destroy mutex: Device or resource busy
// Aborted (core dumped)
// signal: 6, SIGABRT: process abort signal
// signal: 11, SIGSEGV: invalid memory reference
// ```
//
// The RocksDB wiki says:
@ -690,8 +728,8 @@ impl DiskDb {
// > You can speed up the waiting by calling CancelAllBackgroundWork().
//
// https://github.com/facebook/rocksdb/wiki/RocksDB-FAQ
info!("stopping background database tasks");
self.db.cancel_all_background_work(true);
//info!(?path, "stopping background database tasks");
//self.db.cancel_all_background_work(true);
// We'd like to drop the database before deleting its files,
// because that closes the column families and the database correctly.
@ -705,57 +743,52 @@ impl DiskDb {
//
// https://github.com/facebook/rocksdb/wiki/Known-Issues
//
// But our current code doesn't seem to cause any issues.
// We might want to explicitly drop the database as part of graceful shutdown (#1678).
self.delete_ephemeral(force);
// But this implementation doesn't seem to cause any issues,
// and the RocksDB Drop implementation handles any cleanup.
self.delete_ephemeral();
}
/// If the database is `ephemeral`, delete it.
///
/// If `force` is true, clean up regardless of any shared references.
/// `force` can cause errors accessing the database from other shared references.
/// It should only be used in debugging or test code, immediately before a manual shutdown.
fn delete_ephemeral(&mut self, force: bool) {
/// If the database is `ephemeral`, delete its files.
fn delete_ephemeral(&mut self) {
// # Correctness
//
// This function and all functions that it calls should avoid cloning the shared database
// instance. See `shutdown()` for details.
if !self.ephemeral {
return;
}
// Prevent a race condition where another thread clones the Arc,
// right after we've checked we're the only holder of the Arc.
//
// There is still a small race window after the guard is dropped,
// but if the race happens, it will only cause database errors during shutdown.
let clone_prevention_guard = Arc::get_mut(&mut self.db);
if clone_prevention_guard.is_none() && !force {
debug!(
"dropping cloned DiskDb, \
but keeping shared database files until the last reference is dropped",
);
return;
}
let path = self.path();
info!(cache_path = ?path, "removing temporary database files");
info!(?path, "removing temporary database files");
// We'd like to use `rocksdb::Env::mem_env` for ephemeral databases,
// but the Zcash blockchain might not fit in memory. So we just
// delete the database files instead.
//
// We'd like to call `DB::destroy` here, but calling destroy on a
// We'd also like to call `DB::destroy` here, but calling destroy on a
// live DB is undefined behaviour:
// https://github.com/facebook/rocksdb/wiki/RocksDB-FAQ#basic-readwrite
//
// So we assume that all the database files are under `path`, and
// delete them using standard filesystem APIs. Deleting open files
// might cause errors on non-Unix platforms, so we ignore the result.
// (The OS will delete them eventually anyway.)
let res = std::fs::remove_dir_all(path);
// (The OS will delete them eventually anyway, if they are in a temporary directory.)
let result = std::fs::remove_dir_all(path);
// TODO: downgrade to debug once bugs like #2905 are fixed
// but leave any errors at "info" level
info!(?res, "removed temporary database files");
if result.is_err() {
info!(
?result,
?path,
"removing temporary database files caused an error",
);
} else {
debug!(
?result,
?path,
"successfully removed temporary database files",
);
}
}
/// Check that the "default" column family is empty.
@ -764,6 +797,11 @@ impl DiskDb {
///
/// If Zebra has a bug where it is storing data in the wrong column family.
fn assert_default_cf_is_empty(&self) {
// # Correctness
//
// This function and all functions that it calls should avoid cloning the shared database
// instance. See `shutdown()` for details.
if let Some(default_cf) = self.cf_handle("default") {
assert!(
self.zs_is_empty(&default_cf),
@ -775,6 +813,9 @@ impl DiskDb {
impl Drop for DiskDb {
fn drop(&mut self) {
let path = self.path();
debug!(?path, "dropping DiskDb instance");
self.shutdown(false);
}
}

View File

@ -30,7 +30,13 @@ mod tests;
pub(crate) use chain::Chain;
/// The state of the chains in memory, including queued blocks.
#[derive(Debug, Clone)]
///
/// Clones of the non-finalized state contain independent copies of the chains.
/// This is different from `FinalizedState::clone()`,
/// which returns a shared reference to the database.
///
/// Most chain data is clone-on-write using [`Arc`].
#[derive(Clone, Debug)]
pub struct NonFinalizedState {
/// Verified, non-finalized chains, in ascending order.
///

View File

@ -1,5 +1,6 @@
use std::collections::HashMap;
use std::future::Future;
//! Pending UTXO tracker for [`AwaitUtxo` requests](crate::Request::AwaitUtxo).
use std::{collections::HashMap, future::Future};
use tokio::sync::broadcast;

View File

@ -77,6 +77,12 @@ impl QueuedBlocks {
self.update_metrics();
}
/// Returns `true` if there are any queued children of `parent_hash`.
#[instrument(skip(self), fields(%parent_hash))]
pub fn has_queued_children(&self, parent_hash: block::Hash) -> bool {
self.by_parent.contains_key(&parent_hash)
}
/// Dequeue and return all blocks that were waiting for the arrival of
/// `parent`.
#[instrument(skip(self), fields(%parent_hash))]

View File

@ -2,8 +2,9 @@
//!
//! In the functions in this module:
//!
//! The StateService commits blocks to the finalized state before updating
//! `chain` from the latest chain. Then it can commit additional blocks to
//! The block write task commits blocks to the finalized state before updating
//! `chain` with a cached copy of the best non-finalized chain from
//! `NonFinalizedState.chain_set`. Then the block commit task can commit additional blocks to
//! the finalized state after we've cloned the `chain`.
//!
//! This means that some blocks can be in both:

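The module doc above (repeated for each read submodule below) describes an overlap: a block may be in both the cached non-finalized `chain` and the finalized database. As a minimal sketch under placeholder types (not Zebra's API), a read that tolerates this overlap simply prefers the cached chain and falls back to the database:

```rust
// Placeholder types, illustration only. If a block is in both places, the two
// copies are identical, so preferring the newer cached chain is always correct.
use std::collections::BTreeMap;

struct CachedChain {
    blocks_by_height: BTreeMap<u32, Vec<u8>>,
}

struct FinalizedDb {
    blocks_by_height: BTreeMap<u32, Vec<u8>>,
}

fn block_at_height(
    chain: Option<&CachedChain>,
    db: &FinalizedDb,
    height: u32,
) -> Option<Vec<u8>> {
    chain
        .and_then(|chain| chain.blocks_by_height.get(&height).cloned())
        .or_else(|| db.blocks_by_height.get(&height).cloned())
}
```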
View File

@ -2,8 +2,9 @@
//!
//! In the functions in this module:
//!
//! The StateService commits blocks to the finalized state before updating
//! `chain` from the latest chain. Then it can commit additional blocks to
//! The block write task commits blocks to the finalized state before updating
//! `chain` with a cached copy of the best non-finalized chain from
//! `NonFinalizedState.chain_set`. Then the block commit task can commit additional blocks to
//! the finalized state after we've cloned the `chain`.
//!
//! This means that some blocks can be in both:

View File

@ -2,8 +2,9 @@
//!
//! In the functions in this module:
//!
//! The StateService commits blocks to the finalized state before updating
//! `chain` from the latest chain. Then it can commit additional blocks to
//! The block write task commits blocks to the finalized state before updating
//! `chain` with a cached copy of the best non-finalized chain from
//! `NonFinalizedState.chain_set`. Then the block commit task can commit additional blocks to
//! the finalized state after we've cloned the `chain`.
//!
//! This means that some blocks can be in both:

View File

@ -2,8 +2,9 @@
//!
//! In the functions in this module:
//!
//! The StateService commits blocks to the finalized state before updating
//! `chain` or `non_finalized_state` from the latest chains. Then it can
//! The block write task commits blocks to the finalized state before updating
//! `chain` or `non_finalized_state` with a cached copy of the non-finalized chains
//! in `NonFinalizedState.chain_set`. Then the block commit task can
//! commit additional blocks to the finalized state after we've cloned the
//! `chain` or `non_finalized_state`.
//!

View File

@ -2,8 +2,9 @@
//!
//! In the functions in this module:
//!
//! The StateService commits blocks to the finalized state before updating
//! `chain` from the latest chain. Then it can commit additional blocks to
//! The block write task commits blocks to the finalized state before updating
//! `chain` with a cached copy of the best non-finalized chain from
//! `NonFinalizedState.chain_set`. Then the block commit task can commit additional blocks to
//! the finalized state after we've cloned the `chain`.
//!
//! This means that some blocks can be in both:

View File

@ -2,8 +2,9 @@
//!
//! In the functions in this module:
//!
//! The StateService commits blocks to the finalized state before updating
//! `chain` from the latest chain. Then it can commit additional blocks to
//! The block write task commits blocks to the finalized state before updating
//! `chain` with a cached copy of the best non-finalized chain from
//! `NonFinalizedState.chain_set`. Then the block commit task can commit additional blocks to
//! the finalized state after we've cloned the `chain`.
//!
//! This means that some blocks can be in both:

View File

@ -2,7 +2,7 @@
//!
//! TODO: move these tests into tests::vectors and tests::prop modules.
use std::{env, sync::Arc};
use std::{env, sync::Arc, time::Duration};
use tower::{buffer::Buffer, util::BoxService};
@ -386,59 +386,6 @@ proptest! {
prop_assert_eq!(response, Ok(()));
}
/// Test that the best tip height is updated accordingly.
///
/// 1. Generate a finalized chain and some non-finalized blocks.
/// 2. Check that initially the best tip height is empty.
/// 3. Commit the finalized blocks and check that the best tip height is updated accordingly.
/// 4. Commit the non-finalized blocks and check that the best tip height is also updated
/// accordingly.
#[test]
fn chain_tip_sender_is_updated(
(network, finalized_blocks, non_finalized_blocks)
in continuous_empty_blocks_from_test_vectors(),
) {
let _init_guard = zebra_test::init();
let (mut state_service, _read_only_state_service, latest_chain_tip, mut chain_tip_change) = StateService::new(Config::ephemeral(), network);
prop_assert_eq!(latest_chain_tip.best_tip_height(), None);
prop_assert_eq!(chain_tip_change.last_tip_change(), None);
for block in finalized_blocks {
let expected_block = block.clone();
let expected_action = if expected_block.height <= block::Height(1) {
// 0: reset by both initialization and the Genesis network upgrade
// 1: reset by the BeforeOverwinter network upgrade
TipAction::reset_with(expected_block.clone().into())
} else {
TipAction::grow_with(expected_block.clone().into())
};
state_service.queue_and_commit_finalized(block);
prop_assert_eq!(latest_chain_tip.best_tip_height(), Some(expected_block.height));
prop_assert_eq!(chain_tip_change.last_tip_change(), Some(expected_action));
}
for block in non_finalized_blocks {
let expected_block = block.clone();
let expected_action = if expected_block.height == block::Height(1) {
// 1: reset by the BeforeOverwinter network upgrade
TipAction::reset_with(expected_block.clone().into())
} else {
TipAction::grow_with(expected_block.clone().into())
};
state_service.queue_and_commit_non_finalized(block);
prop_assert_eq!(latest_chain_tip.best_tip_height(), Some(expected_block.height));
prop_assert_eq!(chain_tip_change.last_tip_change(), Some(expected_action));
}
}
/// Test that the value pool is updated accordingly.
///
/// 1. Generate a finalized chain and some non-finalized blocks.
@ -476,7 +423,10 @@ proptest! {
expected_finalized_value_pool += *block_value_pool;
}
state_service.queue_and_commit_finalized(block.clone());
let result_receiver = state_service.queue_and_commit_finalized(block.clone());
let result = result_receiver.blocking_recv();
prop_assert!(result.is_ok(), "unexpected failed finalized block commit: {:?}", result);
prop_assert_eq!(
state_service.disk.finalized_value_pool(),
@ -499,7 +449,10 @@ proptest! {
let block_value_pool = &block.block.chain_value_pool_change(&transparent::utxos_from_ordered_utxos(utxos))?;
expected_non_finalized_value_pool += *block_value_pool;
state_service.queue_and_commit_non_finalized(block.clone());
let result_receiver = state_service.queue_and_commit_non_finalized(block.clone());
let result = result_receiver.blocking_recv();
prop_assert!(result.is_ok(), "unexpected failed non-finalized block commit: {:?}", result);
prop_assert_eq!(
state_service.mem.best_chain().unwrap().chain_value_pools,
@ -518,6 +471,80 @@ proptest! {
}
}
// This test sleeps for every block, so we only ever want to run it once
proptest! {
#![proptest_config(
proptest::test_runner::Config::with_cases(1)
)]
/// Test that the best tip height is updated accordingly.
///
/// 1. Generate a finalized chain and some non-finalized blocks.
/// 2. Check that initially the best tip height is empty.
/// 3. Commit the finalized blocks and check that the best tip height is updated accordingly.
/// 4. Commit the non-finalized blocks and check that the best tip height is also updated
/// accordingly.
#[test]
fn chain_tip_sender_is_updated(
(network, finalized_blocks, non_finalized_blocks)
in continuous_empty_blocks_from_test_vectors(),
) {
let _init_guard = zebra_test::init();
let (mut state_service, _read_only_state_service, latest_chain_tip, mut chain_tip_change) = StateService::new(Config::ephemeral(), network);
prop_assert_eq!(latest_chain_tip.best_tip_height(), None);
prop_assert_eq!(chain_tip_change.last_tip_change(), None);
for block in finalized_blocks {
let expected_block = block.clone();
let expected_action = if expected_block.height <= block::Height(1) {
// 0: reset by both initialization and the Genesis network upgrade
// 1: reset by the BeforeOverwinter network upgrade
TipAction::reset_with(expected_block.clone().into())
} else {
TipAction::grow_with(expected_block.clone().into())
};
let result_receiver = state_service.queue_and_commit_finalized(block);
let result = result_receiver.blocking_recv();
prop_assert!(result.is_ok(), "unexpected failed finalized block commit: {:?}", result);
// Wait for the channels to be updated by the block commit task.
// TODO: add a blocking method on ChainTipChange
std::thread::sleep(Duration::from_secs(1));
prop_assert_eq!(latest_chain_tip.best_tip_height(), Some(expected_block.height));
prop_assert_eq!(chain_tip_change.last_tip_change(), Some(expected_action));
}
for block in non_finalized_blocks {
let expected_block = block.clone();
let expected_action = if expected_block.height == block::Height(1) {
// 1: reset by the BeforeOverwinter network upgrade
TipAction::reset_with(expected_block.clone().into())
} else {
TipAction::grow_with(expected_block.clone().into())
};
let result_receiver = state_service.queue_and_commit_non_finalized(block);
let result = result_receiver.blocking_recv();
prop_assert!(result.is_ok(), "unexpected failed non-finalized block commit: {:?}", result);
// Wait for the channels to be updated by the block commit task.
// TODO: add a blocking method on ChainTipChange
std::thread::sleep(Duration::from_secs(1));
prop_assert_eq!(latest_chain_tip.best_tip_height(), Some(expected_block.height));
prop_assert_eq!(chain_tip_change.last_tip_change(), Some(expected_action));
}
}
}
/// Test strategy to generate a chain split in two from the test vectors.
///
/// Selects either the mainnet or testnet chain test vector and randomly splits the chain in two

View File

@ -0,0 +1,138 @@
//! Writing blocks to the finalized and non-finalized states.
use std::sync::{Arc, Mutex};
use zebra_chain::block::{self, Height};
use crate::service::{
finalized_state::FinalizedState,
queued_blocks::{QueuedFinalized, QueuedNonFinalized},
ChainTipBlock, ChainTipSender,
};
/// Reads blocks from the channels, writes them to the `finalized_state`,
/// and updates the `chain_tip_sender`.
///
/// TODO: pass the non-finalized state and associated update channel to this function
#[instrument(skip(
finalized_block_write_receiver,
non_finalized_block_write_receiver,
invalid_block_reset_sender,
chain_tip_sender
))]
pub fn write_blocks_from_channels(
mut finalized_block_write_receiver: tokio::sync::mpsc::UnboundedReceiver<QueuedFinalized>,
mut non_finalized_block_write_receiver: tokio::sync::mpsc::UnboundedReceiver<
QueuedNonFinalized,
>,
mut finalized_state: FinalizedState,
invalid_block_reset_sender: tokio::sync::mpsc::UnboundedSender<block::Hash>,
chain_tip_sender: Arc<Mutex<ChainTipSender>>,
) {
// Write all the finalized blocks sent by the state,
// until the state closes the finalized block channel's sender.
while let Some(ordered_block) = finalized_block_write_receiver.blocking_recv() {
// TODO: split these checks into separate functions
if invalid_block_reset_sender.is_closed() {
info!("StateService closed the block reset channel. Is Zebra shutting down?");
return;
}
// Discard any children of invalid blocks in the channel
//
// `commit_finalized()` requires blocks in height order.
// So if there has been a block commit error,
// we need to drop all the descendants of that block,
// until we receive a block at the required next height.
let next_valid_height = finalized_state
.db
.finalized_tip_height()
.map(|height| (height + 1).expect("committed heights are valid"))
.unwrap_or(Height(0));
if ordered_block.0.height != next_valid_height {
debug!(
?next_valid_height,
invalid_height = ?ordered_block.0.height,
invalid_hash = ?ordered_block.0.hash,
"got a block that was the wrong height. \
Assuming a parent block failed, and dropping this block",
);
// We don't want to send a reset here, because it could overwrite a valid sent hash
std::mem::drop(ordered_block);
continue;
}
// Try committing the block
match finalized_state.commit_finalized(ordered_block) {
Ok(finalized) => {
let tip_block = ChainTipBlock::from(finalized);
// TODO: update the chain tip sender with non-finalized blocks in this function,
// and get rid of the mutex
chain_tip_sender
.lock()
.expect("unexpected panic in block commit task or state")
.set_finalized_tip(tip_block);
}
Err(error) => {
let finalized_tip = finalized_state.db.tip();
// The last block in the queue failed, so we can't commit the next block.
// Instead, we need to reset the state queue,
// and discard any children of the invalid block in the channel.
info!(
?error,
last_valid_height = ?finalized_tip.map(|tip| tip.0),
last_valid_hash = ?finalized_tip.map(|tip| tip.1),
"committing a block to the finalized state failed, resetting state queue",
);
let send_result =
invalid_block_reset_sender.send(finalized_state.db.finalized_tip_hash());
if send_result.is_err() {
info!("StateService closed the block reset channel. Is Zebra shutting down?");
return;
}
}
}
}
// Do this check even if the channel got closed before any finalized blocks were sent.
// This can happen if we're past the finalized tip.
if invalid_block_reset_sender.is_closed() {
info!("StateService closed the block reset channel. Is Zebra shutting down?");
return;
}
// Write all the non-finalized blocks sent by the state, until Zebra shuts down.
while let Some(_block) = non_finalized_block_write_receiver.blocking_recv() {
if invalid_block_reset_sender.is_closed() {
info!("StateService closed the block reset channel. Is Zebra shutting down?");
return;
}
// TODO:
// - read from the channel
// - commit blocks to the non-finalized state
// - if there are any ready, commit blocks to the finalized state
// - handle errors by sending a reset with all the block hashes in the non-finalized state, and the finalized tip
// - update the chain tip sender and cached non-finalized state
error!("handle non-finalized block writes here");
}
// We're finished receiving non-finalized blocks from the state.
//
// TODO:
// - make the task an object, and do this in the drop impl?
// - does the drop order matter here?
non_finalized_block_write_receiver.close();
std::mem::drop(non_finalized_block_write_receiver);
// We're done writing to the finalized state, so we can force it to shut down.
finalized_state.db.shutdown(true);
std::mem::drop(finalized_state);
}
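For context, a caller moves the receivers, the finalized state, and the shared chain tip sender into a dedicated thread, because `blocking_recv()` panics when called from inside the async runtime. A minimal sketch under those assumptions (the wiring and function name below are illustrative, not the actual StateService code):

// Sketch only: wiring up the block write task. The channel names and this
// helper are assumptions for illustration; the real setup lives in the StateService.
fn spawn_block_write_task(
    finalized_state: FinalizedState,
    chain_tip_sender: ChainTipSender,
) -> (
    tokio::sync::mpsc::UnboundedSender<QueuedFinalized>,
    tokio::sync::mpsc::UnboundedSender<QueuedNonFinalized>,
    tokio::sync::mpsc::UnboundedReceiver<block::Hash>,
    std::thread::JoinHandle<()>,
) {
    let (finalized_tx, finalized_rx) = tokio::sync::mpsc::unbounded_channel();
    let (non_finalized_tx, non_finalized_rx) = tokio::sync::mpsc::unbounded_channel();
    // On commit errors, the task sends back the last valid finalized tip hash,
    // so the service can reset its queue of pending blocks.
    let (invalid_block_reset_tx, invalid_block_reset_rx) = tokio::sync::mpsc::unbounded_channel();

    let chain_tip_sender = std::sync::Arc::new(std::sync::Mutex::new(chain_tip_sender));

    // `blocking_recv()` must not run on the async runtime,
    // so the task gets its own OS thread.
    let join_handle = std::thread::spawn(move || {
        write_blocks_from_channels(
            finalized_rx,
            non_finalized_rx,
            finalized_state,
            invalid_block_reset_tx,
            chain_tip_sender,
        )
    });

    (finalized_tx, non_finalized_tx, invalid_block_reset_rx, join_handle)
}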

@ -10,13 +10,14 @@ use std::{
};
use futures::FutureExt;
use tokio::{sync::oneshot, task::JoinHandle};
use tokio::{sync::oneshot, task::JoinHandle, time::timeout};
use tower::{buffer::Buffer, builder::ServiceBuilder, util::BoxService, Service, ServiceExt};
use tracing::Span;
use zebra_chain::{
amount::Amount,
block::Block,
fmt::humantime_seconds,
parameters::Network::{self, *},
serialization::ZcashDeserializeInto,
transaction::{UnminedTx, UnminedTxId, VerifiedUnminedTx},
@ -24,7 +25,7 @@ use zebra_chain::{
use zebra_consensus::{error::TransactionError, transaction, Config as ConsensusConfig};
use zebra_network::{AddressBook, InventoryResponse, Request, Response};
use zebra_node_services::mempool;
use zebra_state::Config as StateConfig;
use zebra_state::{ChainTipChange, Config as StateConfig, CHAIN_TIP_UPDATE_WAIT_LIMIT};
use zebra_test::mock_service::{MockService, PanicAssertion};
use crate::{
@ -59,6 +60,7 @@ async fn mempool_requests_for_transactions() {
_mock_tx_verifier,
mut peer_set,
_state_guard,
_chain_tip_change,
sync_gossip_task_handle,
tx_gossip_task_handle,
) = setup(true).await;
@ -142,6 +144,7 @@ async fn mempool_push_transaction() -> Result<(), crate::BoxError> {
mut tx_verifier,
mut peer_set,
_state_guard,
_chain_tip_change,
sync_gossip_task_handle,
tx_gossip_task_handle,
) = setup(false).await;
@ -236,6 +239,7 @@ async fn mempool_advertise_transaction_ids() -> Result<(), crate::BoxError> {
mut tx_verifier,
mut peer_set,
_state_guard,
_chain_tip_change,
sync_gossip_task_handle,
tx_gossip_task_handle,
) = setup(false).await;
@ -342,6 +346,7 @@ async fn mempool_transaction_expiration() -> Result<(), crate::BoxError> {
mut tx_verifier,
mut peer_set,
state_service,
_chain_tip_change,
sync_gossip_task_handle,
tx_gossip_task_handle,
) = setup(false).await;
@ -638,6 +643,7 @@ async fn inbound_block_height_lookahead_limit() -> Result<(), crate::BoxError> {
mut tx_verifier,
mut peer_set,
state_service,
mut chain_tip_change,
sync_gossip_task_handle,
tx_gossip_task_handle,
) = setup(false).await;
@ -658,7 +664,20 @@ async fn inbound_block_height_lookahead_limit() -> Result<(), crate::BoxError> {
.await
.respond(Response::Blocks(vec![Available(block)]));
// TODO: check that the block is queued in the checkpoint verifier
// Wait for the chain tip update
if let Err(timeout_error) = timeout(
CHAIN_TIP_UPDATE_WAIT_LIMIT,
chain_tip_change.wait_for_tip_change(),
)
.await
.map(|change_result| change_result.expect("unexpected chain tip update failure"))
{
info!(
timeout = ?humantime_seconds(CHAIN_TIP_UPDATE_WAIT_LIMIT),
?timeout_error,
"timeout waiting for chain tip change after committing block"
);
}
// check that nothing unexpected happened
peer_set.expect_no_requests().await;
@ -729,6 +748,7 @@ async fn setup(
MockService<transaction::Request, transaction::Response, PanicAssertion, TransactionError>,
MockService<Request, Response, PanicAssertion>,
Buffer<BoxService<zebra_state::Request, zebra_state::Response, BoxError>, zebra_state::Request>,
ChainTipChange,
JoinHandle<Result<(), BlockGossipError>>,
JoinHandle<Result<(), BoxError>>,
) {
@ -744,7 +764,7 @@ async fn setup(
);
let address_book = Arc::new(std::sync::Mutex::new(address_book));
let (sync_status, mut recent_syncs) = SyncStatus::new();
let (state, _read_only_state_service, latest_chain_tip, chain_tip_change) =
let (state, _read_only_state_service, latest_chain_tip, mut chain_tip_change) =
zebra_state::init(state_config.clone(), network);
let mut state_service = ServiceBuilder::new().buffer(1).service(state);
@ -786,6 +806,21 @@ async fn setup(
.unwrap();
committed_blocks.push(genesis_block);
// Wait for the chain tip update
if let Err(timeout_error) = timeout(
CHAIN_TIP_UPDATE_WAIT_LIMIT,
chain_tip_change.wait_for_tip_change(),
)
.await
.map(|change_result| change_result.expect("unexpected chain tip update failure"))
{
info!(
timeout = ?humantime_seconds(CHAIN_TIP_UPDATE_WAIT_LIMIT),
?timeout_error,
"timeout waiting for chain tip change after committing block"
);
}
// Also push block 1.
// Block one activates a network upgrade, and the mempool is cleared at it,
// so all our tests start after this event.
@ -801,6 +836,8 @@ async fn setup(
.unwrap();
committed_blocks.push(block_one);
// Don't wait for the chain tip update here; we wait for AdvertiseBlock below
let (mut mempool_service, transaction_receiver) = Mempool::new(
&MempoolConfig::default(),
buffered_peer_set.clone(),
@ -845,7 +882,7 @@ async fn setup(
let sync_gossip_task_handle = tokio::spawn(sync::gossip_best_tip_block_hashes(
sync_status.clone(),
chain_tip_change,
chain_tip_change.clone(),
peer_set.clone(),
));
@ -873,6 +910,7 @@ async fn setup(
mock_tx_verifier,
peer_set,
state_service,
chain_tip_change,
sync_gossip_task_handle,
tx_gossip_task_handle,
)
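The two wait-for-chain-tip blocks above (and the similar ones in the mempool tests below) repeat the same timeout-and-log pattern, which could be factored into a small helper; a sketch, with the helper name as an assumption:

// Possible shared helper for the repeated waits; the name is illustrative.
async fn wait_for_chain_tip_update(chain_tip_change: &mut ChainTipChange) {
    if let Err(timeout_error) = timeout(
        CHAIN_TIP_UPDATE_WAIT_LIMIT,
        chain_tip_change.wait_for_tip_change(),
    )
    .await
    .map(|change_result| change_result.expect("unexpected chain tip update failure"))
    {
        info!(
            timeout = ?humantime_seconds(CHAIN_TIP_UPDATE_WAIT_LIMIT),
            ?timeout_error,
            "timeout waiting for chain tip change after committing block"
        );
    }
}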

@ -3,20 +3,23 @@
use std::{collections::HashSet, sync::Arc};
use color_eyre::Report;
use tokio::time;
use tokio::time::{self, timeout};
use tower::{ServiceBuilder, ServiceExt};
use zebra_chain::{block::Block, parameters::Network, serialization::ZcashDeserializeInto};
use zebra_chain::{
block::Block, fmt::humantime_seconds, parameters::Network, serialization::ZcashDeserializeInto,
};
use zebra_consensus::transaction as tx;
use zebra_state::Config as StateConfig;
use zebra_state::{Config as StateConfig, CHAIN_TIP_UPDATE_WAIT_LIMIT};
use zebra_test::mock_service::{MockService, PanicAssertion};
use super::UnboxMempoolError;
use crate::components::{
mempool::{self, storage::tests::unmined_transactions_in_blocks, *},
sync::RecentSyncLengths,
};
use super::UnboxMempoolError;
/// A [`MockService`] representing the network service.
type MockPeerSet = MockService<zn::Request, zn::Response, PanicAssertion>;
@ -51,7 +54,7 @@ async fn mempool_service_basic_single() -> Result<(), Report> {
// inserted except one (the genesis block transaction).
let cost_limit = more_transactions.iter().map(|tx| tx.cost()).sum();
let (mut service, _peer_set, _state_service, _tx_verifier, mut recent_syncs) =
let (mut service, _peer_set, _state_service, _chain_tip_change, _tx_verifier, mut recent_syncs) =
setup(network, cost_limit).await;
// Enable the mempool
@ -198,7 +201,7 @@ async fn mempool_queue_single() -> Result<(), Report> {
.map(|tx| tx.cost())
.sum();
let (mut service, _peer_set, _state_service, _tx_verifier, mut recent_syncs) =
let (mut service, _peer_set, _state_service, _chain_tip_change, _tx_verifier, mut recent_syncs) =
setup(network, cost_limit).await;
// Enable the mempool
@ -272,7 +275,7 @@ async fn mempool_service_disabled() -> Result<(), Report> {
// Using the mainnet for now
let network = Network::Mainnet;
let (mut service, _peer_set, _state_service, _tx_verifier, mut recent_syncs) =
let (mut service, _peer_set, _state_service, _chain_tip_change, _tx_verifier, mut recent_syncs) =
setup(network, u64::MAX).await;
// get the genesis block transactions from the Zcash blockchain.
@ -387,8 +390,14 @@ async fn mempool_cancel_mined() -> Result<(), Report> {
// Using the mainnet for now
let network = Network::Mainnet;
let (mut mempool, _peer_set, mut state_service, _tx_verifier, mut recent_syncs) =
setup(network, u64::MAX).await;
let (
mut mempool,
_peer_set,
mut state_service,
_chain_tip_change,
_tx_verifier,
mut recent_syncs,
) = setup(network, u64::MAX).await;
// Enable the mempool
mempool.enable(&mut recent_syncs).await;
@ -480,8 +489,14 @@ async fn mempool_cancel_downloads_after_network_upgrade() -> Result<(), Report>
// Using the mainnet for now
let network = Network::Mainnet;
let (mut mempool, _peer_set, mut state_service, _tx_verifier, mut recent_syncs) =
setup(network, u64::MAX).await;
let (
mut mempool,
_peer_set,
mut state_service,
mut chain_tip_change,
_tx_verifier,
mut recent_syncs,
) = setup(network, u64::MAX).await;
// Enable the mempool
mempool.enable(&mut recent_syncs).await;
@ -501,6 +516,21 @@ async fn mempool_cancel_downloads_after_network_upgrade() -> Result<(), Report>
.await
.unwrap();
// Wait for the chain tip update
if let Err(timeout_error) = timeout(
CHAIN_TIP_UPDATE_WAIT_LIMIT,
chain_tip_change.wait_for_tip_change(),
)
.await
.map(|change_result| change_result.expect("unexpected chain tip update failure"))
{
info!(
timeout = ?humantime_seconds(CHAIN_TIP_UPDATE_WAIT_LIMIT),
?timeout_error,
"timeout waiting for chain tip change after committing block"
);
}
// Queue transaction from block 2 for download
let txid = block2.transactions[0].unmined_id();
let response = mempool
@ -533,6 +563,21 @@ async fn mempool_cancel_downloads_after_network_upgrade() -> Result<(), Report>
.await
.unwrap();
// Wait for the chain tip update
if let Err(timeout_error) = timeout(
CHAIN_TIP_UPDATE_WAIT_LIMIT,
chain_tip_change.wait_for_tip_change(),
)
.await
.map(|change_result| change_result.expect("unexpected chain tip update failure"))
{
info!(
timeout = ?humantime_seconds(CHAIN_TIP_UPDATE_WAIT_LIMIT),
?timeout_error,
"timeout waiting for chain tip change after committing block"
);
}
// Query the mempool to make it poll chain_tip_change
mempool.dummy_call().await;
@ -548,8 +593,14 @@ async fn mempool_failed_verification_is_rejected() -> Result<(), Report> {
// Using the mainnet for now
let network = Network::Mainnet;
let (mut mempool, _peer_set, _state_service, mut tx_verifier, mut recent_syncs) =
setup(network, u64::MAX).await;
let (
mut mempool,
_peer_set,
_state_service,
_chain_tip_change,
mut tx_verifier,
mut recent_syncs,
) = setup(network, u64::MAX).await;
// Get transactions to use in the test
let mut unmined_transactions = unmined_transactions_in_blocks(1..=2, network);
@ -617,8 +668,14 @@ async fn mempool_failed_download_is_not_rejected() -> Result<(), Report> {
// Using the mainnet for now
let network = Network::Mainnet;
let (mut mempool, mut peer_set, _state_service, _tx_verifier, mut recent_syncs) =
setup(network, u64::MAX).await;
let (
mut mempool,
mut peer_set,
_state_service,
_chain_tip_change,
_tx_verifier,
mut recent_syncs,
) = setup(network, u64::MAX).await;
// Get transactions to use in the test
let mut unmined_transactions = unmined_transactions_in_blocks(1..=2, network);
@ -688,6 +745,7 @@ async fn setup(
Mempool,
MockPeerSet,
StateService,
ChainTipChange,
MockTxVerifier,
RecentSyncLengths,
) {
@ -712,8 +770,15 @@ async fn setup(
Buffer::new(BoxService::new(tx_verifier.clone()), 1),
sync_status,
latest_chain_tip,
chain_tip_change,
chain_tip_change.clone(),
);
(mempool, peer_set, state_service, tx_verifier, recent_syncs)
(
mempool,
peer_set,
state_service,
chain_tip_change,
tx_verifier,
recent_syncs,
)
}

@ -176,8 +176,13 @@ pub async fn run() -> Result<()> {
assert_eq!(response, expected_response);
}
tracing::info!("waiting for mempool to verify some transactions...");
zebrad.expect_stdout_line_matches("sending mempool transaction broadcast")?;
// The timing of verification logs is unreliable, so we've disabled this check for now.
//
// TODO: when lightwalletd starts returning transactions again:
// re-enable this check, find a better way to check, or delete this commented-out check
//
//tracing::info!("waiting for mempool to verify some transactions...");
//zebrad.expect_stdout_line_matches("sending mempool transaction broadcast")?;
tracing::info!("calling GetMempoolTx gRPC to fetch transactions...");
let mut transactions_stream = rpc_client