diff --git a/zebra-state/src/service/finalized_state.rs b/zebra-state/src/service/finalized_state.rs index 66b620d55..34a7a999a 100644 --- a/zebra-state/src/service/finalized_state.rs +++ b/zebra-state/src/service/finalized_state.rs @@ -500,16 +500,9 @@ impl FinalizedState { "stopping at configured height, flushing database to disk" ); - // We'd like to drop the database here, because that closes the - // column families and the database. But Rust's ownership rules - // make that difficult, so we just flush and delete ephemeral data instead. - - // TODO: remove these extra logs once bugs like #2905 are fixed - self.db.flush().expect("flush is successful"); - tracing::info!("flushed database to disk"); - - self.delete_ephemeral(); + self.shutdown(); + // TODO: replace with a graceful shutdown (#1678) Self::exit_process(); } @@ -519,6 +512,7 @@ impl FinalizedState { /// Exit the host process. /// /// Designed for debugging and tests. + /// TODO: replace with a graceful shutdown (#1678) fn exit_process() -> ! { tracing::info!("exiting Zebra"); @@ -830,15 +824,59 @@ impl FinalizedState { self.db.write(batch).unwrap(); } + + /// Shut down the database, cleaning up background tasks and ephemeral data. + fn shutdown(&mut self) { + // Drop isn't guaranteed to run, such as when we panic, or if the tokio shutdown times out. + // + // Zebra's data should be fine if we don't clean up, because: + // - the database flushes regularly anyway + // - Zebra commits each block in a database transaction, any incomplete blocks get rolled back + // - ephemeral files are placed in the os temp dir and should be cleaned up automatically eventually + tracing::info!("flushing database to disk"); + self.db.flush().expect("flush is successful"); + + // But we should call `cancel_all_background_work` before Zebra exits. + // If we don't, we see these kinds of errors: + // ``` + // pthread lock: Invalid argument + // pure virtual method called + // terminate called without an active exception + // pthread destroy mutex: Device or resource busy + // Aborted (core dumped) + // ``` + // + // The RocksDB wiki says: + // > Q: Is it safe to close RocksDB while another thread is issuing read, write or manual compaction requests? + // > + // > A: No. The users of RocksDB need to make sure all functions have finished before they close RocksDB. + // > You can speed up the waiting by calling CancelAllBackgroundWork(). + // + // https://github.com/facebook/rocksdb/wiki/RocksDB-FAQ + tracing::info!("stopping background database tasks"); + self.db.cancel_all_background_work(true); + + // We'd like to drop the database before deleting its files, + // because that closes the column families and the database correctly. + // But Rust's ownership rules make that difficult, + // so we just flush and delete ephemeral data instead. + // + // The RocksDB wiki says: + // > rocksdb::DB instances need to be destroyed before your main function exits. + // > RocksDB instances usually depend on some internal static variables. + // > Users need to make sure rocksdb::DB instances are destroyed before those static variables. + // + // https://github.com/facebook/rocksdb/wiki/Known-Issues + // + // But our current code doesn't seem to cause any issues. + // We might want to explicitly drop the database as part of graceful shutdown (#1678). + self.delete_ephemeral(); + } } -// Drop isn't guaranteed to run, such as when we panic, or if someone stored -// their FinalizedState in a static, but it should be fine if we don't clean -// this up since the files are placed in the os temp dir and should be cleaned -// up automatically eventually. impl Drop for FinalizedState { fn drop(&mut self) { - self.delete_ephemeral() + self.shutdown(); } } diff --git a/zebra-test/scripts/shutdown-errors b/zebra-test/scripts/shutdown-errors new file mode 100755 index 000000000..98cab0235 --- /dev/null +++ b/zebra-test/scripts/shutdown-errors @@ -0,0 +1,63 @@ +#!/usr/bin/env bash + +# test that Zebra can shut down without errors + +# print every command run by this script +#set -x + +# initial shutdown delay +# interesting delays are between 1-30 seconds on most machines +SHUTDOWN_DELAY=0 + +# each test increases the shutdown delay by this much +SHUTDOWN_DELAY_INCREMENT=1 + +# final shutdown delay +# the test stops after a successful run with a delay this long +SHUTDOWN_DELAY_LIMIT=30 + +echo "Building Zebra" +echo + +cargo build --bin zebrad || exit $? + +EXIT_STATUS=0 +while [ $EXIT_STATUS -eq 0 ] && [ $SHUTDOWN_DELAY -le $SHUTDOWN_DELAY_LIMIT ]; do + # remove previously downloaded Zcash parameter files + # + # if you don't have these downloaded already, the killed downloads will be incomplete, + # which causes an error in Zebra + # rm -r ~/.zcash-params + + echo + echo "Running Zebra for $SHUTDOWN_DELAY seconds" + echo + + # shut down Zebra if this script exits while it's running, + # but ignore "no such job" errors if Zebra has already exited + trap "kill %?zebrad 2> /dev/null" EXIT + + target/debug/zebrad start & + sleep $SHUTDOWN_DELAY + + echo + echo "Killing Zebra after $SHUTDOWN_DELAY seconds" + echo + + kill %?zebrad + wait %?zebrad + EXIT_STATUS=$? + + # fix up the exit status caused by 'kill' + if [ $EXIT_STATUS -eq 143 ]; then + EXIT_STATUS=0 + fi + + echo + echo "Killing Zebra after $SHUTDOWN_DELAY seconds exited with $EXIT_STATUS" + echo + + SHUTDOWN_DELAY=$[SHUTDOWN_DELAY + SHUTDOWN_DELAY_INCREMENT] +done + +exit $EXIT_STATUS diff --git a/zebrad/src/components/tokio.rs b/zebrad/src/components/tokio.rs index d36e14401..09f2e805a 100644 --- a/zebrad/src/components/tokio.rs +++ b/zebrad/src/components/tokio.rs @@ -3,9 +3,12 @@ use crate::prelude::*; use abscissa_core::{Application, Component, FrameworkError, Shutdown}; use color_eyre::Report; -use std::future::Future; +use std::{future::Future, time::Duration}; use tokio::runtime::Runtime; +/// When Zebra is shutting down, wait this long for tokio tasks to finish. +const TOKIO_SHUTDOWN_TIMEOUT: Duration = Duration::from_secs(20); + /// An Abscissa component which owns a Tokio runtime. /// /// The runtime is stored as an `Option` so that when it's time to enter an async @@ -40,11 +43,11 @@ async fn shutdown() { /// Extension trait to centralize entry point for runnable subcommands that /// depend on tokio pub(crate) trait RuntimeRun { - fn run(&mut self, fut: impl Future>); + fn run(self, fut: impl Future>); } impl RuntimeRun for Runtime { - fn run(&mut self, fut: impl Future>) { + fn run(self, fut: impl Future>) { let result = self.block_on(async move { // Always poll the shutdown future first. // @@ -57,12 +60,16 @@ impl RuntimeRun for Runtime { } }); + // Don't wait for long blocking tasks before shutting down + tracing::info!( + ?TOKIO_SHUTDOWN_TIMEOUT, + "waiting for async tokio tasks to shut down" + ); + self.shutdown_timeout(TOKIO_SHUTDOWN_TIMEOUT); + match result { Ok(()) => { info!("shutting down Zebra"); - - // Don't wait for the runtime to shut down all the tasks. - app_writer().shutdown(Shutdown::Graceful); } Err(error) => { warn!(?error, "shutting down Zebra due to an error");