Increase Zebra's restart acceptance test timeout (#2910)

* Increase the restart test timeout to 10 seconds It shouldn't take this long. But maybe the CI VMs are under a lot of load? * Add extensive logging to diagnose CI state reload failures
2021-10-20 11:59:09 +10:00 · 2021-10-20 11:59:09 +10:00 · 92634f788b
parent 84f2c07fbc
commit 92634f788b
2 changed files with 37 additions and 5 deletions
--- a/zebra-state/src/service/finalized_state.rs
+++ b/zebra-state/src/service/finalized_state.rs
@ -93,12 +93,17 @@ impl FinalizedState {
            network,
        };

+        // TODO: remove these extra logs once bugs like #2905 are fixed
+        tracing::info!("reading cached tip height");
        if let Some(tip_height) = new_state.finalized_tip_height() {
+            tracing::info!(?tip_height, "loaded cached tip height");
+
            if new_state.is_at_stop_height(tip_height) {
                let debug_stop_at_height = new_state
                    .debug_stop_at_height
                    .expect("true from `is_at_stop_height` implies `debug_stop_at_height` is Some");

+                tracing::info!("reading cached tip hash");
                let tip_hash = new_state.finalized_tip_hash();

                if tip_height > debug_stop_at_height {
@ -119,11 +124,15 @@ impl FinalizedState {

                // RocksDB can do a cleanup when column families are opened.
                // So we want to drop it before we exit.
+                tracing::info!("closing cached state");
                std::mem::drop(new_state);
+                tracing::info!("exiting Zebra");
                std::process::exit(0);
            }
        }

+        tracing::info!(tip = ?new_state.tip(), "loaded Zebra state cache");
+
        new_state
    }

@ -450,12 +459,22 @@ impl FinalizedState {

        if result.is_ok() && self.is_at_stop_height(height) {
            tracing::info!(?source, "committed block from");
-            tracing::info!(?height, ?hash, "stopping at configured height");
+            tracing::info!(
+                ?height,
+                ?hash,
+                "stopping at configured height, flushing database to disk"
+            );
+
            // We'd like to drop the database here, because that closes the
            // column families and the database. But Rust's ownership rules
            // make that difficult, so we just flush instead.
+
+            // TODO: remove these extra logs once bugs like #2905 are fixed
            self.db.flush().expect("flush is successful");
+            tracing::info!("flushed database to disk");
+
            self.delete_ephemeral();
+            tracing::info!("exiting Zebra");
            std::process::exit(0);
        }

@ -632,7 +651,8 @@ impl FinalizedState {
    fn delete_ephemeral(&self) {
        if self.ephemeral {
            let path = self.db.path();
-            tracing::debug!("removing temporary database files {:?}", path);
+            tracing::info!(cache_path = ?path, "removing temporary database files");
+
            // We'd like to use `rocksdb::Env::mem_env` for ephemeral databases,
            // but the Zcash blockchain might not fit in memory. So we just
            // delete the database files instead.
@ -645,7 +665,11 @@ impl FinalizedState {
            // delete them using standard filesystem APIs. Deleting open files
            // might cause errors on non-Unix platforms, so we ignore the result.
            // (The OS will delete them eventually anyway.)
-            let _res = std::fs::remove_dir_all(path);
+            let res = std::fs::remove_dir_all(path);
+
+            // TODO: downgrade to debug once bugs like #2905 are fixed
+            //       but leave any errors at "info" level
+            tracing::info!(?res, "removed temporary database files");
        }
    }

--- a/zebrad/tests/acceptance.rs
+++ b/zebrad/tests/acceptance.rs
@ -697,9 +697,17 @@ const LARGE_CHECKPOINT_TEST_HEIGHT: Height =

 const STOP_AT_HEIGHT_REGEX: &str = "stopping at configured height";

-const STOP_ON_LOAD_TIMEOUT: Duration = Duration::from_secs(5);
-// usually it's much shorter than this
+/// The maximum amount of time Zebra should take to reload after shutting down.
+///
+/// This should only take a second, but sometimes CI VMs or RocksDB can be slow.
+const STOP_ON_LOAD_TIMEOUT: Duration = Duration::from_secs(10);
+
+/// The maximum amount of time Zebra should take to sync a few hundred blocks.
+///
+/// Usually the small checkpoint is much shorter than this.
 const SMALL_CHECKPOINT_TIMEOUT: Duration = Duration::from_secs(120);
+
+/// The maximum amount of time Zebra should take to sync a thousand blocks.
 const LARGE_CHECKPOINT_TIMEOUT: Duration = Duration::from_secs(180);

 /// Test if `zebrad` can sync the first checkpoint on mainnet.