From 92634f788bae26d75b51071af3f578959bb4c587 Mon Sep 17 00:00:00 2001
From: teor <teor@riseup.net>
Date: Wed, 20 Oct 2021 11:59:09 +1000
Subject: [PATCH] Increase Zebra's restart acceptance test timeout (#2910)

* Increase the restart test timeout to 10 seconds

It shouldn't take this long.
But maybe the CI VMs are under a lot of load?

* Add extensive logging to diagnose CI state reload failures
---
 zebra-state/src/service/finalized_state.rs | 30 +++++++++++++++++++---
 zebrad/tests/acceptance.rs                 | 12 +++++++--
 2 files changed, 37 insertions(+), 5 deletions(-)

diff --git a/zebra-state/src/service/finalized_state.rs b/zebra-state/src/service/finalized_state.rs
index 32232e472..93e254139 100644
--- a/zebra-state/src/service/finalized_state.rs
+++ b/zebra-state/src/service/finalized_state.rs
@@ -93,12 +93,17 @@ impl FinalizedState {
             network,
         };
 
+        // TODO: remove these extra logs once bugs like #2905 are fixed
+        tracing::info!("reading cached tip height");
         if let Some(tip_height) = new_state.finalized_tip_height() {
+            tracing::info!(?tip_height, "loaded cached tip height");
+
             if new_state.is_at_stop_height(tip_height) {
                 let debug_stop_at_height = new_state
                     .debug_stop_at_height
                     .expect("true from `is_at_stop_height` implies `debug_stop_at_height` is Some");
 
+                tracing::info!("reading cached tip hash");
                 let tip_hash = new_state.finalized_tip_hash();
 
                 if tip_height > debug_stop_at_height {
@@ -119,11 +124,15 @@ impl FinalizedState {
 
                 // RocksDB can do a cleanup when column families are opened.
                 // So we want to drop it before we exit.
+                tracing::info!("closing cached state");
                 std::mem::drop(new_state);
+                tracing::info!("exiting Zebra");
                 std::process::exit(0);
             }
         }
 
+        tracing::info!(tip = ?new_state.tip(), "loaded Zebra state cache");
+
         new_state
     }
 
@@ -450,12 +459,22 @@ impl FinalizedState {
 
         if result.is_ok() && self.is_at_stop_height(height) {
             tracing::info!(?source, "committed block from");
-            tracing::info!(?height, ?hash, "stopping at configured height");
+            tracing::info!(
+                ?height,
+                ?hash,
+                "stopping at configured height, flushing database to disk"
+            );
+
             // We'd like to drop the database here, because that closes the
             // column families and the database. But Rust's ownership rules
             // make that difficult, so we just flush instead.
+
+            // TODO: remove these extra logs once bugs like #2905 are fixed
             self.db.flush().expect("flush is successful");
+            tracing::info!("flushed database to disk");
+
             self.delete_ephemeral();
+            tracing::info!("exiting Zebra");
             std::process::exit(0);
         }
 
@@ -632,7 +651,8 @@ impl FinalizedState {
     fn delete_ephemeral(&self) {
         if self.ephemeral {
             let path = self.db.path();
-            tracing::debug!("removing temporary database files {:?}", path);
+            tracing::info!(cache_path = ?path, "removing temporary database files");
+
             // We'd like to use `rocksdb::Env::mem_env` for ephemeral databases,
             // but the Zcash blockchain might not fit in memory. So we just
             // delete the database files instead.
@@ -645,7 +665,11 @@ impl FinalizedState {
             // delete them using standard filesystem APIs. Deleting open files
             // might cause errors on non-Unix platforms, so we ignore the result.
             // (The OS will delete them eventually anyway.)
-            let _res = std::fs::remove_dir_all(path);
+            let res = std::fs::remove_dir_all(path);
+
+            // TODO: downgrade to debug once bugs like #2905 are fixed
+            //       but leave any errors at "info" level
+            tracing::info!(?res, "removed temporary database files");
         }
     }
 
diff --git a/zebrad/tests/acceptance.rs b/zebrad/tests/acceptance.rs
index 13a3d9a4c..2aeca17b5 100644
--- a/zebrad/tests/acceptance.rs
+++ b/zebrad/tests/acceptance.rs
@@ -697,9 +697,17 @@ const LARGE_CHECKPOINT_TEST_HEIGHT: Height =
 
 const STOP_AT_HEIGHT_REGEX: &str = "stopping at configured height";
 
-const STOP_ON_LOAD_TIMEOUT: Duration = Duration::from_secs(5);
-// usually it's much shorter than this
+/// The maximum amount of time Zebra should take to reload after shutting down.
+///
+/// This should only take a second, but sometimes CI VMs or RocksDB can be slow.
+const STOP_ON_LOAD_TIMEOUT: Duration = Duration::from_secs(10);
+
+/// The maximum amount of time Zebra should take to sync a few hundred blocks.
+///
+/// Usually the small checkpoint is much shorter than this.
 const SMALL_CHECKPOINT_TIMEOUT: Duration = Duration::from_secs(120);
+
+/// The maximum amount of time Zebra should take to sync a thousand blocks.
 const LARGE_CHECKPOINT_TIMEOUT: Duration = Duration::from_secs(180);
 
 /// Test if `zebrad` can sync the first checkpoint on mainnet.