From 92634f788bae26d75b51071af3f578959bb4c587 Mon Sep 17 00:00:00 2001 From: teor Date: Wed, 20 Oct 2021 11:59:09 +1000 Subject: [PATCH] Increase Zebra's restart acceptance test timeout (#2910) * Increase the restart test timeout to 10 seconds It shouldn't take this long. But maybe the CI VMs are under a lot of load? * Add extensive logging to diagnose CI state reload failures --- zebra-state/src/service/finalized_state.rs | 30 +++++++++++++++++++--- zebrad/tests/acceptance.rs | 12 +++++++-- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/zebra-state/src/service/finalized_state.rs b/zebra-state/src/service/finalized_state.rs index 32232e472..93e254139 100644 --- a/zebra-state/src/service/finalized_state.rs +++ b/zebra-state/src/service/finalized_state.rs @@ -93,12 +93,17 @@ impl FinalizedState { network, }; + // TODO: remove these extra logs once bugs like #2905 are fixed + tracing::info!("reading cached tip height"); if let Some(tip_height) = new_state.finalized_tip_height() { + tracing::info!(?tip_height, "loaded cached tip height"); + if new_state.is_at_stop_height(tip_height) { let debug_stop_at_height = new_state .debug_stop_at_height .expect("true from `is_at_stop_height` implies `debug_stop_at_height` is Some"); + tracing::info!("reading cached tip hash"); let tip_hash = new_state.finalized_tip_hash(); if tip_height > debug_stop_at_height { @@ -119,11 +124,15 @@ impl FinalizedState { // RocksDB can do a cleanup when column families are opened. // So we want to drop it before we exit. + tracing::info!("closing cached state"); std::mem::drop(new_state); + tracing::info!("exiting Zebra"); std::process::exit(0); } } + tracing::info!(tip = ?new_state.tip(), "loaded Zebra state cache"); + new_state } @@ -450,12 +459,22 @@ impl FinalizedState { if result.is_ok() && self.is_at_stop_height(height) { tracing::info!(?source, "committed block from"); - tracing::info!(?height, ?hash, "stopping at configured height"); + tracing::info!( + ?height, + ?hash, + "stopping at configured height, flushing database to disk" + ); + // We'd like to drop the database here, because that closes the // column families and the database. But Rust's ownership rules // make that difficult, so we just flush instead. + + // TODO: remove these extra logs once bugs like #2905 are fixed self.db.flush().expect("flush is successful"); + tracing::info!("flushed database to disk"); + self.delete_ephemeral(); + tracing::info!("exiting Zebra"); std::process::exit(0); } @@ -632,7 +651,8 @@ impl FinalizedState { fn delete_ephemeral(&self) { if self.ephemeral { let path = self.db.path(); - tracing::debug!("removing temporary database files {:?}", path); + tracing::info!(cache_path = ?path, "removing temporary database files"); + // We'd like to use `rocksdb::Env::mem_env` for ephemeral databases, // but the Zcash blockchain might not fit in memory. So we just // delete the database files instead. @@ -645,7 +665,11 @@ impl FinalizedState { // delete them using standard filesystem APIs. Deleting open files // might cause errors on non-Unix platforms, so we ignore the result. // (The OS will delete them eventually anyway.) - let _res = std::fs::remove_dir_all(path); + let res = std::fs::remove_dir_all(path); + + // TODO: downgrade to debug once bugs like #2905 are fixed + // but leave any errors at "info" level + tracing::info!(?res, "removed temporary database files"); } } diff --git a/zebrad/tests/acceptance.rs b/zebrad/tests/acceptance.rs index 13a3d9a4c..2aeca17b5 100644 --- a/zebrad/tests/acceptance.rs +++ b/zebrad/tests/acceptance.rs @@ -697,9 +697,17 @@ const LARGE_CHECKPOINT_TEST_HEIGHT: Height = const STOP_AT_HEIGHT_REGEX: &str = "stopping at configured height"; -const STOP_ON_LOAD_TIMEOUT: Duration = Duration::from_secs(5); -// usually it's much shorter than this +/// The maximum amount of time Zebra should take to reload after shutting down. +/// +/// This should only take a second, but sometimes CI VMs or RocksDB can be slow. +const STOP_ON_LOAD_TIMEOUT: Duration = Duration::from_secs(10); + +/// The maximum amount of time Zebra should take to sync a few hundred blocks. +/// +/// Usually the small checkpoint is much shorter than this. const SMALL_CHECKPOINT_TIMEOUT: Duration = Duration::from_secs(120); + +/// The maximum amount of time Zebra should take to sync a thousand blocks. const LARGE_CHECKPOINT_TIMEOUT: Duration = Duration::from_secs(180); /// Test if `zebrad` can sync the first checkpoint on mainnet.