Increase Zebra's restart acceptance test timeout (#2910)

* Increase the restart test timeout to 10 seconds

It shouldn't take this long.
But maybe the CI VMs are under a lot of load?

* Add extensive logging to diagnose CI state reload failures
This commit is contained in:
teor 2021-10-20 11:59:09 +10:00 committed by GitHub
parent 84f2c07fbc
commit 92634f788b
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 37 additions and 5 deletions

View File

@ -93,12 +93,17 @@ impl FinalizedState {
network,
};
// TODO: remove these extra logs once bugs like #2905 are fixed
tracing::info!("reading cached tip height");
if let Some(tip_height) = new_state.finalized_tip_height() {
tracing::info!(?tip_height, "loaded cached tip height");
if new_state.is_at_stop_height(tip_height) {
let debug_stop_at_height = new_state
.debug_stop_at_height
.expect("true from `is_at_stop_height` implies `debug_stop_at_height` is Some");
tracing::info!("reading cached tip hash");
let tip_hash = new_state.finalized_tip_hash();
if tip_height > debug_stop_at_height {
@ -119,11 +124,15 @@ impl FinalizedState {
// RocksDB can do a cleanup when column families are opened.
// So we want to drop it before we exit.
tracing::info!("closing cached state");
std::mem::drop(new_state);
tracing::info!("exiting Zebra");
std::process::exit(0);
}
}
tracing::info!(tip = ?new_state.tip(), "loaded Zebra state cache");
new_state
}
@ -450,12 +459,22 @@ impl FinalizedState {
if result.is_ok() && self.is_at_stop_height(height) {
tracing::info!(?source, "committed block from");
tracing::info!(?height, ?hash, "stopping at configured height");
tracing::info!(
?height,
?hash,
"stopping at configured height, flushing database to disk"
);
// We'd like to drop the database here, because that closes the
// column families and the database. But Rust's ownership rules
// make that difficult, so we just flush instead.
// TODO: remove these extra logs once bugs like #2905 are fixed
self.db.flush().expect("flush is successful");
tracing::info!("flushed database to disk");
self.delete_ephemeral();
tracing::info!("exiting Zebra");
std::process::exit(0);
}
@ -632,7 +651,8 @@ impl FinalizedState {
fn delete_ephemeral(&self) {
if self.ephemeral {
let path = self.db.path();
tracing::debug!("removing temporary database files {:?}", path);
tracing::info!(cache_path = ?path, "removing temporary database files");
// We'd like to use `rocksdb::Env::mem_env` for ephemeral databases,
// but the Zcash blockchain might not fit in memory. So we just
// delete the database files instead.
@ -645,7 +665,11 @@ impl FinalizedState {
// delete them using standard filesystem APIs. Deleting open files
// might cause errors on non-Unix platforms, so we ignore the result.
// (The OS will delete them eventually anyway.)
let _res = std::fs::remove_dir_all(path);
let res = std::fs::remove_dir_all(path);
// TODO: downgrade to debug once bugs like #2905 are fixed
// but leave any errors at "info" level
tracing::info!(?res, "removed temporary database files");
}
}

View File

@ -697,9 +697,17 @@ const LARGE_CHECKPOINT_TEST_HEIGHT: Height =
const STOP_AT_HEIGHT_REGEX: &str = "stopping at configured height";
const STOP_ON_LOAD_TIMEOUT: Duration = Duration::from_secs(5);
// usually it's much shorter than this
/// The maximum amount of time Zebra should take to reload after shutting down.
///
/// This should only take a second, but sometimes CI VMs or RocksDB can be slow.
const STOP_ON_LOAD_TIMEOUT: Duration = Duration::from_secs(10);
/// The maximum amount of time Zebra should take to sync a few hundred blocks.
///
/// Usually the small checkpoint is much shorter than this.
const SMALL_CHECKPOINT_TIMEOUT: Duration = Duration::from_secs(120);
/// The maximum amount of time Zebra should take to sync a thousand blocks.
const LARGE_CHECKPOINT_TIMEOUT: Duration = Duration::from_secs(180);
/// Test if `zebrad` can sync the first checkpoint on mainnet.