fix(test): make full sync test more accurate (#3555)
* feat(log): log current height when logging sync progress * fix(test): log the specific error when full sync tests fail * doc(start): remove an obsolete TODO We can't decrease this log level, because the tests rely on it. * fix(test): wait until mempool activates in full sync tests Changes the log message and log test so that the test only finishes when the mempool has activated. There is still a race condition here, between the log timer and mempool activation. But it should be very rare, because the mempool is activated immediately when `is_close_to_tip()` becomes true. * fix(test): warn when Zebra stalls below the maximum checkpoint height This also improves the full sync tests, because the warning is checked before logging a successful sync. * feat(log): warn when sync stalls downloading the genesis block * fix(test): warn when the state hasn't committed a block for a long time This also improves the full sync tests, because the warning is checked before logging a successful sync. * doc(test): update some sync acceptance test comments * fix(log): use Display formatting to log chrono::Duration Debug formatting is complicated and hard to read. * fix(log): stop saying that we've activated the mempool without checking it We're not checking if the mempool is active, so we can't say that. * fix(log): minor tidying and TODOs * fix(doc): fix a typo in the tests * fix(log): explain the post-checkpoint blocks in progress warning calculations * fix(doc): explain what could happen if we don't wait for extra blocks * fix(log): add a percent symbol to a percent log Co-authored-by: Janito Vaqueiro Ferreira Filho <janito.vff@gmail.com> Co-authored-by: Janito Vaqueiro Ferreira Filho <janito.vff@gmail.com> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
This commit is contained in:
parent
ea7253ee5b
commit
5c60528b84
|
@ -5734,6 +5734,7 @@ dependencies = [
|
|||
"lazy_static",
|
||||
"metrics",
|
||||
"metrics-exporter-prometheus",
|
||||
"num-integer",
|
||||
"once_cell",
|
||||
"pin-project 1.0.7",
|
||||
"proptest",
|
||||
|
|
|
@ -42,7 +42,7 @@ mod types;
|
|||
#[cfg(test)]
|
||||
mod tests;
|
||||
|
||||
pub(crate) use list::CheckpointList;
|
||||
pub use list::CheckpointList;
|
||||
use types::{Progress, Progress::*};
|
||||
use types::{TargetHeight, TargetHeight::*};
|
||||
|
||||
|
|
|
@ -53,7 +53,7 @@ const TESTNET_CHECKPOINTS: &str = include_str!("test-checkpoints.txt");
|
|||
/// This is actually a bijective map, but since it is read-only, we use a
|
||||
/// BTreeMap, and do the value uniqueness check on initialisation.
|
||||
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
|
||||
pub(crate) struct CheckpointList(BTreeMap<block::Height, block::Hash>);
|
||||
pub struct CheckpointList(BTreeMap<block::Height, block::Hash>);
|
||||
|
||||
impl FromStr for CheckpointList {
|
||||
type Err = BoxError;
|
||||
|
|
|
@ -48,7 +48,9 @@ pub mod chain;
|
|||
pub mod error;
|
||||
|
||||
pub use block::VerifyBlockError;
|
||||
pub use checkpoint::{VerifyCheckpointError, MAX_CHECKPOINT_BYTE_COUNT, MAX_CHECKPOINT_HEIGHT_GAP};
|
||||
pub use checkpoint::{
|
||||
CheckpointList, VerifyCheckpointError, MAX_CHECKPOINT_BYTE_COUNT, MAX_CHECKPOINT_HEIGHT_GAP,
|
||||
};
|
||||
pub use config::Config;
|
||||
pub use error::BlockError;
|
||||
pub use primitives::groth16;
|
||||
|
|
|
@ -48,6 +48,7 @@ atty = "0.2.14"
|
|||
sentry = { version = "0.23.0", default-features = false, features = ["backtrace", "contexts", "reqwest", "rustls"] }
|
||||
sentry-tracing = "0.23.0"
|
||||
|
||||
num-integer = "0.1.44"
|
||||
rand = "0.8.5"
|
||||
|
||||
[build-dependencies]
|
||||
|
|
|
@ -54,17 +54,23 @@
|
|||
//! * runs in the background and gossips newly added mempool transactions
|
||||
//! to peers
|
||||
|
||||
use std::{cmp::max, time::Duration};
|
||||
use std::{cmp::max, ops::Add, time::Duration};
|
||||
|
||||
use abscissa_core::{config, Command, FrameworkError, Options, Runnable};
|
||||
use chrono::Utc;
|
||||
use color_eyre::eyre::{eyre, Report};
|
||||
use futures::FutureExt;
|
||||
use num_integer::div_ceil;
|
||||
use tokio::{pin, select, sync::oneshot};
|
||||
use tower::{builder::ServiceBuilder, util::BoxService};
|
||||
use tracing_futures::Instrument;
|
||||
|
||||
use zebra_chain::{chain_tip::ChainTip, parameters::Network};
|
||||
use zebra_chain::{
|
||||
block::Height,
|
||||
chain_tip::ChainTip,
|
||||
parameters::{Network, NetworkUpgrade, POST_BLOSSOM_POW_TARGET_SPACING},
|
||||
};
|
||||
use zebra_consensus::CheckpointList;
|
||||
|
||||
use crate::{
|
||||
components::{
|
||||
|
@ -315,9 +321,71 @@ impl StartCmd {
|
|||
// TODO: replace with `MAX_CLOSE_TO_TIP_BLOCKS` after fixing slow syncing near tip (#3375)
|
||||
const MIN_SYNC_WARNING_BLOCKS: i32 = 60;
|
||||
|
||||
// The number of fractional digits in sync percentages.
|
||||
const SYNC_PERCENT_FRAC_DIGITS: usize = 3;
|
||||
|
||||
// The minimum number of extra blocks mined between updating a checkpoint list,
|
||||
// and running an automated test that depends on that list.
|
||||
//
|
||||
// Makes sure that the block finalization code always runs in sync tests,
|
||||
// even if the miner or test node clock is wrong by a few minutes.
|
||||
//
|
||||
// This is an estimate based on the time it takes to:
|
||||
// - get the tip height from `zcashd`,
|
||||
// - run `zebra-checkpoints` to update the checkpoint list,
|
||||
// - submit a pull request, and
|
||||
// - run a CI test that logs progress based on the new checkpoint height.
|
||||
//
|
||||
// We might add tests that sync from a cached tip state,
|
||||
// so we only allow a few extra blocks here.
|
||||
const MIN_BLOCKS_MINED_AFTER_CHECKPOINT_UPDATE: i32 = 10;
|
||||
|
||||
// The minimum number of extra blocks after the highest checkpoint, based on:
|
||||
// - the non-finalized state limit, and
|
||||
// - the minimum number of extra blocks mined between a checkpoint update,
|
||||
// and the automated tests for that update.
|
||||
let min_after_checkpoint_blocks = i32::try_from(zebra_state::MAX_BLOCK_REORG_HEIGHT)
|
||||
.expect("constant fits in i32")
|
||||
+ MIN_BLOCKS_MINED_AFTER_CHECKPOINT_UPDATE;
|
||||
|
||||
// The minimum height of the valid best chain, based on:
|
||||
// - the hard-coded checkpoint height,
|
||||
// - the minimum number of blocks after the highest checkpoint.
|
||||
let after_checkpoint_height = CheckpointList::new(network)
|
||||
.max_height()
|
||||
.add(min_after_checkpoint_blocks)
|
||||
.expect("hard-coded checkpoint height is far below Height::MAX");
|
||||
|
||||
let target_block_spacing = NetworkUpgrade::target_spacing_for_height(network, Height::MAX);
|
||||
let max_block_spacing =
|
||||
NetworkUpgrade::minimum_difficulty_spacing_for_height(network, Height::MAX);
|
||||
|
||||
// We expect the state height to increase at least once in this interval.
|
||||
//
|
||||
// Most chain forks are 1-7 blocks long.
|
||||
//
|
||||
// TODO: remove the target_block_spacing multiplier,
|
||||
// after fixing slow syncing near tip (#3375)
|
||||
let min_state_block_interval = max_block_spacing.unwrap_or(target_block_spacing * 4) * 2;
|
||||
|
||||
// Formatted string for logging.
|
||||
let max_block_spacing = max_block_spacing
|
||||
.map(|duration| duration.to_string())
|
||||
.unwrap_or_else(|| "None".to_string());
|
||||
|
||||
// The last time we downloaded and verified at least one block.
|
||||
//
|
||||
// Initialized to the start time to simplify the code.
|
||||
let mut last_state_change_time = Utc::now();
|
||||
|
||||
// The state tip height, when we last downloaded and verified at least one block.
|
||||
//
|
||||
// Initialized to the genesis height to simplify the code.
|
||||
let mut last_state_change_height = Height(0);
|
||||
|
||||
loop {
|
||||
let now = Utc::now();
|
||||
let is_close_to_tip = sync_status.is_close_to_tip();
|
||||
let is_syncer_stopped = sync_status.is_close_to_tip();
|
||||
|
||||
if let Some(estimated_height) =
|
||||
latest_chain_tip.estimate_network_chain_tip_height(network, now)
|
||||
|
@ -328,49 +396,133 @@ impl StartCmd {
|
|||
.best_tip_height()
|
||||
.expect("unexpected empty state: estimate requires a block height");
|
||||
|
||||
// Work out the sync progress towards the estimated tip.
|
||||
let sync_progress = f64::from(current_height.0) / f64::from(estimated_height.0);
|
||||
let sync_percent = format!("{:.3}", sync_progress * 100.0);
|
||||
let sync_percent = format!(
|
||||
"{:.frac$} %",
|
||||
sync_progress * 100.0,
|
||||
frac = SYNC_PERCENT_FRAC_DIGITS,
|
||||
);
|
||||
|
||||
let remaining_sync_blocks = estimated_height - current_height;
|
||||
|
||||
// TODO:
|
||||
// - estimate the remaining sync time
|
||||
// - log progress, remaining blocks, and remaining time to next network upgrade
|
||||
// - also add this info to the metrics
|
||||
// Work out how long it has been since the state height has increased.
|
||||
//
|
||||
// Non-finalized forks can decrease the height, we only want to track increases.
|
||||
if current_height > last_state_change_height {
|
||||
last_state_change_height = current_height;
|
||||
last_state_change_time = now;
|
||||
}
|
||||
|
||||
if is_close_to_tip && remaining_sync_blocks > MIN_SYNC_WARNING_BLOCKS {
|
||||
let time_since_last_state_block = last_state_change_time.signed_duration_since(now);
|
||||
|
||||
// TODO:
|
||||
// - log progress, remaining blocks, and remaining time to next network upgrade
|
||||
// - add some of this info to the metrics
|
||||
|
||||
if time_since_last_state_block > min_state_block_interval {
|
||||
// The state tip height hasn't increased for a long time.
|
||||
//
|
||||
// Block verification can fail if the local node's clock is wrong.
|
||||
warn!(
|
||||
%sync_percent,
|
||||
?current_height,
|
||||
%time_since_last_state_block,
|
||||
%target_block_spacing,
|
||||
%max_block_spacing,
|
||||
?is_syncer_stopped,
|
||||
"chain updates have stalled, \
|
||||
state height has not increased for {} minutes. \
|
||||
Hint: check your network connection, \
|
||||
and your computer clock and time zone",
|
||||
time_since_last_state_block.num_minutes(),
|
||||
);
|
||||
} else if is_syncer_stopped && remaining_sync_blocks > MIN_SYNC_WARNING_BLOCKS {
|
||||
// We've stopped syncing blocks, but we estimate we're a long way from the tip.
|
||||
//
|
||||
// TODO: warn after fixing slow syncing near tip (#3375)
|
||||
info!(
|
||||
%sync_percent,
|
||||
?current_height,
|
||||
?remaining_sync_blocks,
|
||||
"initial sync might have stalled, or estimated tip is wrong. \
|
||||
Hint: check that your computer's clock and time zone are correct"
|
||||
?after_checkpoint_height,
|
||||
%time_since_last_state_block,
|
||||
"initial sync is very slow, or estimated tip is wrong. \
|
||||
Hint: check your network connection, \
|
||||
and your computer clock and time zone",
|
||||
);
|
||||
} else if is_close_to_tip {
|
||||
} else if is_syncer_stopped && current_height <= after_checkpoint_height {
|
||||
// We've stopped syncing blocks,
|
||||
// but we're below the minimum height estimated from our checkpoints.
|
||||
let min_minutes_after_checkpoint_update: i64 = div_ceil(
|
||||
i64::from(MIN_BLOCKS_MINED_AFTER_CHECKPOINT_UPDATE)
|
||||
* POST_BLOSSOM_POW_TARGET_SPACING,
|
||||
60,
|
||||
);
|
||||
|
||||
warn!(
|
||||
%sync_percent,
|
||||
?current_height,
|
||||
?remaining_sync_blocks,
|
||||
?after_checkpoint_height,
|
||||
%time_since_last_state_block,
|
||||
"initial sync is very slow, and state is below the highest checkpoint. \
|
||||
Hint: check your network connection, \
|
||||
and your computer clock and time zone. \
|
||||
Dev Hint: were the checkpoints updated in the last {} minutes?",
|
||||
min_minutes_after_checkpoint_update,
|
||||
);
|
||||
} else if is_syncer_stopped {
|
||||
// We've stayed near the tip for a while, and we've stopped syncing lots of blocks.
|
||||
//
|
||||
// TODO: reduce this log level after full sync tests are merged,
|
||||
// and have been stable for a while (#1592)
|
||||
// So we're mostly using gossiped blocks now.
|
||||
info!(
|
||||
%sync_percent,
|
||||
?current_height,
|
||||
?remaining_sync_blocks,
|
||||
"finished initial sync to chain tip"
|
||||
%time_since_last_state_block,
|
||||
"finished initial sync to chain tip, using gossiped blocks",
|
||||
);
|
||||
} else if remaining_sync_blocks <= MAX_CLOSE_TO_TIP_BLOCKS {
|
||||
// We estimate we're near the tip, but we have been syncing lots of blocks recently.
|
||||
// We might also be using some gossiped blocks.
|
||||
info!(
|
||||
%sync_percent,
|
||||
?current_height,
|
||||
?remaining_sync_blocks,
|
||||
"finished initial sync to chain tip"
|
||||
%time_since_last_state_block,
|
||||
"close to finishing initial sync, \
|
||||
confirming using syncer and gossiped blocks",
|
||||
);
|
||||
} else {
|
||||
// We estimate we're far from the tip, and we've been syncing lots of blocks.
|
||||
info!(
|
||||
%sync_percent,
|
||||
?current_height,
|
||||
?remaining_sync_blocks,
|
||||
"estimated progress to chain tip"
|
||||
%time_since_last_state_block,
|
||||
"estimated progress to chain tip",
|
||||
);
|
||||
}
|
||||
} else {
|
||||
let sync_percent = format!("{:.frac$} %", 0.0f64, frac = SYNC_PERCENT_FRAC_DIGITS,);
|
||||
|
||||
if is_syncer_stopped {
|
||||
// We've stopped syncing blocks,
|
||||
// but we haven't downloaded and verified the genesis block.
|
||||
warn!(
|
||||
%sync_percent,
|
||||
current_height = %"None",
|
||||
"initial sync can't download and verify the genesis block. \
|
||||
Hint: check your network connection, \
|
||||
and your computer clock and time zone",
|
||||
);
|
||||
} else {
|
||||
// We're waiting for the genesis block to be committed to the state,
|
||||
// before we can estimate the best chain tip.
|
||||
info!(
|
||||
%sync_percent,
|
||||
current_height = %"None",
|
||||
"initial sync is waiting to download the genesis block",
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -739,9 +739,14 @@ const LARGE_CHECKPOINT_TEST_HEIGHT: Height =
|
|||
|
||||
const STOP_AT_HEIGHT_REGEX: &str = "stopping at configured height";
|
||||
|
||||
/// The text that should be logged when synchronization finishes and reaches the estimated chain
|
||||
/// tip.
|
||||
const SYNC_FINISHED_REGEX: &str = "finished initial sync to chain tip";
|
||||
/// The text that should be logged when the initial sync finishes at the estimated chain tip.
|
||||
///
|
||||
/// This message is only logged if:
|
||||
/// - we have reached the estimated chain tip,
|
||||
/// - we have synced all known checkpoints,
|
||||
/// - the syncer has stopped downloading lots of blocks, and
|
||||
/// - we are regularly downloading some blocks via the syncer or block gossip.
|
||||
const SYNC_FINISHED_REGEX: &str = "finished initial sync to chain tip, using gossiped blocks";
|
||||
|
||||
/// The maximum amount of time Zebra should take to reload after shutting down.
|
||||
///
|
||||
|
@ -903,7 +908,7 @@ fn sync_large_checkpoints_mempool_mainnet() -> Result<()> {
|
|||
#[test]
|
||||
#[ignore]
|
||||
fn full_sync_mainnet() {
|
||||
assert!(full_sync_test(Mainnet, "FULL_SYNC_MAINNET_TIMEOUT_MINUTES").is_ok());
|
||||
full_sync_test(Mainnet, "FULL_SYNC_MAINNET_TIMEOUT_MINUTES").expect("unexpected test failure");
|
||||
}
|
||||
|
||||
/// Test if `zebrad` can fully sync the chain on testnet.
|
||||
|
@ -914,7 +919,7 @@ fn full_sync_mainnet() {
|
|||
#[test]
|
||||
#[ignore]
|
||||
fn full_sync_testnet() {
|
||||
assert!(full_sync_test(Testnet, "FULL_SYNC_TESTNET_TIMEOUT_MINUTES").is_ok());
|
||||
full_sync_test(Testnet, "FULL_SYNC_TESTNET_TIMEOUT_MINUTES").expect("unexpected test failure");
|
||||
}
|
||||
|
||||
/// Sync `network` until the chain tip is reached, or a timeout elapses.
|
||||
|
@ -939,6 +944,13 @@ fn full_sync_test(network: Network, timeout_argument_name: &'static str) -> Resu
|
|||
)
|
||||
.map(|_| ())
|
||||
} else {
|
||||
tracing::info!(
|
||||
?network,
|
||||
"skipped full sync test, \
|
||||
set the {:?} environmental variable to run the test",
|
||||
timeout_argument_name,
|
||||
);
|
||||
|
||||
Ok(())
|
||||
}
|
||||
}
|
||||
|
@ -950,9 +962,8 @@ fn full_sync_test(network: Network, timeout_argument_name: &'static str) -> Resu
|
|||
/// If `check_legacy_chain` is true,
|
||||
/// make sure the logs contain the legacy chain check.
|
||||
///
|
||||
/// If `enable_mempool_at_height` is `Some(Height(_))`,
|
||||
/// configure `zebrad` to debug-enable the mempool at that height.
|
||||
/// Then check the logs for the mempool being enabled.
|
||||
/// Configure `zebrad` to debug-enable the mempool based on `mempool_behavior`,
|
||||
/// then check the logs for the expected `mempool_behavior`.
|
||||
///
|
||||
/// If `stop_regex` is encountered before the process exits, kills the
|
||||
/// process, and mark the test as successful, even if `height` has not
|
||||
|
@ -1115,6 +1126,10 @@ fn create_cached_database(network: Network) -> Result<()> {
|
|||
true,
|
||||
|test_child: &mut TestChild<PathBuf>| {
|
||||
// make sure pre-cached databases finish before the mandatory checkpoint
|
||||
//
|
||||
// TODO: this check passes even if we reach the mandatory checkpoint,
|
||||
// because we sync finalized state, then non-finalized state.
|
||||
// Instead, fail if we see "best non-finalized chain root" in the logs.
|
||||
test_child.expect_stdout_line_matches("CommitFinalized request")?;
|
||||
Ok(())
|
||||
},
|
||||
|
|
Loading…
Reference in New Issue