fix(test): make full sync test more accurate (#3555)

* feat(log): log current height when logging sync progress

* fix(test): log the specific error when full sync tests fail

* doc(start): remove an obsolete TODO

We can't decrease this log level, because the tests rely on it.

* fix(test): wait until mempool activates in full sync tests

Changes the log message and log test so that the test only finishes
when the mempool has activated.

There is still a race condition here, between the log timer and
mempool activation. But it should be very rare, because the mempool
is activated immediately when `is_close_to_tip()` becomes true.

* fix(test): warn when Zebra stalls below the maximum checkpoint height

This also improves the full sync tests,
because the warning is checked before logging a successful sync.

* feat(log): warn when sync stalls downloading the genesis block

* fix(test): warn when the state hasn't committed a block for a long time

This also improves the full sync tests,
because the warning is checked before logging a successful sync.

* doc(test): update some sync acceptance test comments

* fix(log): use Display formatting to log chrono::Duration

Debug formatting is complicated and hard to read.

* fix(log): stop saying that we've activated the mempool without checking it

We're not checking if the mempool is active, so we can't say that.

* fix(log): minor tidying and TODOs

* fix(doc): fix a typo in the tests

* fix(log): explain the post-checkpoint blocks in progress warning calculations

* fix(doc): explain what could happen if we don't wait for extra blocks

* fix(log): add a percent symbol to a percent log

Co-authored-by: Janito Vaqueiro Ferreira Filho <janito.vff@gmail.com>

Co-authored-by: Janito Vaqueiro Ferreira Filho <janito.vff@gmail.com>
Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>
This commit is contained in:
teor 2022-02-18 12:00:24 +10:00 committed by GitHub
parent ea7253ee5b
commit 5c60528b84
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 200 additions and 29 deletions

1
Cargo.lock generated
View File

@ -5734,6 +5734,7 @@ dependencies = [
"lazy_static",
"metrics",
"metrics-exporter-prometheus",
"num-integer",
"once_cell",
"pin-project 1.0.7",
"proptest",

View File

@ -42,7 +42,7 @@ mod types;
#[cfg(test)]
mod tests;
pub(crate) use list::CheckpointList;
pub use list::CheckpointList;
use types::{Progress, Progress::*};
use types::{TargetHeight, TargetHeight::*};

View File

@ -53,7 +53,7 @@ const TESTNET_CHECKPOINTS: &str = include_str!("test-checkpoints.txt");
/// This is actually a bijective map, but since it is read-only, we use a
/// BTreeMap, and do the value uniqueness check on initialisation.
#[derive(Clone, Debug, Eq, Hash, PartialEq)]
pub(crate) struct CheckpointList(BTreeMap<block::Height, block::Hash>);
pub struct CheckpointList(BTreeMap<block::Height, block::Hash>);
impl FromStr for CheckpointList {
type Err = BoxError;

View File

@ -48,7 +48,9 @@ pub mod chain;
pub mod error;
pub use block::VerifyBlockError;
pub use checkpoint::{VerifyCheckpointError, MAX_CHECKPOINT_BYTE_COUNT, MAX_CHECKPOINT_HEIGHT_GAP};
pub use checkpoint::{
CheckpointList, VerifyCheckpointError, MAX_CHECKPOINT_BYTE_COUNT, MAX_CHECKPOINT_HEIGHT_GAP,
};
pub use config::Config;
pub use error::BlockError;
pub use primitives::groth16;

View File

@ -48,6 +48,7 @@ atty = "0.2.14"
sentry = { version = "0.23.0", default-features = false, features = ["backtrace", "contexts", "reqwest", "rustls"] }
sentry-tracing = "0.23.0"
num-integer = "0.1.44"
rand = "0.8.5"
[build-dependencies]

View File

@ -54,17 +54,23 @@
//! * runs in the background and gossips newly added mempool transactions
//! to peers
use std::{cmp::max, time::Duration};
use std::{cmp::max, ops::Add, time::Duration};
use abscissa_core::{config, Command, FrameworkError, Options, Runnable};
use chrono::Utc;
use color_eyre::eyre::{eyre, Report};
use futures::FutureExt;
use num_integer::div_ceil;
use tokio::{pin, select, sync::oneshot};
use tower::{builder::ServiceBuilder, util::BoxService};
use tracing_futures::Instrument;
use zebra_chain::{chain_tip::ChainTip, parameters::Network};
use zebra_chain::{
block::Height,
chain_tip::ChainTip,
parameters::{Network, NetworkUpgrade, POST_BLOSSOM_POW_TARGET_SPACING},
};
use zebra_consensus::CheckpointList;
use crate::{
components::{
@ -315,9 +321,71 @@ impl StartCmd {
// TODO: replace with `MAX_CLOSE_TO_TIP_BLOCKS` after fixing slow syncing near tip (#3375)
const MIN_SYNC_WARNING_BLOCKS: i32 = 60;
// The number of fractional digits in sync percentages.
const SYNC_PERCENT_FRAC_DIGITS: usize = 3;
// The minimum number of extra blocks mined between updating a checkpoint list,
// and running an automated test that depends on that list.
//
// Makes sure that the block finalization code always runs in sync tests,
// even if the miner or test node clock is wrong by a few minutes.
//
// This is an estimate based on the time it takes to:
// - get the tip height from `zcashd`,
// - run `zebra-checkpoints` to update the checkpoint list,
// - submit a pull request, and
// - run a CI test that logs progress based on the new checkpoint height.
//
// We might add tests that sync from a cached tip state,
// so we only allow a few extra blocks here.
const MIN_BLOCKS_MINED_AFTER_CHECKPOINT_UPDATE: i32 = 10;
// The minimum number of extra blocks after the highest checkpoint, based on:
// - the non-finalized state limit, and
// - the minimum number of extra blocks mined between a checkpoint update,
// and the automated tests for that update.
let min_after_checkpoint_blocks = i32::try_from(zebra_state::MAX_BLOCK_REORG_HEIGHT)
.expect("constant fits in i32")
+ MIN_BLOCKS_MINED_AFTER_CHECKPOINT_UPDATE;
// The minimum height of the valid best chain, based on:
// - the hard-coded checkpoint height,
// - the minimum number of blocks after the highest checkpoint.
let after_checkpoint_height = CheckpointList::new(network)
.max_height()
.add(min_after_checkpoint_blocks)
.expect("hard-coded checkpoint height is far below Height::MAX");
let target_block_spacing = NetworkUpgrade::target_spacing_for_height(network, Height::MAX);
let max_block_spacing =
NetworkUpgrade::minimum_difficulty_spacing_for_height(network, Height::MAX);
// We expect the state height to increase at least once in this interval.
//
// Most chain forks are 1-7 blocks long.
//
// TODO: remove the target_block_spacing multiplier,
// after fixing slow syncing near tip (#3375)
let min_state_block_interval = max_block_spacing.unwrap_or(target_block_spacing * 4) * 2;
// Formatted string for logging.
let max_block_spacing = max_block_spacing
.map(|duration| duration.to_string())
.unwrap_or_else(|| "None".to_string());
// The last time we downloaded and verified at least one block.
//
// Initialized to the start time to simplify the code.
let mut last_state_change_time = Utc::now();
// The state tip height, when we last downloaded and verified at least one block.
//
// Initialized to the genesis height to simplify the code.
let mut last_state_change_height = Height(0);
loop {
let now = Utc::now();
let is_close_to_tip = sync_status.is_close_to_tip();
let is_syncer_stopped = sync_status.is_close_to_tip();
if let Some(estimated_height) =
latest_chain_tip.estimate_network_chain_tip_height(network, now)
@ -328,49 +396,133 @@ impl StartCmd {
.best_tip_height()
.expect("unexpected empty state: estimate requires a block height");
// Work out the sync progress towards the estimated tip.
let sync_progress = f64::from(current_height.0) / f64::from(estimated_height.0);
let sync_percent = format!("{:.3}", sync_progress * 100.0);
let sync_percent = format!(
"{:.frac$} %",
sync_progress * 100.0,
frac = SYNC_PERCENT_FRAC_DIGITS,
);
let remaining_sync_blocks = estimated_height - current_height;
// TODO:
// - estimate the remaining sync time
// - log progress, remaining blocks, and remaining time to next network upgrade
// - also add this info to the metrics
// Work out how long it has been since the state height has increased.
//
// Non-finalized forks can decrease the height, we only want to track increases.
if current_height > last_state_change_height {
last_state_change_height = current_height;
last_state_change_time = now;
}
if is_close_to_tip && remaining_sync_blocks > MIN_SYNC_WARNING_BLOCKS {
let time_since_last_state_block = last_state_change_time.signed_duration_since(now);
// TODO:
// - log progress, remaining blocks, and remaining time to next network upgrade
// - add some of this info to the metrics
if time_since_last_state_block > min_state_block_interval {
// The state tip height hasn't increased for a long time.
//
// Block verification can fail if the local node's clock is wrong.
warn!(
%sync_percent,
?current_height,
%time_since_last_state_block,
%target_block_spacing,
%max_block_spacing,
?is_syncer_stopped,
"chain updates have stalled, \
state height has not increased for {} minutes. \
Hint: check your network connection, \
and your computer clock and time zone",
time_since_last_state_block.num_minutes(),
);
} else if is_syncer_stopped && remaining_sync_blocks > MIN_SYNC_WARNING_BLOCKS {
// We've stopped syncing blocks, but we estimate we're a long way from the tip.
//
// TODO: warn after fixing slow syncing near tip (#3375)
info!(
%sync_percent,
?current_height,
?remaining_sync_blocks,
"initial sync might have stalled, or estimated tip is wrong. \
Hint: check that your computer's clock and time zone are correct"
?after_checkpoint_height,
%time_since_last_state_block,
"initial sync is very slow, or estimated tip is wrong. \
Hint: check your network connection, \
and your computer clock and time zone",
);
} else if is_close_to_tip {
} else if is_syncer_stopped && current_height <= after_checkpoint_height {
// We've stopped syncing blocks,
// but we're below the minimum height estimated from our checkpoints.
let min_minutes_after_checkpoint_update: i64 = div_ceil(
i64::from(MIN_BLOCKS_MINED_AFTER_CHECKPOINT_UPDATE)
* POST_BLOSSOM_POW_TARGET_SPACING,
60,
);
warn!(
%sync_percent,
?current_height,
?remaining_sync_blocks,
?after_checkpoint_height,
%time_since_last_state_block,
"initial sync is very slow, and state is below the highest checkpoint. \
Hint: check your network connection, \
and your computer clock and time zone. \
Dev Hint: were the checkpoints updated in the last {} minutes?",
min_minutes_after_checkpoint_update,
);
} else if is_syncer_stopped {
// We've stayed near the tip for a while, and we've stopped syncing lots of blocks.
//
// TODO: reduce this log level after full sync tests are merged,
// and have been stable for a while (#1592)
// So we're mostly using gossiped blocks now.
info!(
%sync_percent,
?current_height,
?remaining_sync_blocks,
"finished initial sync to chain tip"
%time_since_last_state_block,
"finished initial sync to chain tip, using gossiped blocks",
);
} else if remaining_sync_blocks <= MAX_CLOSE_TO_TIP_BLOCKS {
// We estimate we're near the tip, but we have been syncing lots of blocks recently.
// We might also be using some gossiped blocks.
info!(
%sync_percent,
?current_height,
?remaining_sync_blocks,
"finished initial sync to chain tip"
%time_since_last_state_block,
"close to finishing initial sync, \
confirming using syncer and gossiped blocks",
);
} else {
// We estimate we're far from the tip, and we've been syncing lots of blocks.
info!(
%sync_percent,
?current_height,
?remaining_sync_blocks,
"estimated progress to chain tip"
%time_since_last_state_block,
"estimated progress to chain tip",
);
}
} else {
let sync_percent = format!("{:.frac$} %", 0.0f64, frac = SYNC_PERCENT_FRAC_DIGITS,);
if is_syncer_stopped {
// We've stopped syncing blocks,
// but we haven't downloaded and verified the genesis block.
warn!(
%sync_percent,
current_height = %"None",
"initial sync can't download and verify the genesis block. \
Hint: check your network connection, \
and your computer clock and time zone",
);
} else {
// We're waiting for the genesis block to be committed to the state,
// before we can estimate the best chain tip.
info!(
%sync_percent,
current_height = %"None",
"initial sync is waiting to download the genesis block",
);
}
}

View File

@ -739,9 +739,14 @@ const LARGE_CHECKPOINT_TEST_HEIGHT: Height =
const STOP_AT_HEIGHT_REGEX: &str = "stopping at configured height";
/// The text that should be logged when synchronization finishes and reaches the estimated chain
/// tip.
const SYNC_FINISHED_REGEX: &str = "finished initial sync to chain tip";
/// The text that should be logged when the initial sync finishes at the estimated chain tip.
///
/// This message is only logged if:
/// - we have reached the estimated chain tip,
/// - we have synced all known checkpoints,
/// - the syncer has stopped downloading lots of blocks, and
/// - we are regularly downloading some blocks via the syncer or block gossip.
const SYNC_FINISHED_REGEX: &str = "finished initial sync to chain tip, using gossiped blocks";
/// The maximum amount of time Zebra should take to reload after shutting down.
///
@ -903,7 +908,7 @@ fn sync_large_checkpoints_mempool_mainnet() -> Result<()> {
#[test]
#[ignore]
fn full_sync_mainnet() {
assert!(full_sync_test(Mainnet, "FULL_SYNC_MAINNET_TIMEOUT_MINUTES").is_ok());
full_sync_test(Mainnet, "FULL_SYNC_MAINNET_TIMEOUT_MINUTES").expect("unexpected test failure");
}
/// Test if `zebrad` can fully sync the chain on testnet.
@ -914,7 +919,7 @@ fn full_sync_mainnet() {
#[test]
#[ignore]
fn full_sync_testnet() {
assert!(full_sync_test(Testnet, "FULL_SYNC_TESTNET_TIMEOUT_MINUTES").is_ok());
full_sync_test(Testnet, "FULL_SYNC_TESTNET_TIMEOUT_MINUTES").expect("unexpected test failure");
}
/// Sync `network` until the chain tip is reached, or a timeout elapses.
@ -939,6 +944,13 @@ fn full_sync_test(network: Network, timeout_argument_name: &'static str) -> Resu
)
.map(|_| ())
} else {
tracing::info!(
?network,
"skipped full sync test, \
set the {:?} environmental variable to run the test",
timeout_argument_name,
);
Ok(())
}
}
@ -950,9 +962,8 @@ fn full_sync_test(network: Network, timeout_argument_name: &'static str) -> Resu
/// If `check_legacy_chain` is true,
/// make sure the logs contain the legacy chain check.
///
/// If `enable_mempool_at_height` is `Some(Height(_))`,
/// configure `zebrad` to debug-enable the mempool at that height.
/// Then check the logs for the mempool being enabled.
/// Configure `zebrad` to debug-enable the mempool based on `mempool_behavior`,
/// then check the logs for the expected `mempool_behavior`.
///
/// If `stop_regex` is encountered before the process exits, kills the
/// process, and mark the test as successful, even if `height` has not
@ -1115,6 +1126,10 @@ fn create_cached_database(network: Network) -> Result<()> {
true,
|test_child: &mut TestChild<PathBuf>| {
// make sure pre-cached databases finish before the mandatory checkpoint
//
// TODO: this check passes even if we reach the mandatory checkpoint,
// because we sync finalized state, then non-finalized state.
// Instead, fail if we see "best non-finalized chain root" in the logs.
test_child.expect_stdout_line_matches("CommitFinalized request")?;
Ok(())
},