From 84f2c07fbc548f925a0c5ef16648940c3be9a168 Mon Sep 17 00:00:00 2001 From: Conrado Gouvea Date: Tue, 19 Oct 2021 22:07:19 -0300 Subject: [PATCH] Ignore AlreadyInChain error in the syncer (#2890) * Ignore AlreadyInChain error in the syncer * Split Cancelled errors; add them to should_restart_sync exceptions * Also filter 'block is already comitted'; try to detect a wrong downcast --- zebra-consensus/src/block.rs | 1 + zebra-consensus/src/checkpoint.rs | 1 + zebra-consensus/src/lib.rs | 3 + zebrad/src/components/sync.rs | 75 ++++++++++++++++++++++--- zebrad/src/components/sync/downloads.rs | 45 ++++++++++++--- 5 files changed, 109 insertions(+), 16 deletions(-) diff --git a/zebra-consensus/src/block.rs b/zebra-consensus/src/block.rs index 8db7564df..dfda62165 100644 --- a/zebra-consensus/src/block.rs +++ b/zebra-consensus/src/block.rs @@ -48,6 +48,7 @@ pub struct BlockVerifier { // TODO: dedupe with crate::error::BlockError #[non_exhaustive] +#[allow(missing_docs)] #[derive(Debug, Error)] pub enum VerifyBlockError { #[error("unable to verify depth for block {hash} from chain state during block verification")] diff --git a/zebra-consensus/src/checkpoint.rs b/zebra-consensus/src/checkpoint.rs index 965b66813..b83c03467 100644 --- a/zebra-consensus/src/checkpoint.rs +++ b/zebra-consensus/src/checkpoint.rs @@ -836,6 +836,7 @@ where } #[derive(Debug, Error)] +#[allow(missing_docs)] pub enum VerifyCheckpointError { #[error("checkpoint request after the final checkpoint has been verified")] Finished, diff --git a/zebra-consensus/src/lib.rs b/zebra-consensus/src/lib.rs index ca55b0bd9..56e434931 100644 --- a/zebra-consensus/src/lib.rs +++ b/zebra-consensus/src/lib.rs @@ -53,9 +53,12 @@ pub mod chain; #[allow(missing_docs)] pub mod error; +pub use block::VerifyBlockError; +pub use checkpoint::VerifyCheckpointError; pub use checkpoint::MAX_CHECKPOINT_BYTE_COUNT; pub use checkpoint::MAX_CHECKPOINT_HEIGHT_GAP; pub use config::Config; +pub use error::BlockError; /// A boxed [`std::error::Error`]. pub type BoxError = Box; diff --git a/zebrad/src/components/sync.rs b/zebrad/src/components/sync.rs index 018e1a71f..66185510e 100644 --- a/zebrad/src/components/sync.rs +++ b/zebrad/src/components/sync.rs @@ -15,10 +15,15 @@ use zebra_chain::{ block::{self, Block}, parameters::genesis_hash, }; +use zebra_consensus::{ + chain::VerifyChainError, BlockError, VerifyBlockError, VerifyCheckpointError, +}; use zebra_network as zn; use zebra_state as zs; -use crate::{config::ZebradConfig, BoxError}; +use crate::{ + components::sync::downloads::BlockDownloadVerifyError, config::ZebradConfig, BoxError, +}; mod downloads; mod gossip; @@ -315,10 +320,7 @@ where tracing::trace!(?hash, "verified and committed block to state"); } Err(e) => { - if format!("{:?}", e).contains("AlreadyVerified {") { - tracing::debug!(error = ?e, "block was already verified, possibly from a previous sync run, continuing"); - } else { - tracing::warn!(?e, "error downloading and verifying block"); + if Self::should_restart_sync(e) { continue 'sync; } } @@ -352,10 +354,7 @@ where } Err(e) => { - if format!("{:?}", e).contains("AlreadyVerified {") { - tracing::debug!(error = ?e, "block was already verified, possibly from a previous sync run, continuing"); - } else { - tracing::warn!(?e, "error downloading and verifying block"); + if Self::should_restart_sync(e) { continue 'sync; } } @@ -715,4 +714,62 @@ where self.downloads.in_flight() as f64 ); } + + /// Return if the sync should be restarted based on the given error + /// from the block downloader and verifier stream. + fn should_restart_sync(e: BlockDownloadVerifyError) -> bool { + match e { + BlockDownloadVerifyError::Invalid(VerifyChainError::Checkpoint( + VerifyCheckpointError::AlreadyVerified { .. }, + )) => { + tracing::debug!(error = ?e, "block was already verified, possibly from a previous sync run, continuing"); + false + } + BlockDownloadVerifyError::Invalid(VerifyChainError::Block( + VerifyBlockError::Block { + source: BlockError::AlreadyInChain(_, _), + }, + )) => { + tracing::debug!(error = ?e, "block is already in chain, possibly from a previous sync run, continuing"); + false + } + BlockDownloadVerifyError::Invalid(VerifyChainError::Block( + VerifyBlockError::Commit(ref source), + )) if format!("{:?}", source).contains("block is already committed to the state") => { + // TODO: improve this by checking the type + // https://github.com/ZcashFoundation/zebra/issues/2908 + tracing::debug!(error = ?e, "block is already committed, possibly from a previous sync run, continuing"); + false + } + BlockDownloadVerifyError::CancelledDuringDownload + | BlockDownloadVerifyError::CancelledDuringVerification => { + tracing::debug!(error = ?e, "block verification was cancelled, continuing"); + false + } + _ => { + // download_and_verify downcasts errors from the block verifier + // into VerifyChainError, and puts the result inside one of the + // BlockDownloadVerifyError enumerations. This downcast could + // become incorrect e.g. after some refactoring, and it is difficult + // to write a test to check it. The test below is a best-effort + // attempt to catch if that happens and log it. + // TODO: add a proper test and remove this + // https://github.com/ZcashFoundation/zebra/issues/2909 + let err_str = format!("{:?}", e); + if err_str.contains("AlreadyVerified") + || err_str.contains("AlreadyInChain") + || err_str.contains("block is already committed to the state") + { + tracing::error!(?e, + "a BlockDownloadVerifyError that should have been filtered out was detected, \ + which possibly indicates a programming error in the downcast inside \ + zebrad::components::sync::downloads::Downloads::download_and_verify" + ) + } + + tracing::warn!(?e, "error downloading and verifying block"); + true + } + } + } } diff --git a/zebrad/src/components/sync/downloads.rs b/zebrad/src/components/sync/downloads.rs index ce5089c2d..634842045 100644 --- a/zebrad/src/components/sync/downloads.rs +++ b/zebrad/src/components/sync/downloads.rs @@ -12,6 +12,7 @@ use futures::{ stream::{FuturesUnordered, Stream}, }; use pin_project::pin_project; +use thiserror::Error; use tokio::{sync::oneshot, task::JoinHandle}; use tower::{hedge, Service, ServiceExt}; use tracing_futures::Instrument; @@ -33,6 +34,26 @@ impl hedge::Policy for AlwaysHedge { } } +/// Errors that can occur while downloading and verifying a block. +#[derive(Error, Debug)] +#[allow(dead_code)] +pub enum BlockDownloadVerifyError { + #[error("error downloading block")] + DownloadFailed(#[source] BoxError), + + #[error("error from the verifier service")] + VerifierError(#[source] BoxError), + + #[error("block did not pass consensus validation")] + Invalid(#[from] zebra_consensus::chain::VerifyChainError), + + #[error("block download / verification was cancelled during download")] + CancelledDuringDownload, + + #[error("block download / verification was cancelled during verification")] + CancelledDuringVerification, +} + /// Represents a [`Stream`] of download and verification tasks during chain sync. #[pin_project] #[derive(Debug)] @@ -54,7 +75,8 @@ where // Internal downloads state /// A list of pending block download and verify tasks. #[pin] - pending: FuturesUnordered>>, + pending: + FuturesUnordered>>, /// A list of channels that can be used to cancel pending block download and /// verify tasks. @@ -68,7 +90,7 @@ where ZV: Service, Response = block::Hash, Error = BoxError> + Send + Clone + 'static, ZV::Future: Send, { - type Item = Result; + type Item = Result; fn poll_next(self: Pin<&mut Self>, cx: &mut Context) -> Poll> { let this = self.project(); @@ -163,9 +185,9 @@ where _ = &mut cancel_rx => { tracing::trace!("task cancelled prior to download completion"); metrics::counter!("sync.cancelled.download.count", 1); - return Err("canceled block_fetch_verify".into()) + return Err(BlockDownloadVerifyError::CancelledDuringDownload) } - rsp = block_req => rsp?, + rsp = block_req => rsp.map_err(BlockDownloadVerifyError::DownloadFailed)?, }; let block = if let zn::Response::Blocks(blocks) = rsp { @@ -178,14 +200,18 @@ where }; metrics::counter!("sync.downloaded.block.count", 1); - let rsp = verifier.ready_and().await?.call(block); + let rsp = verifier + .ready_and() + .await + .map_err(BlockDownloadVerifyError::VerifierError)? + .call(block); // TODO: if the verifier and cancel are both ready, which should // we prefer? (Currently, select! chooses one at random.) let verification = tokio::select! { _ = &mut cancel_rx => { tracing::trace!("task cancelled prior to verification"); metrics::counter!("sync.cancelled.verify.count", 1); - return Err("canceled block_fetch_verify".into()) + return Err(BlockDownloadVerifyError::CancelledDuringVerification) } verification = rsp => verification, }; @@ -193,7 +219,12 @@ where metrics::counter!("sync.verified.block.count", 1); } - verification + verification.map_err(|err| { + match err.downcast::() { + Ok(e) => BlockDownloadVerifyError::Invalid(*e), + Err(e) => BlockDownloadVerifyError::VerifierError(e), + } + }) } .in_current_span() // Tack the hash onto the error so we can remove the cancel handle