fix(panic): Stop panicking on async task cancellation on shutdown in network and state futures (#7219)
* Add an async-error feature and an initial module structure * Implement checking for panics in OS threads and async tasks * Implement waiting for panics in OS threads and async tasks * Add a TODO to simplify some state request error handling * Use the new panic-checking methods in zebra-state * Use new panic-checking methods in zebra-network * fixup! Implement waiting for panics in OS threads and async tasks * Replace existing async code with generic panic-checking methods * Simplify trait to a single method * Move thread panic code into generic trait impls * Simplify option handling Co-authored-by: Arya <aryasolhi@gmail.com> * Fix comment Co-authored-by: Arya <aryasolhi@gmail.com> * Add missing track_caller --------- Co-authored-by: Arya <aryasolhi@gmail.com>
This commit is contained in:
parent
c885de4abb
commit
3bbe3cec4f
|
@ -24,6 +24,11 @@ json-conversion = [
|
|||
"serde_json",
|
||||
]
|
||||
|
||||
# Async error handling convenience traits
|
||||
async-error = [
|
||||
"tokio",
|
||||
]
|
||||
|
||||
# Experimental mining RPC support
|
||||
getblocktemplate-rpcs = [
|
||||
"zcash_address",
|
||||
|
@ -39,7 +44,7 @@ proptest-impl = [
|
|||
"proptest-derive",
|
||||
"rand",
|
||||
"rand_chacha",
|
||||
"tokio",
|
||||
"tokio/tracing",
|
||||
"zebra-test",
|
||||
]
|
||||
|
||||
|
@ -108,6 +113,9 @@ reddsa = "0.5.0"
|
|||
# Production feature json-conversion
|
||||
serde_json = { version = "1.0.100", optional = true }
|
||||
|
||||
# Production feature async-error and testing feature proptest-impl
|
||||
tokio = { version = "1.29.1", optional = true }
|
||||
|
||||
# Experimental feature getblocktemplate-rpcs
|
||||
zcash_address = { version = "0.3.0", optional = true }
|
||||
|
||||
|
@ -118,8 +126,6 @@ proptest-derive = { version = "0.3.0", optional = true }
|
|||
rand = { version = "0.8.5", optional = true }
|
||||
rand_chacha = { version = "0.3.1", optional = true }
|
||||
|
||||
tokio = { version = "1.29.1", features = ["tracing"], optional = true }
|
||||
|
||||
zebra-test = { path = "../zebra-test/", version = "1.0.0-beta.27", optional = true }
|
||||
|
||||
[dev-dependencies]
|
||||
|
|
|
@ -1,6 +1,15 @@
|
|||
//! Tracing the execution time of functions.
|
||||
//!
|
||||
//! TODO: also trace polling time for futures, using a `Future` wrapper
|
||||
//! Diagnostic types and functions for Zebra:
|
||||
//! - code performance
|
||||
//! - task handling
|
||||
//! - errors and panics
|
||||
|
||||
pub mod task;
|
||||
|
||||
// Tracing the execution time of functions.
|
||||
//
|
||||
// TODO:
|
||||
// - move this to a `timing` submodule
|
||||
// - also trace polling time for futures, using a `Future` wrapper
|
||||
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
|
|
|
@ -0,0 +1,47 @@
|
|||
//! Diagnostic types and functions for Zebra tasks:
|
||||
//! - OS thread handling
|
||||
//! - async future task handling
|
||||
//! - errors and panics
|
||||
|
||||
#[cfg(feature = "async-error")]
|
||||
pub mod future;
|
||||
|
||||
pub mod thread;
|
||||
|
||||
/// A trait that checks a task's return value for panics.
|
||||
pub trait CheckForPanics {
|
||||
/// The output type, after removing panics from `Self`.
|
||||
type Output;
|
||||
|
||||
/// Check if `self` contains a panic payload or an unexpected termination, then panic.
|
||||
/// Otherwise, return the non-panic part of `self`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// If `self` contains a panic payload or an unexpected termination.
|
||||
#[track_caller]
|
||||
fn check_for_panics(self) -> Self::Output;
|
||||
}
|
||||
|
||||
/// A trait that waits for a task to finish, then handles panics and cancellations.
|
||||
pub trait WaitForPanics {
|
||||
/// The underlying task output, after removing panics and unwrapping termination results.
|
||||
type Output;
|
||||
|
||||
/// Waits for `self` to finish, then check if its output is:
|
||||
/// - a panic payload: resume that panic,
|
||||
/// - an unexpected termination: panic with that error,
|
||||
/// - an expected termination: hang waiting for shutdown.
|
||||
///
|
||||
/// Otherwise, returns the task return value of `self`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// If `self` contains a panic payload or an unexpected termination.
|
||||
///
|
||||
/// # Hangs
|
||||
///
|
||||
/// If `self` contains an expected termination, and we're shutting down anyway.
|
||||
#[track_caller]
|
||||
fn wait_for_panics(self) -> Self::Output;
|
||||
}
|
|
@ -0,0 +1,93 @@
|
|||
//! Diagnostic types and functions for Zebra async future tasks:
|
||||
//! - task handles
|
||||
//! - errors and panics
|
||||
|
||||
use std::{future, panic};
|
||||
|
||||
use futures::future::{BoxFuture, FutureExt};
|
||||
use tokio::task::{JoinError, JoinHandle};
|
||||
|
||||
use crate::shutdown::is_shutting_down;
|
||||
|
||||
use super::{CheckForPanics, WaitForPanics};
|
||||
|
||||
/// This is the return type of the [`JoinHandle`] future.
|
||||
impl<T> CheckForPanics for Result<T, JoinError> {
|
||||
/// The [`JoinHandle`]'s task output, after resuming any panics,
|
||||
/// and ignoring task cancellations on shutdown.
|
||||
type Output = Result<T, JoinError>;
|
||||
|
||||
/// Returns the task result if the task finished normally.
|
||||
/// Otherwise, resumes any panics, logs unexpected errors, and ignores any expected errors.
|
||||
///
|
||||
/// If the task finished normally, returns `Some(T)`.
|
||||
/// If the task was cancelled, returns `None`.
|
||||
#[track_caller]
|
||||
fn check_for_panics(self) -> Self::Output {
|
||||
match self {
|
||||
Ok(task_output) => Ok(task_output),
|
||||
Err(join_error) => Err(join_error.check_for_panics()),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl CheckForPanics for JoinError {
|
||||
/// The [`JoinError`] after resuming any panics, and logging any unexpected task cancellations.
|
||||
type Output = JoinError;
|
||||
|
||||
/// Resume any panics and panic on unexpected task cancellations.
|
||||
/// Always returns [`JoinError::Cancelled`](JoinError::is_cancelled).
|
||||
#[track_caller]
|
||||
fn check_for_panics(self) -> Self::Output {
|
||||
match self.try_into_panic() {
|
||||
Ok(panic_payload) => panic::resume_unwind(panic_payload),
|
||||
|
||||
// We could ignore this error, but then we'd have to change the return type.
|
||||
Err(task_cancelled) if is_shutting_down() => {
|
||||
debug!(
|
||||
?task_cancelled,
|
||||
"ignoring cancelled task because Zebra is shutting down"
|
||||
);
|
||||
|
||||
task_cancelled
|
||||
}
|
||||
|
||||
Err(task_cancelled) => {
|
||||
panic!("task cancelled during normal Zebra operation: {task_cancelled:?}");
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> WaitForPanics for JoinHandle<T>
|
||||
where
|
||||
T: Send + 'static,
|
||||
{
|
||||
type Output = BoxFuture<'static, T>;
|
||||
|
||||
/// Returns a future which waits for `self` to finish, then checks if its output is:
|
||||
/// - a panic payload: resume that panic,
|
||||
/// - an unexpected termination: panic with that error,
|
||||
/// - an expected termination: hang waiting for shutdown.
|
||||
///
|
||||
/// Otherwise, returns the task return value of `self`.
|
||||
///
|
||||
/// # Panics
|
||||
///
|
||||
/// If `self` contains a panic payload, or [`JoinHandle::abort()`] has been called on `self`.
|
||||
///
|
||||
/// # Hangs
|
||||
///
|
||||
/// If `self` contains an expected termination, and we're shutting down anyway.
|
||||
/// Futures hang by returning `Pending` and not setting a waker, so this uses minimal resources.
|
||||
#[track_caller]
|
||||
fn wait_for_panics(self) -> Self::Output {
|
||||
async move {
|
||||
match self.await.check_for_panics() {
|
||||
Ok(task_output) => task_output,
|
||||
Err(_expected_cancel_error) => future::pending().await,
|
||||
}
|
||||
}
|
||||
.boxed()
|
||||
}
|
||||
}
|
|
@ -0,0 +1,108 @@
|
|||
//! Diagnostic types and functions for Zebra OS thread tasks:
|
||||
//! - task handles
|
||||
//! - errors and panics
|
||||
|
||||
use std::{
|
||||
panic,
|
||||
sync::Arc,
|
||||
thread::{self, JoinHandle},
|
||||
};
|
||||
|
||||
use super::{CheckForPanics, WaitForPanics};
|
||||
|
||||
impl<T> CheckForPanics for thread::Result<T> {
|
||||
type Output = T;
|
||||
|
||||
/// Panics if the thread panicked.
|
||||
///
|
||||
/// Threads can't be cancelled except by using a panic, so there are no thread errors here.
|
||||
#[track_caller]
|
||||
fn check_for_panics(self) -> Self::Output {
|
||||
match self {
|
||||
// The value returned by the thread when it finished.
|
||||
Ok(thread_output) => thread_output,
|
||||
|
||||
// A thread error is always a panic.
|
||||
Err(panic_payload) => panic::resume_unwind(panic_payload),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> WaitForPanics for JoinHandle<T> {
|
||||
type Output = T;
|
||||
|
||||
/// Waits for the thread to finish, then panics if the thread panicked.
|
||||
#[track_caller]
|
||||
fn wait_for_panics(self) -> Self::Output {
|
||||
self.join().check_for_panics()
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> WaitForPanics for Arc<JoinHandle<T>> {
|
||||
type Output = Option<T>;
|
||||
|
||||
/// If this is the final `Arc`, waits for the thread to finish, then panics if the thread
|
||||
/// panicked. Otherwise, returns the thread's return value.
|
||||
///
|
||||
/// If this is not the final `Arc`, drops the handle and immediately returns `None`.
|
||||
#[track_caller]
|
||||
fn wait_for_panics(self) -> Self::Output {
|
||||
// If we are the last Arc with a reference to this handle,
|
||||
// we can wait for it and propagate any panics.
|
||||
//
|
||||
// We use into_inner() because it guarantees that exactly one of the tasks gets the
|
||||
// JoinHandle. try_unwrap() lets us keep the JoinHandle, but it can also miss panics.
|
||||
//
|
||||
// This is more readable as an expanded statement.
|
||||
#[allow(clippy::manual_map)]
|
||||
if let Some(handle) = Arc::into_inner(self) {
|
||||
Some(handle.wait_for_panics())
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> CheckForPanics for &mut Option<Arc<JoinHandle<T>>> {
|
||||
type Output = Option<T>;
|
||||
|
||||
/// If this is the final `Arc`, checks if the thread has finished, then panics if the thread
|
||||
/// panicked. Otherwise, returns the thread's return value.
|
||||
///
|
||||
/// If the thread has not finished, or this is not the final `Arc`, returns `None`.
|
||||
#[track_caller]
|
||||
fn check_for_panics(self) -> Self::Output {
|
||||
let handle = self.take()?;
|
||||
|
||||
if handle.is_finished() {
|
||||
// This is the same as calling `self.wait_for_panics()`, but we can't do that,
|
||||
// because we've taken `self`.
|
||||
#[allow(clippy::manual_map)]
|
||||
return handle.wait_for_panics();
|
||||
}
|
||||
|
||||
*self = Some(handle);
|
||||
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
impl<T> WaitForPanics for &mut Option<Arc<JoinHandle<T>>> {
|
||||
type Output = Option<T>;
|
||||
|
||||
/// If this is the final `Arc`, waits for the thread to finish, then panics if the thread
|
||||
/// panicked. Otherwise, returns the thread's return value.
|
||||
///
|
||||
/// If this is not the final `Arc`, drops the handle and returns `None`.
|
||||
#[track_caller]
|
||||
fn wait_for_panics(self) -> Self::Output {
|
||||
// This is more readable as an expanded statement.
|
||||
#[allow(clippy::manual_map)]
|
||||
if let Some(output) = self.take()?.wait_for_panics() {
|
||||
Some(output)
|
||||
} else {
|
||||
// Some other task has a reference, so we should give up ours to let them use it.
|
||||
None
|
||||
}
|
||||
}
|
||||
}
|
|
@ -83,7 +83,7 @@ howudoin = { version = "0.1.2", optional = true }
|
|||
proptest = { version = "1.2.0", optional = true }
|
||||
proptest-derive = { version = "0.3.0", optional = true }
|
||||
|
||||
zebra-chain = { path = "../zebra-chain", version = "1.0.0-beta.27" }
|
||||
zebra-chain = { path = "../zebra-chain", version = "1.0.0-beta.27", features = ["async-error"] }
|
||||
|
||||
[dev-dependencies]
|
||||
proptest = "1.2.0"
|
||||
|
|
|
@ -8,7 +8,7 @@ use tokio::time::{sleep_until, timeout, Instant};
|
|||
use tower::{Service, ServiceExt};
|
||||
use tracing::Span;
|
||||
|
||||
use zebra_chain::serialization::DateTime32;
|
||||
use zebra_chain::{diagnostic::task::WaitForPanics, serialization::DateTime32};
|
||||
|
||||
use crate::{
|
||||
constants, meta_addr::MetaAddrChange, peer_set::set::MorePeers, types::MetaAddr, AddressBook,
|
||||
|
@ -348,8 +348,8 @@ where
|
|||
tokio::task::spawn_blocking(move || {
|
||||
span.in_scope(|| address_book.lock().unwrap().extend(addrs))
|
||||
})
|
||||
.wait_for_panics()
|
||||
.await
|
||||
.expect("panic in new peers address book update task");
|
||||
}
|
||||
|
||||
/// Returns the next candidate for a connection attempt, if any are available.
|
||||
|
@ -403,8 +403,8 @@ where
|
|||
// Correctness: Spawn address book accesses on a blocking thread, to avoid deadlocks (see #1976).
|
||||
let span = Span::current();
|
||||
let next_peer = tokio::task::spawn_blocking(move || span.in_scope(next_peer))
|
||||
.await
|
||||
.expect("panic in next peer address book task")?;
|
||||
.wait_for_panics()
|
||||
.await?;
|
||||
|
||||
// Security: rate-limit new outbound peer connections
|
||||
sleep_until(self.min_next_handshake).await;
|
||||
|
|
|
@ -23,8 +23,7 @@ use rand::seq::SliceRandom;
|
|||
use tokio::{
|
||||
net::{TcpListener, TcpStream},
|
||||
sync::broadcast,
|
||||
task::JoinError,
|
||||
time::{error::Elapsed, sleep, Instant},
|
||||
time::{sleep, Instant},
|
||||
};
|
||||
use tokio_stream::wrappers::IntervalStream;
|
||||
use tower::{
|
||||
|
@ -33,11 +32,11 @@ use tower::{
|
|||
use tracing::Span;
|
||||
use tracing_futures::Instrument;
|
||||
|
||||
use zebra_chain::chain_tip::ChainTip;
|
||||
use zebra_chain::{chain_tip::ChainTip, diagnostic::task::WaitForPanics};
|
||||
|
||||
use crate::{
|
||||
address_book_updater::AddressBookUpdater,
|
||||
constants::{self, HANDSHAKE_TIMEOUT},
|
||||
constants,
|
||||
meta_addr::{MetaAddr, MetaAddrChange},
|
||||
peer::{
|
||||
self, address_is_valid_for_inbound_listeners, HandshakeRequest, MinimumPeerVersion,
|
||||
|
@ -207,18 +206,8 @@ where
|
|||
|
||||
// Wait for the initial seed peer count
|
||||
let mut active_outbound_connections = initial_peers_join
|
||||
.wait_for_panics()
|
||||
.await
|
||||
.unwrap_or_else(|e @ JoinError { .. }| {
|
||||
if e.is_panic() {
|
||||
panic!("panic in initial peer connections task: {e:?}");
|
||||
} else {
|
||||
info!(
|
||||
"task error during initial peer connections: {e:?},\
|
||||
is Zebra shutting down?"
|
||||
);
|
||||
Err(e.into())
|
||||
}
|
||||
})
|
||||
.expect("unexpected error connecting to initial peers");
|
||||
let active_initial_peer_count = active_outbound_connections.update_count();
|
||||
|
||||
|
@ -354,22 +343,11 @@ where
|
|||
}
|
||||
.in_current_span(),
|
||||
)
|
||||
.wait_for_panics()
|
||||
})
|
||||
.collect();
|
||||
|
||||
while let Some(handshake_result) = handshakes.next().await {
|
||||
let handshake_result = handshake_result.unwrap_or_else(|e @ JoinError { .. }| {
|
||||
if e.is_panic() {
|
||||
panic!("panic in initial peer connection: {e:?}");
|
||||
} else {
|
||||
info!(
|
||||
"task error during initial peer connection: {e:?},\
|
||||
is Zebra shutting down?"
|
||||
);
|
||||
// Fake the address, it doesn't matter because we're shutting down anyway
|
||||
Err((PeerSocketAddr::unspecified(), e.into()))
|
||||
}
|
||||
});
|
||||
match handshake_result {
|
||||
Ok(change) => {
|
||||
handshake_success_total += 1;
|
||||
|
@ -637,36 +615,9 @@ where
|
|||
peerset_tx.clone(),
|
||||
)
|
||||
.await?
|
||||
.map(|res| match res {
|
||||
Ok(()) => (),
|
||||
Err(e @ JoinError { .. }) => {
|
||||
if e.is_panic() {
|
||||
panic!("panic during inbound handshaking: {e:?}");
|
||||
} else {
|
||||
info!(
|
||||
"task error during inbound handshaking: {e:?}, is Zebra shutting down?"
|
||||
)
|
||||
}
|
||||
}
|
||||
});
|
||||
.wait_for_panics();
|
||||
|
||||
let handshake_timeout = tokio::time::timeout(
|
||||
// Only trigger this timeout if the inner handshake timeout fails
|
||||
HANDSHAKE_TIMEOUT + Duration::from_millis(500),
|
||||
handshake_task,
|
||||
)
|
||||
.map(|res| match res {
|
||||
Ok(()) => (),
|
||||
Err(_e @ Elapsed { .. }) => {
|
||||
info!(
|
||||
"timeout in spawned accept_inbound_handshake() task: \
|
||||
inner task should have timed out already"
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
// This timeout helps locate inbound peer connection hangs, see #6763 for details.
|
||||
handshakes.push(Box::pin(handshake_timeout));
|
||||
handshakes.push(handshake_task);
|
||||
|
||||
// Rate-limit inbound connection handshakes.
|
||||
// But sleep longer after a successful connection,
|
||||
|
@ -918,80 +869,64 @@ where
|
|||
|
||||
// Spawn each handshake or crawl into an independent task, so handshakes can make
|
||||
// progress while crawls are running.
|
||||
let handshake_or_crawl_handle = tokio::spawn(async move {
|
||||
// Try to get the next available peer for a handshake.
|
||||
//
|
||||
// candidates.next() has a short timeout, and briefly holds the address
|
||||
// book lock, so it shouldn't hang.
|
||||
//
|
||||
// Hold the lock for as short a time as possible.
|
||||
let candidate = { candidates.lock().await.next().await };
|
||||
let handshake_or_crawl_handle = tokio::spawn(
|
||||
async move {
|
||||
// Try to get the next available peer for a handshake.
|
||||
//
|
||||
// candidates.next() has a short timeout, and briefly holds the address
|
||||
// book lock, so it shouldn't hang.
|
||||
//
|
||||
// Hold the lock for as short a time as possible.
|
||||
let candidate = { candidates.lock().await.next().await };
|
||||
|
||||
if let Some(candidate) = candidate {
|
||||
// we don't need to spawn here, because there's nothing running concurrently
|
||||
dial(
|
||||
candidate,
|
||||
outbound_connector,
|
||||
outbound_connection_tracker,
|
||||
peerset_tx,
|
||||
address_book,
|
||||
demand_tx,
|
||||
)
|
||||
.await?;
|
||||
if let Some(candidate) = candidate {
|
||||
// we don't need to spawn here, because there's nothing running concurrently
|
||||
dial(
|
||||
candidate,
|
||||
outbound_connector,
|
||||
outbound_connection_tracker,
|
||||
peerset_tx,
|
||||
address_book,
|
||||
demand_tx,
|
||||
)
|
||||
.await?;
|
||||
|
||||
Ok(HandshakeFinished)
|
||||
} else {
|
||||
// There weren't any peers, so try to get more peers.
|
||||
debug!("demand for peers but no available candidates");
|
||||
|
||||
crawl(candidates, demand_tx).await?;
|
||||
|
||||
Ok(DemandCrawlFinished)
|
||||
}
|
||||
}.in_current_span())
|
||||
.map(|res| match res {
|
||||
Ok(crawler_action) => crawler_action,
|
||||
Err(e @ JoinError {..}) => {
|
||||
if e.is_panic() {
|
||||
panic!("panic during outbound handshake: {e:?}");
|
||||
Ok(HandshakeFinished)
|
||||
} else {
|
||||
info!("task error during outbound handshake: {e:?}, is Zebra shutting down?")
|
||||
}
|
||||
// Just fake it
|
||||
Ok(HandshakeFinished)
|
||||
}
|
||||
});
|
||||
// There weren't any peers, so try to get more peers.
|
||||
debug!("demand for peers but no available candidates");
|
||||
|
||||
handshakes.push(Box::pin(handshake_or_crawl_handle));
|
||||
crawl(candidates, demand_tx).await?;
|
||||
|
||||
Ok(DemandCrawlFinished)
|
||||
}
|
||||
}
|
||||
.in_current_span(),
|
||||
)
|
||||
.wait_for_panics();
|
||||
|
||||
handshakes.push(handshake_or_crawl_handle);
|
||||
}
|
||||
Ok(TimerCrawl { tick }) => {
|
||||
let candidates = candidates.clone();
|
||||
let demand_tx = demand_tx.clone();
|
||||
|
||||
let crawl_handle = tokio::spawn(async move {
|
||||
debug!(
|
||||
?tick,
|
||||
"crawling for more peers in response to the crawl timer"
|
||||
);
|
||||
let crawl_handle = tokio::spawn(
|
||||
async move {
|
||||
debug!(
|
||||
?tick,
|
||||
"crawling for more peers in response to the crawl timer"
|
||||
);
|
||||
|
||||
crawl(candidates, demand_tx).await?;
|
||||
crawl(candidates, demand_tx).await?;
|
||||
|
||||
Ok(TimerCrawlFinished)
|
||||
}.in_current_span())
|
||||
.map(move |res| match res {
|
||||
Ok(crawler_action) => crawler_action,
|
||||
Err(e @ JoinError {..}) => {
|
||||
if e.is_panic() {
|
||||
panic!("panic during outbound TimerCrawl: {tick:?} {e:?}");
|
||||
} else {
|
||||
info!("task error during outbound TimerCrawl: {e:?}, is Zebra shutting down?")
|
||||
}
|
||||
// Just fake it
|
||||
Ok(TimerCrawlFinished)
|
||||
}
|
||||
});
|
||||
.in_current_span(),
|
||||
)
|
||||
.wait_for_panics();
|
||||
|
||||
handshakes.push(Box::pin(crawl_handle));
|
||||
handshakes.push(crawl_handle);
|
||||
}
|
||||
|
||||
// Completed spawned tasks
|
||||
|
@ -1162,27 +1097,16 @@ async fn report_failed(address_book: Arc<std::sync::Mutex<AddressBook>>, addr: M
|
|||
//
|
||||
// Spawn address book accesses on a blocking thread, to avoid deadlocks (see #1976).
|
||||
let span = Span::current();
|
||||
let task_result = tokio::task::spawn_blocking(move || {
|
||||
let updated_addr = tokio::task::spawn_blocking(move || {
|
||||
span.in_scope(|| address_book.lock().unwrap().update(addr))
|
||||
})
|
||||
.wait_for_panics()
|
||||
.await;
|
||||
|
||||
match task_result {
|
||||
Ok(updated_addr) => assert_eq!(
|
||||
updated_addr.map(|addr| addr.addr()),
|
||||
Some(addr.addr()),
|
||||
"incorrect address updated by address book: \
|
||||
original: {addr:?}, updated: {updated_addr:?}"
|
||||
),
|
||||
Err(e @ JoinError { .. }) => {
|
||||
if e.is_panic() {
|
||||
panic!("panic in peer failure address book update task: {e:?}");
|
||||
} else {
|
||||
info!(
|
||||
"task error during peer failure address book update task: {e:?},\
|
||||
is Zebra shutting down?"
|
||||
)
|
||||
}
|
||||
}
|
||||
}
|
||||
assert_eq!(
|
||||
updated_addr.map(|addr| addr.addr()),
|
||||
Some(addr.addr()),
|
||||
"incorrect address updated by address book: \
|
||||
original: {addr:?}, updated: {updated_addr:?}"
|
||||
);
|
||||
}
|
||||
|
|
|
@ -71,7 +71,7 @@ tracing = "0.1.37"
|
|||
elasticsearch = { version = "8.5.0-alpha.1", default-features = false, features = ["rustls-tls"], optional = true }
|
||||
serde_json = { version = "1.0.100", package = "serde_json", optional = true }
|
||||
|
||||
zebra-chain = { path = "../zebra-chain", version = "1.0.0-beta.27" }
|
||||
zebra-chain = { path = "../zebra-chain", version = "1.0.0-beta.27", features = ["async-error"] }
|
||||
|
||||
# prod feature progress-bar
|
||||
howudoin = { version = "0.1.2", optional = true }
|
||||
|
|
|
@ -32,6 +32,9 @@ pub enum Response {
|
|||
Depth(Option<u32>),
|
||||
|
||||
/// Response to [`Request::Tip`] with the current best chain tip.
|
||||
//
|
||||
// TODO: remove this request, and replace it with a call to
|
||||
// `LatestChainTip::best_tip_height_and_hash()`
|
||||
Tip(Option<(block::Height, block::Hash)>),
|
||||
|
||||
/// Response to [`Request::BlockLocator`] with a block locator object.
|
||||
|
|
|
@ -43,7 +43,7 @@ use tower::buffer::Buffer;
|
|||
|
||||
use zebra_chain::{
|
||||
block::{self, CountedHeader, HeightDiff},
|
||||
diagnostic::CodeTimer,
|
||||
diagnostic::{task::WaitForPanics, CodeTimer},
|
||||
parameters::{Network, NetworkUpgrade},
|
||||
};
|
||||
|
||||
|
@ -1209,8 +1209,7 @@ impl Service<ReadRequest> for ReadStateService {
|
|||
Ok(ReadResponse::Tip(tip))
|
||||
})
|
||||
})
|
||||
.map(|join_result| join_result.expect("panic in ReadRequest::Tip"))
|
||||
.boxed()
|
||||
.wait_for_panics()
|
||||
}
|
||||
|
||||
// Used by the StateService.
|
||||
|
@ -1231,8 +1230,7 @@ impl Service<ReadRequest> for ReadStateService {
|
|||
Ok(ReadResponse::Depth(depth))
|
||||
})
|
||||
})
|
||||
.map(|join_result| join_result.expect("panic in ReadRequest::Depth"))
|
||||
.boxed()
|
||||
.wait_for_panics()
|
||||
}
|
||||
|
||||
// Used by the StateService.
|
||||
|
@ -1255,10 +1253,7 @@ impl Service<ReadRequest> for ReadStateService {
|
|||
Ok(ReadResponse::BestChainNextMedianTimePast(median_time_past?))
|
||||
})
|
||||
})
|
||||
.map(|join_result| {
|
||||
join_result.expect("panic in ReadRequest::BestChainNextMedianTimePast")
|
||||
})
|
||||
.boxed()
|
||||
.wait_for_panics()
|
||||
}
|
||||
|
||||
// Used by the get_block (raw) RPC and the StateService.
|
||||
|
@ -1283,8 +1278,7 @@ impl Service<ReadRequest> for ReadStateService {
|
|||
Ok(ReadResponse::Block(block))
|
||||
})
|
||||
})
|
||||
.map(|join_result| join_result.expect("panic in ReadRequest::Block"))
|
||||
.boxed()
|
||||
.wait_for_panics()
|
||||
}
|
||||
|
||||
// For the get_raw_transaction RPC and the StateService.
|
||||
|
@ -1302,8 +1296,7 @@ impl Service<ReadRequest> for ReadStateService {
|
|||
Ok(ReadResponse::Transaction(response))
|
||||
})
|
||||
})
|
||||
.map(|join_result| join_result.expect("panic in ReadRequest::Transaction"))
|
||||
.boxed()
|
||||
.wait_for_panics()
|
||||
}
|
||||
|
||||
// Used by the getblock (verbose) RPC.
|
||||
|
@ -1332,10 +1325,7 @@ impl Service<ReadRequest> for ReadStateService {
|
|||
Ok(ReadResponse::TransactionIdsForBlock(transaction_ids))
|
||||
})
|
||||
})
|
||||
.map(|join_result| {
|
||||
join_result.expect("panic in ReadRequest::TransactionIdsForBlock")
|
||||
})
|
||||
.boxed()
|
||||
.wait_for_panics()
|
||||
}
|
||||
|
||||
ReadRequest::UnspentBestChainUtxo(outpoint) => {
|
||||
|
@ -1359,8 +1349,7 @@ impl Service<ReadRequest> for ReadStateService {
|
|||
Ok(ReadResponse::UnspentBestChainUtxo(utxo))
|
||||
})
|
||||
})
|
||||
.map(|join_result| join_result.expect("panic in ReadRequest::UnspentBestChainUtxo"))
|
||||
.boxed()
|
||||
.wait_for_panics()
|
||||
}
|
||||
|
||||
// Manually used by the StateService to implement part of AwaitUtxo.
|
||||
|
@ -1381,8 +1370,7 @@ impl Service<ReadRequest> for ReadStateService {
|
|||
Ok(ReadResponse::AnyChainUtxo(utxo))
|
||||
})
|
||||
})
|
||||
.map(|join_result| join_result.expect("panic in ReadRequest::AnyChainUtxo"))
|
||||
.boxed()
|
||||
.wait_for_panics()
|
||||
}
|
||||
|
||||
// Used by the StateService.
|
||||
|
@ -1405,8 +1393,7 @@ impl Service<ReadRequest> for ReadStateService {
|
|||
))
|
||||
})
|
||||
})
|
||||
.map(|join_result| join_result.expect("panic in ReadRequest::BlockLocator"))
|
||||
.boxed()
|
||||
.wait_for_panics()
|
||||
}
|
||||
|
||||
// Used by the StateService.
|
||||
|
@ -1433,8 +1420,7 @@ impl Service<ReadRequest> for ReadStateService {
|
|||
Ok(ReadResponse::BlockHashes(block_hashes))
|
||||
})
|
||||
})
|
||||
.map(|join_result| join_result.expect("panic in ReadRequest::FindBlockHashes"))
|
||||
.boxed()
|
||||
.wait_for_panics()
|
||||
}
|
||||
|
||||
// Used by the StateService.
|
||||
|
@ -1466,8 +1452,7 @@ impl Service<ReadRequest> for ReadStateService {
|
|||
Ok(ReadResponse::BlockHeaders(block_headers))
|
||||
})
|
||||
})
|
||||
.map(|join_result| join_result.expect("panic in ReadRequest::FindBlockHeaders"))
|
||||
.boxed()
|
||||
.wait_for_panics()
|
||||
}
|
||||
|
||||
ReadRequest::SaplingTree(hash_or_height) => {
|
||||
|
@ -1491,8 +1476,7 @@ impl Service<ReadRequest> for ReadStateService {
|
|||
Ok(ReadResponse::SaplingTree(sapling_tree))
|
||||
})
|
||||
})
|
||||
.map(|join_result| join_result.expect("panic in ReadRequest::SaplingTree"))
|
||||
.boxed()
|
||||
.wait_for_panics()
|
||||
}
|
||||
|
||||
ReadRequest::OrchardTree(hash_or_height) => {
|
||||
|
@ -1516,8 +1500,7 @@ impl Service<ReadRequest> for ReadStateService {
|
|||
Ok(ReadResponse::OrchardTree(orchard_tree))
|
||||
})
|
||||
})
|
||||
.map(|join_result| join_result.expect("panic in ReadRequest::OrchardTree"))
|
||||
.boxed()
|
||||
.wait_for_panics()
|
||||
}
|
||||
|
||||
// For the get_address_balance RPC.
|
||||
|
@ -1542,8 +1525,7 @@ impl Service<ReadRequest> for ReadStateService {
|
|||
Ok(ReadResponse::AddressBalance(balance))
|
||||
})
|
||||
})
|
||||
.map(|join_result| join_result.expect("panic in ReadRequest::AddressBalance"))
|
||||
.boxed()
|
||||
.wait_for_panics()
|
||||
}
|
||||
|
||||
// For the get_address_tx_ids RPC.
|
||||
|
@ -1576,10 +1558,7 @@ impl Service<ReadRequest> for ReadStateService {
|
|||
tx_ids.map(ReadResponse::AddressesTransactionIds)
|
||||
})
|
||||
})
|
||||
.map(|join_result| {
|
||||
join_result.expect("panic in ReadRequest::TransactionIdsByAddresses")
|
||||
})
|
||||
.boxed()
|
||||
.wait_for_panics()
|
||||
}
|
||||
|
||||
// For the get_address_utxos RPC.
|
||||
|
@ -1605,8 +1584,7 @@ impl Service<ReadRequest> for ReadStateService {
|
|||
utxos.map(ReadResponse::AddressUtxos)
|
||||
})
|
||||
})
|
||||
.map(|join_result| join_result.expect("panic in ReadRequest::UtxosByAddresses"))
|
||||
.boxed()
|
||||
.wait_for_panics()
|
||||
}
|
||||
|
||||
ReadRequest::CheckBestChainTipNullifiersAndAnchors(unmined_tx) => {
|
||||
|
@ -1639,11 +1617,7 @@ impl Service<ReadRequest> for ReadStateService {
|
|||
Ok(ReadResponse::ValidBestChainTipNullifiersAndAnchors)
|
||||
})
|
||||
})
|
||||
.map(|join_result| {
|
||||
join_result
|
||||
.expect("panic in ReadRequest::CheckBestChainTipNullifiersAndAnchors")
|
||||
})
|
||||
.boxed()
|
||||
.wait_for_panics()
|
||||
}
|
||||
|
||||
// Used by the get_block and get_block_hash RPCs.
|
||||
|
@ -1672,8 +1646,7 @@ impl Service<ReadRequest> for ReadStateService {
|
|||
Ok(ReadResponse::BlockHash(hash))
|
||||
})
|
||||
})
|
||||
.map(|join_result| join_result.expect("panic in ReadRequest::BestChainBlockHash"))
|
||||
.boxed()
|
||||
.wait_for_panics()
|
||||
}
|
||||
|
||||
// Used by get_block_template RPC.
|
||||
|
@ -1712,8 +1685,7 @@ impl Service<ReadRequest> for ReadStateService {
|
|||
get_block_template_info.map(ReadResponse::ChainInfo)
|
||||
})
|
||||
})
|
||||
.map(|join_result| join_result.expect("panic in ReadRequest::ChainInfo"))
|
||||
.boxed()
|
||||
.wait_for_panics()
|
||||
}
|
||||
|
||||
// Used by getmininginfo, getnetworksolps, and getnetworkhashps RPCs.
|
||||
|
@ -1766,8 +1738,7 @@ impl Service<ReadRequest> for ReadStateService {
|
|||
Ok(ReadResponse::SolutionRate(solution_rate))
|
||||
})
|
||||
})
|
||||
.map(|join_result| join_result.expect("panic in ReadRequest::SolutionRate"))
|
||||
.boxed()
|
||||
.wait_for_panics()
|
||||
}
|
||||
|
||||
#[cfg(feature = "getblocktemplate-rpcs")]
|
||||
|
@ -1815,10 +1786,7 @@ impl Service<ReadRequest> for ReadStateService {
|
|||
Ok(ReadResponse::ValidBlockProposal)
|
||||
})
|
||||
})
|
||||
.map(|join_result| {
|
||||
join_result.expect("panic in ReadRequest::CheckBlockProposalValidity")
|
||||
})
|
||||
.boxed()
|
||||
.wait_for_panics()
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -2,7 +2,6 @@
|
|||
|
||||
use std::{
|
||||
cmp::Ordering,
|
||||
panic,
|
||||
sync::{mpsc, Arc},
|
||||
thread::{self, JoinHandle},
|
||||
};
|
||||
|
@ -10,7 +9,11 @@ use std::{
|
|||
use semver::Version;
|
||||
use tracing::Span;
|
||||
|
||||
use zebra_chain::{block::Height, parameters::Network};
|
||||
use zebra_chain::{
|
||||
block::Height,
|
||||
diagnostic::task::{CheckForPanics, WaitForPanics},
|
||||
parameters::Network,
|
||||
};
|
||||
|
||||
use DbFormatChange::*;
|
||||
|
||||
|
@ -482,42 +485,16 @@ impl DbFormatChangeThreadHandle {
|
|||
///
|
||||
/// This method should be called regularly, so that panics are detected as soon as possible.
|
||||
pub fn check_for_panics(&mut self) {
|
||||
let update_task = self.update_task.take();
|
||||
|
||||
if let Some(update_task) = update_task {
|
||||
if update_task.is_finished() {
|
||||
// We use into_inner() because it guarantees that exactly one of the tasks
|
||||
// gets the JoinHandle. try_unwrap() lets us keep the JoinHandle, but it can also
|
||||
// miss panics.
|
||||
if let Some(update_task) = Arc::into_inner(update_task) {
|
||||
// We are the last handle with a reference to this task,
|
||||
// so we can propagate any panics
|
||||
if let Err(thread_panic) = update_task.join() {
|
||||
panic::resume_unwind(thread_panic);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// It hasn't finished, so we need to put it back
|
||||
self.update_task = Some(update_task);
|
||||
}
|
||||
}
|
||||
self.update_task.check_for_panics();
|
||||
}
|
||||
|
||||
/// Wait for the spawned thread to finish. If it exited with a panic, resume that panic.
|
||||
///
|
||||
/// Exits early if the thread has other outstanding handles.
|
||||
///
|
||||
/// This method should be called during shutdown.
|
||||
pub fn wait_for_panics(&mut self) {
|
||||
if let Some(update_task) = self.update_task.take() {
|
||||
// We use into_inner() because it guarantees that exactly one of the tasks
|
||||
// gets the JoinHandle. See the comments in check_for_panics().
|
||||
if let Some(update_task) = Arc::into_inner(update_task) {
|
||||
// We are the last handle with a reference to this task,
|
||||
// so we can propagate any panics
|
||||
if let Err(thread_panic) = update_task.join() {
|
||||
panic::resume_unwind(thread_panic);
|
||||
}
|
||||
}
|
||||
}
|
||||
self.update_task.wait_for_panics();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue