From d7b90552f3c8b9b3264e23755ce14a7a1eebe6fc Mon Sep 17 00:00:00 2001 From: teor Date: Wed, 7 Jun 2023 09:38:48 +1000 Subject: [PATCH] fix(state): Avoid panicking on state errors during shutdown (#6828) * Enable cancel_all_background_work() only on macOS * Ignore expected "during shutdown" errors, and log other errors * Disable cancel_all_background_work() but keep the updated docs and error handling * Add the macOS shutdown crash to the README known issues --------- Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> --- README.md | 2 + .../src/service/finalized_state/disk_db.rs | 90 +++++++++++++++---- 2 files changed, 77 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index 9e3a253a9..98bd9a8af 100644 --- a/README.md +++ b/README.md @@ -155,6 +155,8 @@ There are a few bugs in Zebra that we're still working on fixing: - Block download and verification sometimes times out during Zebra's initial sync [#5709](https://github.com/ZcashFoundation/zebra/issues/5709). The full sync still finishes reasonably quickly. +- Rust 1.70 [causes crashes during shutdown on macOS x86_64 (#6812)](https://github.com/ZcashFoundation/zebra/issues/6812). The state cache should stay valid despite the crash. + - No Windows support [#3801](https://github.com/ZcashFoundation/zebra/issues/3801). We used to test with Windows Server 2019, but not any more; see the issue for details. - Experimental Tor support is disabled until [Zebra upgrades to the latest `arti-client`](https://github.com/ZcashFoundation/zebra/issues/5492). This happened due to a Rust dependency conflict, which could only be resolved by `arti` upgrading to a version of `x25519-dalek` with the dependency fix. diff --git a/zebra-state/src/service/finalized_state/disk_db.rs b/zebra-state/src/service/finalized_state/disk_db.rs index 0432167d1..f896b3aac 100644 --- a/zebra-state/src/service/finalized_state/disk_db.rs +++ b/zebra-state/src/service/finalized_state/disk_db.rs @@ -780,18 +780,72 @@ impl DiskDb { let path = self.path(); debug!(?path, "flushing database to disk"); - self.db - .flush() - .expect("unexpected failure flushing SST data to disk"); - self.db - .flush_wal(true) - .expect("unexpected failure flushing WAL data to disk"); + // These flushes can fail during forced shutdown or during Drop after a shutdown, + // particularly in tests. If they fail, there's nothing we can do about it anyway. + if let Err(error) = self.db.flush() { + let error = format!("{error:?}"); + if error.to_ascii_lowercase().contains("shutdown in progress") { + debug!( + ?error, + ?path, + "expected shutdown error flushing database SST files to disk" + ); + } else { + info!( + ?error, + ?path, + "unexpected error flushing database SST files to disk during shutdown" + ); + } + } + if let Err(error) = self.db.flush_wal(true) { + let error = format!("{error:?}"); + if error.to_ascii_lowercase().contains("shutdown in progress") { + debug!( + ?error, + ?path, + "expected shutdown error flushing database WAL buffer to disk" + ); + } else { + info!( + ?error, + ?path, + "unexpected error flushing database WAL buffer to disk during shutdown" + ); + } + } + + // # Memory Safety + // // We'd like to call `cancel_all_background_work()` before Zebra exits, // but when we call it, we get memory, thread, or C++ errors when the process exits. // (This seems to be a bug in RocksDB: cancel_all_background_work() should wait until // all the threads have cleaned up.) // + // # Change History + // + // We've changed this setting multiple times since 2021, in response to new RocksDB + // and Rust compiler behaviour. + // + // We enabled cancel_all_background_work() due to failures on: + // - Rust 1.57 on Linux + // + // We disabled cancel_all_background_work() due to failures on: + // - Rust 1.64 on Linux + // + // We tried enabling cancel_all_background_work() due to failures on: + // - Rust 1.70 on macOS 12.6.5 on x86_64 + // but it didn't stop the aborts happening (PR #6820). + // + // There weren't any failures with cancel_all_background_work() disabled on: + // - Rust 1.69 or earlier + // - Linux with Rust 1.70 + // And with cancel_all_background_work() enabled or disabled on: + // - macOS 13.2 on aarch64 (M1), native and emulated x86_64, with Rust 1.70 + // + // # Detailed Description + // // We see these kinds of errors: // ``` // pthread lock: Invalid argument @@ -803,13 +857,26 @@ impl DiskDb { // signal: 11, SIGSEGV: invalid memory reference // ``` // + // # Reference + // // The RocksDB wiki says: // > Q: Is it safe to close RocksDB while another thread is issuing read, write or manual compaction requests? // > // > A: No. The users of RocksDB need to make sure all functions have finished before they close RocksDB. // > You can speed up the waiting by calling CancelAllBackgroundWork(). // - // https://github.com/facebook/rocksdb/wiki/RocksDB-FAQ + // + // + // > rocksdb::DB instances need to be destroyed before your main function exits. + // > RocksDB instances usually depend on some internal static variables. + // > Users need to make sure rocksdb::DB instances are destroyed before those static variables. + // + // + // + // # TODO + // + // Try re-enabling this code and fixing the underlying concurrency bug. + // //info!(?path, "stopping background database tasks"); //self.db.cancel_all_background_work(true); @@ -818,14 +885,7 @@ impl DiskDb { // But Rust's ownership rules make that difficult, // so we just flush and delete ephemeral data instead. // - // The RocksDB wiki says: - // > rocksdb::DB instances need to be destroyed before your main function exits. - // > RocksDB instances usually depend on some internal static variables. - // > Users need to make sure rocksdb::DB instances are destroyed before those static variables. - // - // https://github.com/facebook/rocksdb/wiki/Known-Issues - // - // But this implementation doesn't seem to cause any issues, + // This implementation doesn't seem to cause any issues, // and the RocksDB Drop implementation handles any cleanup. self.delete_ephemeral(); }