From 355f1233f5421ae860ffbffaa48129d6274ffdc4 Mon Sep 17 00:00:00 2001 From: teor Date: Wed, 7 Jun 2023 07:18:57 +1000 Subject: [PATCH] change(db): Make the first stable release forward-compatible with planned state changes (#6813) * Implement minor and patch database format versions * Log and update database format versions when opening database * Refactor the current list of column families into a constant * Open all available column families, including from future Zebra versions * Refactor note commitment tree lookups to go through the height methods * Make Sapling/Orchard note commitment tree lookup forwards compatible * Ignore errors reading column family lists from disk * Update format version comments and TODOs * Correctly log newly created database formats --------- Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> --- Cargo.lock | 1 + zebra-state/Cargo.toml | 1 + zebra-state/src/config.rs | 98 ++++++++++- zebra-state/src/constants.rs | 47 +++++- .../src/service/finalized_state/disk_db.rs | 156 ++++++++++++------ .../finalized_state/zebra_db/shielded.rs | 63 +++++-- 6 files changed, 292 insertions(+), 74 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3bc9c5fee..072c69cff 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -5962,6 +5962,7 @@ dependencies = [ "regex", "rlimit", "rocksdb", + "semver 1.0.17", "serde", "serde_json", "spandoc", diff --git a/zebra-state/Cargo.toml b/zebra-state/Cargo.toml index fe4e18add..8caba9ecb 100644 --- a/zebra-state/Cargo.toml +++ b/zebra-state/Cargo.toml @@ -46,6 +46,7 @@ mset = "0.1.1" regex = "1.8.4" rlimit = "0.9.1" rocksdb = { version = "0.21.0", default_features = false, features = ["lz4"] } +semver = "1.0.17" serde = { version = "1.0.163", features = ["serde_derive"] } tempfile = "3.5.0" thiserror = "1.0.40" diff --git a/zebra-state/src/config.rs b/zebra-state/src/config.rs index 0475aae6b..3d09aecdc 100644 --- a/zebra-state/src/config.rs +++ b/zebra-state/src/config.rs @@ -1,16 +1,26 @@ //! Cached state configuration for Zebra. use std::{ - fs::{canonicalize, remove_dir_all, DirEntry, ReadDir}, + fs::{self, canonicalize, remove_dir_all, DirEntry, ReadDir}, + io::ErrorKind, path::{Path, PathBuf}, }; +use semver::Version; use serde::{Deserialize, Serialize}; use tokio::task::{spawn_blocking, JoinHandle}; use tracing::Span; use zebra_chain::parameters::Network; +use crate::{ + constants::{ + DATABASE_FORMAT_MINOR_VERSION, DATABASE_FORMAT_PATCH_VERSION, DATABASE_FORMAT_VERSION, + DATABASE_FORMAT_VERSION_FILE_NAME, + }, + BoxError, +}; + /// Configuration for the state service. #[derive(Clone, Debug, Deserialize, Serialize)] #[serde(deny_unknown_fields, default)] @@ -125,6 +135,15 @@ impl Config { } } + /// Returns the path of the database format version file. + pub fn version_file_path(&self, network: Network) -> PathBuf { + let mut version_path = self.db_path(network); + + version_path.push(DATABASE_FORMAT_VERSION_FILE_NAME); + + version_path + } + /// Construct a config for an ephemeral database pub fn ephemeral() -> Config { Config { @@ -267,8 +286,83 @@ fn parse_dir_name(entry: &DirEntry) -> Option { /// Parse the state version number from `dir_name`. /// /// Returns `None` if parsing fails, or the directory name is not in the expected format. -fn parse_version_number(dir_name: &str) -> Option { +fn parse_version_number(dir_name: &str) -> Option { dir_name .strip_prefix('v') .and_then(|version| version.parse().ok()) } + +/// Returns the full semantic version of the currently running database format code. +/// +/// This is the version implemented by the Zebra code that's currently running, +/// the minor and patch versions on disk can be different. +pub fn database_format_version_in_code() -> Version { + Version::new( + DATABASE_FORMAT_VERSION, + DATABASE_FORMAT_MINOR_VERSION, + DATABASE_FORMAT_PATCH_VERSION, + ) +} + +/// Returns the full semantic version of the on-disk database. +/// If there is no existing on-disk database, returns `Ok(None)`. +/// +/// This is the format of the data on disk, the minor and patch versions +/// implemented by the running Zebra code can be different. +pub fn database_format_version_on_disk( + config: &Config, + network: Network, +) -> Result, BoxError> { + let version_path = config.version_file_path(network); + + let version = match fs::read_to_string(version_path) { + Ok(version) => version, + Err(e) if e.kind() == ErrorKind::NotFound => { + // If the version file doesn't exist, don't guess the version. + // (It will end up being the version in code, once the database is created.) + return Ok(None); + } + Err(e) => Err(e)?, + }; + + let (minor, patch) = version + .split_once('.') + .ok_or("invalid database format version file")?; + + Ok(Some(Version::new( + DATABASE_FORMAT_VERSION, + minor.parse()?, + patch.parse()?, + ))) +} + +/// Writes the currently running semantic database version to the on-disk database. +/// +/// # Correctness +/// +/// This should only be called after all running format upgrades are complete. +/// +/// # Concurrency +/// +/// This must only be called while RocksDB has an open database for `config`. +/// Otherwise, multiple Zebra processes could write the version at the same time, +/// corrupting the file. +pub fn write_database_format_version_to_disk( + config: &Config, + network: Network, +) -> Result<(), BoxError> { + let version_path = config.version_file_path(network); + + // The major version is already in the directory path. + let version = format!( + "{}.{}", + DATABASE_FORMAT_MINOR_VERSION, DATABASE_FORMAT_PATCH_VERSION + ); + + // # Concurrency + // + // The caller handles locking for this file write. + fs::write(version_path, version.as_bytes())?; + + Ok(()) +} diff --git a/zebra-state/src/constants.rs b/zebra-state/src/constants.rs index 85ae1e77d..011b4115e 100644 --- a/zebra-state/src/constants.rs +++ b/zebra-state/src/constants.rs @@ -1,4 +1,11 @@ -//! Definitions of constants. +//! Constants that impact state behaviour. + +use lazy_static::lazy_static; +use regex::Regex; + +// For doc comment links +#[allow(unused_imports)] +use crate::config::{database_format_version_in_code, database_format_version_on_disk}; pub use zebra_chain::transparent::MIN_TRANSPARENT_COINBASE_MATURITY; @@ -19,13 +26,42 @@ pub use zebra_chain::transparent::MIN_TRANSPARENT_COINBASE_MATURITY; // TODO: change to HeightDiff pub const MAX_BLOCK_REORG_HEIGHT: u32 = MIN_TRANSPARENT_COINBASE_MATURITY - 1; -/// The database format version, incremented each time the database format changes. -pub const DATABASE_FORMAT_VERSION: u32 = 25; +/// The database format major version, incremented each time the on-disk database format has a +/// breaking data format change. +/// +/// Breaking changes include: +/// - deleting a column family, or +/// - changing a column family's data format in an incompatible way. +/// +/// Breaking changes become minor version changes if: +/// - we previously added compatibility code, and +/// - it's available in all supported Zebra versions. +/// +/// Use [`database_format_version_in_code()`] or [`database_format_version_on_disk()`] +/// to get the full semantic format version. +pub const DATABASE_FORMAT_VERSION: u64 = 25; + +/// The database format minor version, incremented each time the on-disk database format has a +/// significant data format change. +/// +/// Significant changes include: +/// - adding new column families, +/// - changing the format of a column family in a compatible way, or +/// - breaking changes with compatibility code in all supported Zebra versions. +pub const DATABASE_FORMAT_MINOR_VERSION: u64 = 0; + +/// The database format patch version, incremented each time the on-disk database format has a +/// significant format compatibility fix. +pub const DATABASE_FORMAT_PATCH_VERSION: u64 = 1; + +/// The name of the file containing the minor and patch database versions. +pub const DATABASE_FORMAT_VERSION_FILE_NAME: &str = "version"; /// The maximum number of blocks to check for NU5 transactions, /// before we assume we are on a pre-NU5 legacy chain. /// -/// Zebra usually only has to check back a few blocks, but on testnet it can be a long time between v5 transactions. +/// Zebra usually only has to check back a few blocks on mainnet, but on testnet it can be a long +/// time between v5 transactions. pub const MAX_LEGACY_CHAIN_BLOCKS: usize = 100_000; /// The maximum number of non-finalized chain forks Zebra will track. @@ -58,9 +94,6 @@ const MAX_FIND_BLOCK_HEADERS_RESULTS_FOR_PROTOCOL: u32 = 160; pub const MAX_FIND_BLOCK_HEADERS_RESULTS_FOR_ZEBRA: u32 = MAX_FIND_BLOCK_HEADERS_RESULTS_FOR_PROTOCOL - 2; -use lazy_static::lazy_static; -use regex::Regex; - lazy_static! { /// Regex that matches the RocksDB error when its lock file is already open. pub static ref LOCK_FILE_ERROR: Regex = Regex::new("(lock file).*(temporarily unavailable)|(in use)|(being used by another process)").expect("regex is valid"); diff --git a/zebra-state/src/service/finalized_state/disk_db.rs b/zebra-state/src/service/finalized_state/disk_db.rs index 3c732acc4..0432167d1 100644 --- a/zebra-state/src/service/finalized_state/disk_db.rs +++ b/zebra-state/src/service/finalized_state/disk_db.rs @@ -10,13 +10,18 @@ //! The [`crate::constants::DATABASE_FORMAT_VERSION`] constant must //! be incremented each time the database format (column, serialization, etc) changes. -use std::{fmt::Debug, path::Path, sync::Arc}; +use std::{cmp::Ordering, fmt::Debug, path::Path, sync::Arc}; +use itertools::Itertools; use rlimit::increase_nofile_limit; use zebra_chain::parameters::Network; use crate::{ + config::{ + database_format_version_in_code, database_format_version_on_disk, + write_database_format_version_to_disk, + }, service::finalized_state::disk_format::{FromDisk, IntoDisk}, Config, }; @@ -386,61 +391,93 @@ impl DiskDb { /// const MEMTABLE_RAM_CACHE_MEGABYTES: usize = 128; + /// The column families supported by the running database code. + const COLUMN_FAMILIES_IN_CODE: &[&'static str] = &[ + // Blocks + "hash_by_height", + "height_by_hash", + "block_header_by_height", + // Transactions + "tx_by_loc", + "hash_by_tx_loc", + "tx_loc_by_hash", + // Transparent + "balance_by_transparent_addr", + "tx_loc_by_transparent_addr_loc", + "utxo_by_out_loc", + "utxo_loc_by_transparent_addr_loc", + // Sprout + "sprout_nullifiers", + "sprout_anchors", + "sprout_note_commitment_tree", + // Sapling + "sapling_nullifiers", + "sapling_anchors", + "sapling_note_commitment_tree", + // Orchard + "orchard_nullifiers", + "orchard_anchors", + "orchard_note_commitment_tree", + // Chain + "history_tree", + "tip_chain_value_pool", + ]; + /// Opens or creates the database at `config.path` for `network`, /// and returns a shared low-level database wrapper. pub fn new(config: &Config, network: Network) -> DiskDb { let path = config.db_path(network); + + let running_version = database_format_version_in_code(); + let disk_version = database_format_version_on_disk(config, network) + .expect("unable to read database format version file"); + + match disk_version.as_ref().map(|disk| disk.cmp(&running_version)) { + // TODO: if the on-disk format is older, actually run the upgrade task after the + // database has been opened (#6642) + Some(Ordering::Less) => info!( + ?running_version, + ?disk_version, + "trying to open older database format: launching upgrade task" + ), + // TODO: if the on-disk format is newer, downgrade the version after the + // database has been opened (#6642) + Some(Ordering::Greater) => info!( + ?running_version, + ?disk_version, + "trying to open newer database format: data should be compatible" + ), + Some(Ordering::Equal) => info!( + ?running_version, + "trying to open compatible database format" + ), + None => info!( + ?running_version, + "creating new database with the current format" + ), + } + let db_options = DiskDb::options(); - let column_families = vec![ - // Blocks - rocksdb::ColumnFamilyDescriptor::new("hash_by_height", db_options.clone()), - rocksdb::ColumnFamilyDescriptor::new("height_by_hash", db_options.clone()), - rocksdb::ColumnFamilyDescriptor::new("block_header_by_height", db_options.clone()), - // Transactions - rocksdb::ColumnFamilyDescriptor::new("tx_by_loc", db_options.clone()), - rocksdb::ColumnFamilyDescriptor::new("hash_by_tx_loc", db_options.clone()), - rocksdb::ColumnFamilyDescriptor::new("tx_loc_by_hash", db_options.clone()), - // Transparent - rocksdb::ColumnFamilyDescriptor::new("balance_by_transparent_addr", db_options.clone()), - rocksdb::ColumnFamilyDescriptor::new( - "tx_loc_by_transparent_addr_loc", - db_options.clone(), - ), - rocksdb::ColumnFamilyDescriptor::new("utxo_by_out_loc", db_options.clone()), - rocksdb::ColumnFamilyDescriptor::new( - "utxo_loc_by_transparent_addr_loc", - db_options.clone(), - ), - // Sprout - rocksdb::ColumnFamilyDescriptor::new("sprout_nullifiers", db_options.clone()), - rocksdb::ColumnFamilyDescriptor::new("sprout_anchors", db_options.clone()), - rocksdb::ColumnFamilyDescriptor::new("sprout_note_commitment_tree", db_options.clone()), - // Sapling - rocksdb::ColumnFamilyDescriptor::new("sapling_nullifiers", db_options.clone()), - rocksdb::ColumnFamilyDescriptor::new("sapling_anchors", db_options.clone()), - rocksdb::ColumnFamilyDescriptor::new( - "sapling_note_commitment_tree", - db_options.clone(), - ), - // Orchard - rocksdb::ColumnFamilyDescriptor::new("orchard_nullifiers", db_options.clone()), - rocksdb::ColumnFamilyDescriptor::new("orchard_anchors", db_options.clone()), - rocksdb::ColumnFamilyDescriptor::new( - "orchard_note_commitment_tree", - db_options.clone(), - ), - // Chain - rocksdb::ColumnFamilyDescriptor::new("history_tree", db_options.clone()), - rocksdb::ColumnFamilyDescriptor::new("tip_chain_value_pool", db_options.clone()), - ]; + // When opening the database in read/write mode, all column families must be opened. + // + // To make Zebra forward-compatible with databases updated by later versions, + // we read any existing column families off the disk, then add any new column families + // from the current implementation. + // + // + let column_families_on_disk = DB::list_cf(&db_options, &path).unwrap_or_default(); + let column_families_in_code = Self::COLUMN_FAMILIES_IN_CODE + .iter() + .map(ToString::to_string); - // TODO: move opening the database to a blocking thread (#2188) - let db_result = rocksdb::DBWithThreadMode::::open_cf_descriptors( - &db_options, - &path, - column_families, - ); + let column_families = column_families_on_disk + .into_iter() + .chain(column_families_in_code) + .unique() + .map(|cf_name| rocksdb::ColumnFamilyDescriptor::new(cf_name, db_options.clone())); + + let db_result = DB::open_cf_descriptors(&db_options, &path, column_families); match db_result { Ok(db) => { @@ -453,6 +490,27 @@ impl DiskDb { db.assert_default_cf_is_empty(); + // Now we've checked that the database format is up-to-date, + // mark it as updated on disk. + // + // # Concurrency + // + // The version must only be updated while RocksDB is holding the database + // directory lock. This prevents multiple Zebra instances corrupting the version + // file. + // + // # TODO + // + // - only update the version at the end of the format upgrade task (#6642) + // - add a note to the format upgrade task code to update the version constants + // whenever the format changes + // - add a test that the format upgrade runs exactly once when: + // 1. if an older cached state format is opened, the format is upgraded, + // then if Zebra is launched again the format is not upgraded + // 2. if the current cached state format is opened, the format is not upgraded + write_database_format_version_to_disk(config, network) + .expect("unable to write database format version file to disk"); + db } diff --git a/zebra-state/src/service/finalized_state/zebra_db/shielded.rs b/zebra-state/src/service/finalized_state/zebra_db/shielded.rs index 1e6b2fdf6..83a4d36f6 100644 --- a/zebra-state/src/service/finalized_state/zebra_db/shielded.rs +++ b/zebra-state/src/service/finalized_state/zebra_db/shielded.rs @@ -107,24 +107,41 @@ impl ZebraDb { None => return Default::default(), }; - let sapling_nct_handle = self.db.cf_handle("sapling_note_commitment_tree").unwrap(); - - self.db - .zs_get(&sapling_nct_handle, &height) - .map(Arc::new) + self.sapling_note_commitment_tree_by_height(&height) .expect("Sapling note commitment tree must exist if there is a finalized tip") } - /// Returns the Sapling note commitment tree matching the given block height. - #[allow(dead_code)] + /// Returns the Sapling note commitment tree matching the given block height, + /// or `None` if the height is above the finalized tip. #[allow(clippy::unwrap_in_result)] pub fn sapling_note_commitment_tree_by_height( &self, height: &Height, ) -> Option> { + let tip_height = self.finalized_tip_height()?; + + // If we're above the tip, searching backwards would always return the tip tree. + // But the correct answer is "we don't know that tree yet". + if *height > tip_height { + return None; + } + let sapling_trees = self.db.cf_handle("sapling_note_commitment_tree").unwrap(); - self.db.zs_get(&sapling_trees, height).map(Arc::new) + // If we know there must be a tree, search backwards for it. + // + // # Compatibility + // + // Allow older Zebra versions to read future database formats, after note commitment trees + // have been deduplicated. See ticket #6642 for details. + let (_first_duplicate_height, tree) = self + .db + .zs_prev_key_value_back_from(&sapling_trees, height) + .expect( + "Sapling note commitment trees must exist for all heights below the finalized tip", + ); + + Some(Arc::new(tree)) } /// Returns the Orchard note commitment tree of the finalized tip @@ -135,24 +152,38 @@ impl ZebraDb { None => return Default::default(), }; - let orchard_nct_handle = self.db.cf_handle("orchard_note_commitment_tree").unwrap(); - - self.db - .zs_get(&orchard_nct_handle, &height) - .map(Arc::new) + self.orchard_note_commitment_tree_by_height(&height) .expect("Orchard note commitment tree must exist if there is a finalized tip") } - /// Returns the Orchard note commitment tree matching the given block height. - #[allow(dead_code)] + /// Returns the Orchard note commitment tree matching the given block height, + /// or `None` if the height is above the finalized tip. #[allow(clippy::unwrap_in_result)] pub fn orchard_note_commitment_tree_by_height( &self, height: &Height, ) -> Option> { + let tip_height = self.finalized_tip_height()?; + + // If we're above the tip, searching backwards would always return the tip tree. + // But the correct answer is "we don't know that tree yet". + if *height > tip_height { + return None; + } + let orchard_trees = self.db.cf_handle("orchard_note_commitment_tree").unwrap(); - self.db.zs_get(&orchard_trees, height).map(Arc::new) + // # Compatibility + // + // Allow older Zebra versions to read future database formats. See ticket #6642 for details. + let (_first_duplicate_height, tree) = self + .db + .zs_prev_key_value_back_from(&orchard_trees, height) + .expect( + "Orchard note commitment trees must exist for all heights below the finalized tip", + ); + + Some(Arc::new(tree)) } /// Returns the shielded note commitment trees of the finalized tip