From d798e751a043063277d6d074d6c5afe6c4f3b6cc Mon Sep 17 00:00:00 2001 From: Brooks Prumo Date: Tue, 15 Nov 2022 12:26:19 -0600 Subject: [PATCH] Disables EAH with short epochs (#28803) --- Cargo.lock | 1 + core/tests/epoch_accounts_hash.rs | 1 + core/tests/snapshots.rs | 3 - docs/src/proposals/epoch_accounts_hash.md | 43 +++++++------- runtime/Cargo.toml | 1 + runtime/src/bank.rs | 22 +++++--- runtime/src/bank_forks.rs | 4 ++ runtime/src/epoch_accounts_hash/utils.rs | 69 ++++++++++++++++++++--- runtime/src/serde_snapshot/tests.rs | 3 +- 9 files changed, 107 insertions(+), 40 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a8e5863d6..b36f1b374 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6202,6 +6202,7 @@ dependencies = [ "symlink", "tar", "tempfile", + "test-case", "thiserror", "zstd", ] diff --git a/core/tests/epoch_accounts_hash.rs b/core/tests/epoch_accounts_hash.rs index 3c1545399..4ecc5885d 100755 --- a/core/tests/epoch_accounts_hash.rs +++ b/core/tests/epoch_accounts_hash.rs @@ -139,6 +139,7 @@ impl TestEnvironment { assert!(bank .feature_set .is_active(&feature_set::epoch_accounts_hash::id())); + assert!(epoch_accounts_hash::is_enabled_this_epoch(&bank)); bank.set_startup_verification_complete(); diff --git a/core/tests/snapshots.rs b/core/tests/snapshots.rs index 3f1a50c5a..69764fdb9 100644 --- a/core/tests/snapshots.rs +++ b/core/tests/snapshots.rs @@ -37,7 +37,6 @@ use { }, solana_sdk::{ clock::Slot, - epoch_schedule::EpochSchedule, genesis_config::{ ClusterType::{self, Development, Devnet, MainnetBeta, Testnet}, GenesisConfig, @@ -95,8 +94,6 @@ impl SnapshotTestConfig { &solana_sdk::pubkey::new_rand(), // validator_pubkey 1, // validator_stake_lamports ); - // NOTE: Must set `warmup == false` until EAH can handle short epochs - genesis_config_info.genesis_config.epoch_schedule = EpochSchedule::without_warmup(); genesis_config_info.genesis_config.cluster_type = cluster_type; let bank0 = Bank::new_with_paths_for_tests( &genesis_config_info.genesis_config, diff --git a/docs/src/proposals/epoch_accounts_hash.md b/docs/src/proposals/epoch_accounts_hash.md index 56817e7c6..0d9dc86b3 100644 --- a/docs/src/proposals/epoch_accounts_hash.md +++ b/docs/src/proposals/epoch_accounts_hash.md @@ -179,29 +179,34 @@ well. An EAH is requested by `BankForks::set_root()`, which happens while setting *roots*. The EAH is stored into `Bank`s when they are *frozen*. Banks are -frozen 32 slots before they are rooted. For the expected behavior, the EAH -start slot really should be 32 slots before the stop slot. If the number of -slots per epoch is small, this can result in surprising behavior. +frozen *at least* 32 slots before they are rooted. For the expected behavior, +the EAH start slot really should be 32 slots before the stop slot. If the +number of slots per epoch is small, this can result in surprising behavior. -Example 1: Assume there are 64 slots per epoch. The EAH start offset is 16 -and the EAH stop offset is 48. The difference is 32. So when Bank 48 is -frozen before Bank 16 is rooted, a new EAH request has not yet been requested; -the EAH from the previous epoch is still valid and will be used by Bank 48. +Example: Assume there are 40 slots per epoch. The EAH start offset is 10, and +the EAH stop offset is 30. When Bank 30 is frozen it will include the EAH in +its hash. However, Bank 10 has not yet been rooted, so a new EAH has not been +calculated for this epoch. This means Bank 30 will have included the EAH *from +the previous epoch* in its hash. -Example 2: Assume there are 66 slots per epoch, then the EAH start offset is -still 16 and the EAH stop offset is now 49. The difference is now 33. When -Bank 49 is frozen, Bank 16 will already have been rooted, and thus sent an EAH -request; Bank 49 will wait for the new EAH calculation to complete. +Later, when Bank 10 is rooted, it will request a new EAH be calculated. If a +snapshot is taken for Bank 12 (or any bank between 10 and 30), it will include +the EAH *from this epoch*. If a node loads the snapshot from Bank 12, once it +gets to freezing Bank 30, it will end up with a different bank hash since it +included the EAH from this epoch (versus the other node's Bank 30 included the +EAH from the previous epoch). Different bank hashes will result in consensus +failures. -Example 3: Assume there are 32 slots per epoch (the minimum allowed). The EAH -start offset is 8, and the EAH stop offset is 24. Similar to Example 1, Bank -24 is frozen around when Bank 24 *of the previous epoch* is rooted. This -ensures that when the EAH is stored, it'll be for the previous epoch. +The above example is clearly bad. It can be observed that short epochs only +occur (1) during warmup, or (2) in tests. Real clusters have much longer +epochs (432,000 slots by default). -In these examples the observed behavior of the EAH is different than when using -the normal 432,000 slots per epoch. The EAH is still valid and correct with a -small number of slots per epoch; it now has a delay of one epoch. Since the -epochs themselves can be much faster, security is not reduced. +Tests can be fixed as needed; that leaves fixing warmup. Since warmup is +transient, we disable EAH until slots-per-epoch is large enough. More +precisely, we disable EAH until the `calculation window` is big enough. During +warmup, slots-per-epoch doubles each epoch until reaching the desired number, +so only a few epochs will skip EAH (which also is a small total number of +slots). #### Warping diff --git a/runtime/Cargo.toml b/runtime/Cargo.toml index 54a8cc34c..c75e0ec37 100644 --- a/runtime/Cargo.toml +++ b/runtime/Cargo.toml @@ -76,6 +76,7 @@ libsecp256k1 = "0.6.0" rand_chacha = "0.2.2" solana-logger = { path = "../logger", version = "=1.15.0" } static_assertions = "1.1.0" +test-case = "2.1.0" [package.metadata.docs.rs] targets = ["x86_64-unknown-linux-gnu"] diff --git a/runtime/src/bank.rs b/runtime/src/bank.rs index aebce8b90..1c38e965e 100644 --- a/runtime/src/bank.rs +++ b/runtime/src/bank.rs @@ -6748,6 +6748,10 @@ impl Bank { return false; } + if !epoch_accounts_hash::is_enabled_this_epoch(self) { + return false; + } + let stop_slot = epoch_accounts_hash::calculation_stop(self); self.parent_slot() < stop_slot && self.slot() >= stop_slot } @@ -7722,15 +7726,15 @@ impl Bank { /// EAH *must* be included. This means if an EAH calculation is currently in-flight we will /// wait for it to complete. pub fn get_epoch_accounts_hash_to_serialize(&self) -> Option { - let (epoch_accounts_hash, measure) = measure!( - epoch_accounts_hash::is_in_calculation_window(self).then(|| { - self.rc - .accounts - .accounts_db - .epoch_accounts_hash_manager - .wait_get_epoch_accounts_hash() - }) - ); + let should_get_epoch_accounts_hash = epoch_accounts_hash::is_enabled_this_epoch(self) + && epoch_accounts_hash::is_in_calculation_window(self); + let (epoch_accounts_hash, measure) = measure!(should_get_epoch_accounts_hash.then(|| { + self.rc + .accounts + .accounts_db + .epoch_accounts_hash_manager + .wait_get_epoch_accounts_hash() + })); datapoint_info!( "bank-get_epoch_accounts_hash_to_serialize", diff --git a/runtime/src/bank_forks.rs b/runtime/src/bank_forks.rs index bf1c4d609..4ed60cc11 100644 --- a/runtime/src/bank_forks.rs +++ b/runtime/src/bank_forks.rs @@ -618,6 +618,10 @@ impl BankForks { return false; } + if !epoch_accounts_hash::is_enabled_this_epoch(bank) { + return false; + } + let start_slot = epoch_accounts_hash::calculation_start(bank); bank.slot() > self.last_accounts_hash_slot && bank.parent_slot() < start_slot diff --git a/runtime/src/epoch_accounts_hash/utils.rs b/runtime/src/epoch_accounts_hash/utils.rs index b7b2d508e..40e307615 100644 --- a/runtime/src/epoch_accounts_hash/utils.rs +++ b/runtime/src/epoch_accounts_hash/utils.rs @@ -2,9 +2,36 @@ use { crate::bank::Bank, - solana_sdk::clock::{Epoch, Slot}, + solana_sdk::{ + clock::{Epoch, Slot}, + vote::state::MAX_LOCKOUT_HISTORY, + }, }; +/// Is the EAH enabled this Epoch? +#[must_use] +pub fn is_enabled_this_epoch(bank: &Bank) -> bool { + // The EAH calculation "start" is based on when a bank is *rooted*, and "stop" is based on when a + // bank is *frozen*. Banks are rooted after exceeding the maximum lockout, so there is a delay + // of at least `maximum lockout` number of slots the EAH calculation must take into + // consideration. To ensure an EAH calculation has started by the time that calculation is + // needed, the calculation interval must be at least `maximum lockout` plus some buffer to + // handle when banks are not rooted every single slot. + const MINIMUM_CALCULATION_INTERVAL: u64 = + (MAX_LOCKOUT_HISTORY as u64).saturating_add(CALCULATION_INTERVAL_BUFFER); + // The calculation buffer is a best-attempt at median worst-case for how many bank ancestors can + // accumulate before the bank is rooted. + // [brooks] On Wed Oct 26 12:15:21 2022, over the previous 6 hour period against mainnet-beta, + // I saw multiple validators reporting metrics in the 120s for `total_parent_banks`. The mean + // is 2 to 3, but a number of nodes also reported values in the low 20s. A value of 150 should + // capture the majority of validators, and will not be an issue for clusters running with + // normal slots-per-epoch; this really will only affect tests and epoch schedule warmup. + const CALCULATION_INTERVAL_BUFFER: u64 = 150; + + let calculation_interval = calculation_interval(bank); + calculation_interval >= MINIMUM_CALCULATION_INTERVAL +} + /// Calculation of the EAH occurs once per epoch. All nodes in the cluster must agree on which /// slot the EAH is based on. This slot will be at an offset into the epoch, and referred to as /// the "start" slot for the EAH calculation. @@ -38,13 +65,19 @@ pub fn calculation_stop(bank: &Bank) -> Slot { calculation_info(bank).calculation_stop } -/// Is this bank in the calculation window? +/// Get the number of slots from EAH calculation start to stop; known as the calculation interval #[must_use] #[inline] +pub fn calculation_interval(bank: &Bank) -> u64 { + calculation_info(bank).calculation_interval +} + +/// Is this bank in the calculation window? +#[must_use] pub fn is_in_calculation_window(bank: &Bank) -> bool { - let bank_slot = bank.slot(); let info = calculation_info(bank); - bank_slot >= info.calculation_start && bank_slot < info.calculation_stop + let range = info.calculation_start..info.calculation_stop; + range.contains(&bank.slot()) } /// For the epoch that `bank` is in, get all the EAH calculation information @@ -60,6 +93,7 @@ pub fn calculation_info(bank: &Bank) -> CalculationInfo { let last_slot_in_epoch = epoch_schedule.get_last_slot_in_epoch(epoch); let calculation_start = first_slot_in_epoch.saturating_add(calculation_offset_start); let calculation_stop = first_slot_in_epoch.saturating_add(calculation_offset_stop); + let calculation_interval = calculation_offset_stop.saturating_sub(calculation_offset_start); CalculationInfo { epoch, @@ -70,6 +104,7 @@ pub fn calculation_info(bank: &Bank) -> CalculationInfo { calculation_offset_stop, calculation_start, calculation_stop, + calculation_interval, } } @@ -103,6 +138,8 @@ pub struct CalculationInfo { pub calculation_start: Slot, /// Absolute slot where the EAH calculation stops pub calculation_stop: Slot, + /// Number of slots from EAH calculation start to stop + pub calculation_interval: u64, } #[cfg(test)] @@ -110,8 +147,23 @@ mod tests { use { super::*, solana_sdk::{epoch_schedule::EpochSchedule, genesis_config::GenesisConfig}, + test_case::test_case, }; + #[test_case( 32 => false)] // minimum slots per epoch + #[test_case( 361 => false)] // below minimum slots per epoch *for EAH* + #[test_case( 362 => false)] // minimum slots per epoch *for EAH* + #[test_case( 8_192 => true)] // default dev slots per epoch + #[test_case(432_000 => true)] // default slots per epoch + fn test_is_enabled_this_epoch(slots_per_epoch: u64) -> bool { + let genesis_config = GenesisConfig { + epoch_schedule: EpochSchedule::custom(slots_per_epoch, slots_per_epoch, false), + ..GenesisConfig::default() + }; + let bank = Bank::new_for_tests(&genesis_config); + is_enabled_this_epoch(&bank) + } + #[test] fn test_calculation_offset_bounds() { let bank = Bank::default_for_tests(); @@ -130,7 +182,7 @@ mod tests { #[test] fn test_calculation_info() { - for slots_per_epoch in [32, 100, 65_536, 432_000, 123_456_789] { + for slots_per_epoch in [32, 361, 362, 8_192, 65_536, 432_000, 123_456_789] { for warmup in [false, true] { let genesis_config = GenesisConfig { epoch_schedule: EpochSchedule::custom(slots_per_epoch, slots_per_epoch, warmup), @@ -140,9 +192,10 @@ mod tests { assert!(info.calculation_offset_start < info.calculation_offset_stop); assert!(info.calculation_offset_start < info.slots_per_epoch); assert!(info.calculation_offset_stop < info.slots_per_epoch); - assert!(info.calculation_start < info.calculation_stop,); - assert!(info.calculation_start > info.first_slot_in_epoch,); - assert!(info.calculation_stop < info.last_slot_in_epoch,); + assert!(info.calculation_start < info.calculation_stop); + assert!(info.calculation_start > info.first_slot_in_epoch); + assert!(info.calculation_stop < info.last_slot_in_epoch); + assert!(info.calculation_interval > 0); } } } diff --git a/runtime/src/serde_snapshot/tests.rs b/runtime/src/serde_snapshot/tests.rs index 4063af0c3..a4b9d140f 100644 --- a/runtime/src/serde_snapshot/tests.rs +++ b/runtime/src/serde_snapshot/tests.rs @@ -223,7 +223,8 @@ fn test_bank_serialize_style( initial_epoch_accounts_hash: bool, ) { solana_logger::setup(); - let (genesis_config, _) = create_genesis_config(500); + let (mut genesis_config, _) = create_genesis_config(500); + genesis_config.epoch_schedule = EpochSchedule::custom(400, 400, false); let bank0 = Arc::new(Bank::new_for_tests(&genesis_config)); let eah_start_slot = epoch_accounts_hash::calculation_start(&bank0); let bank1 = Bank::new_from_parent(&bank0, &Pubkey::default(), 1);