Disables EAH with short epochs (#28803)

This commit is contained in:
Brooks Prumo 2022-11-15 12:26:19 -06:00 committed by GitHub
parent b572422d24
commit d798e751a0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
9 changed files with 107 additions and 40 deletions

1
Cargo.lock generated
View File

@ -6202,6 +6202,7 @@ dependencies = [
"symlink",
"tar",
"tempfile",
"test-case",
"thiserror",
"zstd",
]

View File

@ -139,6 +139,7 @@ impl TestEnvironment {
assert!(bank
.feature_set
.is_active(&feature_set::epoch_accounts_hash::id()));
assert!(epoch_accounts_hash::is_enabled_this_epoch(&bank));
bank.set_startup_verification_complete();

View File

@ -37,7 +37,6 @@ use {
},
solana_sdk::{
clock::Slot,
epoch_schedule::EpochSchedule,
genesis_config::{
ClusterType::{self, Development, Devnet, MainnetBeta, Testnet},
GenesisConfig,
@ -95,8 +94,6 @@ impl SnapshotTestConfig {
&solana_sdk::pubkey::new_rand(), // validator_pubkey
1, // validator_stake_lamports
);
// NOTE: Must set `warmup == false` until EAH can handle short epochs
genesis_config_info.genesis_config.epoch_schedule = EpochSchedule::without_warmup();
genesis_config_info.genesis_config.cluster_type = cluster_type;
let bank0 = Bank::new_with_paths_for_tests(
&genesis_config_info.genesis_config,

View File

@ -179,29 +179,34 @@ well.
An EAH is requested by `BankForks::set_root()`, which happens while setting
*roots*. The EAH is stored into `Bank`s when they are *frozen*. Banks are
frozen 32 slots before they are rooted. For the expected behavior, the EAH
start slot really should be 32 slots before the stop slot. If the number of
slots per epoch is small, this can result in surprising behavior.
frozen *at least* 32 slots before they are rooted. For the expected behavior,
the EAH start slot really should be 32 slots before the stop slot. If the
number of slots per epoch is small, this can result in surprising behavior.
Example 1: Assume there are 64 slots per epoch. The EAH start offset is 16
and the EAH stop offset is 48. The difference is 32. So when Bank 48 is
frozen before Bank 16 is rooted, a new EAH request has not yet been requested;
the EAH from the previous epoch is still valid and will be used by Bank 48.
Example: Assume there are 40 slots per epoch. The EAH start offset is 10, and
the EAH stop offset is 30. When Bank 30 is frozen it will include the EAH in
its hash. However, Bank 10 has not yet been rooted, so a new EAH has not been
calculated for this epoch. This means Bank 30 will have included the EAH *from
the previous epoch* in its hash.
Example 2: Assume there are 66 slots per epoch, then the EAH start offset is
still 16 and the EAH stop offset is now 49. The difference is now 33. When
Bank 49 is frozen, Bank 16 will already have been rooted, and thus sent an EAH
request; Bank 49 will wait for the new EAH calculation to complete.
Later, when Bank 10 is rooted, it will request a new EAH be calculated. If a
snapshot is taken for Bank 12 (or any bank between 10 and 30), it will include
the EAH *from this epoch*. If a node loads the snapshot from Bank 12, once it
gets to freezing Bank 30, it will end up with a different bank hash since it
included the EAH from this epoch (versus the other node's Bank 30 included the
EAH from the previous epoch). Different bank hashes will result in consensus
failures.
Example 3: Assume there are 32 slots per epoch (the minimum allowed). The EAH
start offset is 8, and the EAH stop offset is 24. Similar to Example 1, Bank
24 is frozen around when Bank 24 *of the previous epoch* is rooted. This
ensures that when the EAH is stored, it'll be for the previous epoch.
The above example is clearly bad. It can be observed that short epochs only
occur (1) during warmup, or (2) in tests. Real clusters have much longer
epochs (432,000 slots by default).
In these examples the observed behavior of the EAH is different than when using
the normal 432,000 slots per epoch. The EAH is still valid and correct with a
small number of slots per epoch; it now has a delay of one epoch. Since the
epochs themselves can be much faster, security is not reduced.
Tests can be fixed as needed; that leaves fixing warmup. Since warmup is
transient, we disable EAH until slots-per-epoch is large enough. More
precisely, we disable EAH until the `calculation window` is big enough. During
warmup, slots-per-epoch doubles each epoch until reaching the desired number,
so only a few epochs will skip EAH (which also is a small total number of
slots).
#### Warping

View File

@ -76,6 +76,7 @@ libsecp256k1 = "0.6.0"
rand_chacha = "0.2.2"
solana-logger = { path = "../logger", version = "=1.15.0" }
static_assertions = "1.1.0"
test-case = "2.1.0"
[package.metadata.docs.rs]
targets = ["x86_64-unknown-linux-gnu"]

View File

@ -6748,6 +6748,10 @@ impl Bank {
return false;
}
if !epoch_accounts_hash::is_enabled_this_epoch(self) {
return false;
}
let stop_slot = epoch_accounts_hash::calculation_stop(self);
self.parent_slot() < stop_slot && self.slot() >= stop_slot
}
@ -7722,15 +7726,15 @@ impl Bank {
/// EAH *must* be included. This means if an EAH calculation is currently in-flight we will
/// wait for it to complete.
pub fn get_epoch_accounts_hash_to_serialize(&self) -> Option<EpochAccountsHash> {
let (epoch_accounts_hash, measure) = measure!(
epoch_accounts_hash::is_in_calculation_window(self).then(|| {
self.rc
.accounts
.accounts_db
.epoch_accounts_hash_manager
.wait_get_epoch_accounts_hash()
})
);
let should_get_epoch_accounts_hash = epoch_accounts_hash::is_enabled_this_epoch(self)
&& epoch_accounts_hash::is_in_calculation_window(self);
let (epoch_accounts_hash, measure) = measure!(should_get_epoch_accounts_hash.then(|| {
self.rc
.accounts
.accounts_db
.epoch_accounts_hash_manager
.wait_get_epoch_accounts_hash()
}));
datapoint_info!(
"bank-get_epoch_accounts_hash_to_serialize",

View File

@ -618,6 +618,10 @@ impl BankForks {
return false;
}
if !epoch_accounts_hash::is_enabled_this_epoch(bank) {
return false;
}
let start_slot = epoch_accounts_hash::calculation_start(bank);
bank.slot() > self.last_accounts_hash_slot
&& bank.parent_slot() < start_slot

View File

@ -2,9 +2,36 @@
use {
crate::bank::Bank,
solana_sdk::clock::{Epoch, Slot},
solana_sdk::{
clock::{Epoch, Slot},
vote::state::MAX_LOCKOUT_HISTORY,
},
};
/// Is the EAH enabled this Epoch?
#[must_use]
pub fn is_enabled_this_epoch(bank: &Bank) -> bool {
// The EAH calculation "start" is based on when a bank is *rooted*, and "stop" is based on when a
// bank is *frozen*. Banks are rooted after exceeding the maximum lockout, so there is a delay
// of at least `maximum lockout` number of slots the EAH calculation must take into
// consideration. To ensure an EAH calculation has started by the time that calculation is
// needed, the calculation interval must be at least `maximum lockout` plus some buffer to
// handle when banks are not rooted every single slot.
const MINIMUM_CALCULATION_INTERVAL: u64 =
(MAX_LOCKOUT_HISTORY as u64).saturating_add(CALCULATION_INTERVAL_BUFFER);
// The calculation buffer is a best-attempt at median worst-case for how many bank ancestors can
// accumulate before the bank is rooted.
// [brooks] On Wed Oct 26 12:15:21 2022, over the previous 6 hour period against mainnet-beta,
// I saw multiple validators reporting metrics in the 120s for `total_parent_banks`. The mean
// is 2 to 3, but a number of nodes also reported values in the low 20s. A value of 150 should
// capture the majority of validators, and will not be an issue for clusters running with
// normal slots-per-epoch; this really will only affect tests and epoch schedule warmup.
const CALCULATION_INTERVAL_BUFFER: u64 = 150;
let calculation_interval = calculation_interval(bank);
calculation_interval >= MINIMUM_CALCULATION_INTERVAL
}
/// Calculation of the EAH occurs once per epoch. All nodes in the cluster must agree on which
/// slot the EAH is based on. This slot will be at an offset into the epoch, and referred to as
/// the "start" slot for the EAH calculation.
@ -38,13 +65,19 @@ pub fn calculation_stop(bank: &Bank) -> Slot {
calculation_info(bank).calculation_stop
}
/// Is this bank in the calculation window?
/// Get the number of slots from EAH calculation start to stop; known as the calculation interval
#[must_use]
#[inline]
pub fn calculation_interval(bank: &Bank) -> u64 {
calculation_info(bank).calculation_interval
}
/// Is this bank in the calculation window?
#[must_use]
pub fn is_in_calculation_window(bank: &Bank) -> bool {
let bank_slot = bank.slot();
let info = calculation_info(bank);
bank_slot >= info.calculation_start && bank_slot < info.calculation_stop
let range = info.calculation_start..info.calculation_stop;
range.contains(&bank.slot())
}
/// For the epoch that `bank` is in, get all the EAH calculation information
@ -60,6 +93,7 @@ pub fn calculation_info(bank: &Bank) -> CalculationInfo {
let last_slot_in_epoch = epoch_schedule.get_last_slot_in_epoch(epoch);
let calculation_start = first_slot_in_epoch.saturating_add(calculation_offset_start);
let calculation_stop = first_slot_in_epoch.saturating_add(calculation_offset_stop);
let calculation_interval = calculation_offset_stop.saturating_sub(calculation_offset_start);
CalculationInfo {
epoch,
@ -70,6 +104,7 @@ pub fn calculation_info(bank: &Bank) -> CalculationInfo {
calculation_offset_stop,
calculation_start,
calculation_stop,
calculation_interval,
}
}
@ -103,6 +138,8 @@ pub struct CalculationInfo {
pub calculation_start: Slot,
/// Absolute slot where the EAH calculation stops
pub calculation_stop: Slot,
/// Number of slots from EAH calculation start to stop
pub calculation_interval: u64,
}
#[cfg(test)]
@ -110,8 +147,23 @@ mod tests {
use {
super::*,
solana_sdk::{epoch_schedule::EpochSchedule, genesis_config::GenesisConfig},
test_case::test_case,
};
#[test_case( 32 => false)] // minimum slots per epoch
#[test_case( 361 => false)] // below minimum slots per epoch *for EAH*
#[test_case( 362 => false)] // minimum slots per epoch *for EAH*
#[test_case( 8_192 => true)] // default dev slots per epoch
#[test_case(432_000 => true)] // default slots per epoch
fn test_is_enabled_this_epoch(slots_per_epoch: u64) -> bool {
let genesis_config = GenesisConfig {
epoch_schedule: EpochSchedule::custom(slots_per_epoch, slots_per_epoch, false),
..GenesisConfig::default()
};
let bank = Bank::new_for_tests(&genesis_config);
is_enabled_this_epoch(&bank)
}
#[test]
fn test_calculation_offset_bounds() {
let bank = Bank::default_for_tests();
@ -130,7 +182,7 @@ mod tests {
#[test]
fn test_calculation_info() {
for slots_per_epoch in [32, 100, 65_536, 432_000, 123_456_789] {
for slots_per_epoch in [32, 361, 362, 8_192, 65_536, 432_000, 123_456_789] {
for warmup in [false, true] {
let genesis_config = GenesisConfig {
epoch_schedule: EpochSchedule::custom(slots_per_epoch, slots_per_epoch, warmup),
@ -140,9 +192,10 @@ mod tests {
assert!(info.calculation_offset_start < info.calculation_offset_stop);
assert!(info.calculation_offset_start < info.slots_per_epoch);
assert!(info.calculation_offset_stop < info.slots_per_epoch);
assert!(info.calculation_start < info.calculation_stop,);
assert!(info.calculation_start > info.first_slot_in_epoch,);
assert!(info.calculation_stop < info.last_slot_in_epoch,);
assert!(info.calculation_start < info.calculation_stop);
assert!(info.calculation_start > info.first_slot_in_epoch);
assert!(info.calculation_stop < info.last_slot_in_epoch);
assert!(info.calculation_interval > 0);
}
}
}

View File

@ -223,7 +223,8 @@ fn test_bank_serialize_style(
initial_epoch_accounts_hash: bool,
) {
solana_logger::setup();
let (genesis_config, _) = create_genesis_config(500);
let (mut genesis_config, _) = create_genesis_config(500);
genesis_config.epoch_schedule = EpochSchedule::custom(400, 400, false);
let bank0 = Arc::new(Bank::new_for_tests(&genesis_config));
let eah_start_slot = epoch_accounts_hash::calculation_start(&bank0);
let bank1 = Bank::new_from_parent(&bank0, &Pubkey::default(), 1);