Disables EAH with short epochs (#28803)
This commit is contained in:
parent
b572422d24
commit
d798e751a0
|
@ -6202,6 +6202,7 @@ dependencies = [
|
|||
"symlink",
|
||||
"tar",
|
||||
"tempfile",
|
||||
"test-case",
|
||||
"thiserror",
|
||||
"zstd",
|
||||
]
|
||||
|
|
|
@ -139,6 +139,7 @@ impl TestEnvironment {
|
|||
assert!(bank
|
||||
.feature_set
|
||||
.is_active(&feature_set::epoch_accounts_hash::id()));
|
||||
assert!(epoch_accounts_hash::is_enabled_this_epoch(&bank));
|
||||
|
||||
bank.set_startup_verification_complete();
|
||||
|
||||
|
|
|
@ -37,7 +37,6 @@ use {
|
|||
},
|
||||
solana_sdk::{
|
||||
clock::Slot,
|
||||
epoch_schedule::EpochSchedule,
|
||||
genesis_config::{
|
||||
ClusterType::{self, Development, Devnet, MainnetBeta, Testnet},
|
||||
GenesisConfig,
|
||||
|
@ -95,8 +94,6 @@ impl SnapshotTestConfig {
|
|||
&solana_sdk::pubkey::new_rand(), // validator_pubkey
|
||||
1, // validator_stake_lamports
|
||||
);
|
||||
// NOTE: Must set `warmup == false` until EAH can handle short epochs
|
||||
genesis_config_info.genesis_config.epoch_schedule = EpochSchedule::without_warmup();
|
||||
genesis_config_info.genesis_config.cluster_type = cluster_type;
|
||||
let bank0 = Bank::new_with_paths_for_tests(
|
||||
&genesis_config_info.genesis_config,
|
||||
|
|
|
@ -179,29 +179,34 @@ well.
|
|||
|
||||
An EAH is requested by `BankForks::set_root()`, which happens while setting
|
||||
*roots*. The EAH is stored into `Bank`s when they are *frozen*. Banks are
|
||||
frozen 32 slots before they are rooted. For the expected behavior, the EAH
|
||||
start slot really should be 32 slots before the stop slot. If the number of
|
||||
slots per epoch is small, this can result in surprising behavior.
|
||||
frozen *at least* 32 slots before they are rooted. For the expected behavior,
|
||||
the EAH start slot really should be 32 slots before the stop slot. If the
|
||||
number of slots per epoch is small, this can result in surprising behavior.
|
||||
|
||||
Example 1: Assume there are 64 slots per epoch. The EAH start offset is 16
|
||||
and the EAH stop offset is 48. The difference is 32. So when Bank 48 is
|
||||
frozen before Bank 16 is rooted, a new EAH request has not yet been requested;
|
||||
the EAH from the previous epoch is still valid and will be used by Bank 48.
|
||||
Example: Assume there are 40 slots per epoch. The EAH start offset is 10, and
|
||||
the EAH stop offset is 30. When Bank 30 is frozen it will include the EAH in
|
||||
its hash. However, Bank 10 has not yet been rooted, so a new EAH has not been
|
||||
calculated for this epoch. This means Bank 30 will have included the EAH *from
|
||||
the previous epoch* in its hash.
|
||||
|
||||
Example 2: Assume there are 66 slots per epoch, then the EAH start offset is
|
||||
still 16 and the EAH stop offset is now 49. The difference is now 33. When
|
||||
Bank 49 is frozen, Bank 16 will already have been rooted, and thus sent an EAH
|
||||
request; Bank 49 will wait for the new EAH calculation to complete.
|
||||
Later, when Bank 10 is rooted, it will request a new EAH be calculated. If a
|
||||
snapshot is taken for Bank 12 (or any bank between 10 and 30), it will include
|
||||
the EAH *from this epoch*. If a node loads the snapshot from Bank 12, once it
|
||||
gets to freezing Bank 30, it will end up with a different bank hash since it
|
||||
included the EAH from this epoch (versus the other node's Bank 30 included the
|
||||
EAH from the previous epoch). Different bank hashes will result in consensus
|
||||
failures.
|
||||
|
||||
Example 3: Assume there are 32 slots per epoch (the minimum allowed). The EAH
|
||||
start offset is 8, and the EAH stop offset is 24. Similar to Example 1, Bank
|
||||
24 is frozen around when Bank 24 *of the previous epoch* is rooted. This
|
||||
ensures that when the EAH is stored, it'll be for the previous epoch.
|
||||
The above example is clearly bad. It can be observed that short epochs only
|
||||
occur (1) during warmup, or (2) in tests. Real clusters have much longer
|
||||
epochs (432,000 slots by default).
|
||||
|
||||
In these examples the observed behavior of the EAH is different than when using
|
||||
the normal 432,000 slots per epoch. The EAH is still valid and correct with a
|
||||
small number of slots per epoch; it now has a delay of one epoch. Since the
|
||||
epochs themselves can be much faster, security is not reduced.
|
||||
Tests can be fixed as needed; that leaves fixing warmup. Since warmup is
|
||||
transient, we disable EAH until slots-per-epoch is large enough. More
|
||||
precisely, we disable EAH until the `calculation window` is big enough. During
|
||||
warmup, slots-per-epoch doubles each epoch until reaching the desired number,
|
||||
so only a few epochs will skip EAH (which also is a small total number of
|
||||
slots).
|
||||
|
||||
|
||||
#### Warping
|
||||
|
|
|
@ -76,6 +76,7 @@ libsecp256k1 = "0.6.0"
|
|||
rand_chacha = "0.2.2"
|
||||
solana-logger = { path = "../logger", version = "=1.15.0" }
|
||||
static_assertions = "1.1.0"
|
||||
test-case = "2.1.0"
|
||||
|
||||
[package.metadata.docs.rs]
|
||||
targets = ["x86_64-unknown-linux-gnu"]
|
||||
|
|
|
@ -6748,6 +6748,10 @@ impl Bank {
|
|||
return false;
|
||||
}
|
||||
|
||||
if !epoch_accounts_hash::is_enabled_this_epoch(self) {
|
||||
return false;
|
||||
}
|
||||
|
||||
let stop_slot = epoch_accounts_hash::calculation_stop(self);
|
||||
self.parent_slot() < stop_slot && self.slot() >= stop_slot
|
||||
}
|
||||
|
@ -7722,15 +7726,15 @@ impl Bank {
|
|||
/// EAH *must* be included. This means if an EAH calculation is currently in-flight we will
|
||||
/// wait for it to complete.
|
||||
pub fn get_epoch_accounts_hash_to_serialize(&self) -> Option<EpochAccountsHash> {
|
||||
let (epoch_accounts_hash, measure) = measure!(
|
||||
epoch_accounts_hash::is_in_calculation_window(self).then(|| {
|
||||
let should_get_epoch_accounts_hash = epoch_accounts_hash::is_enabled_this_epoch(self)
|
||||
&& epoch_accounts_hash::is_in_calculation_window(self);
|
||||
let (epoch_accounts_hash, measure) = measure!(should_get_epoch_accounts_hash.then(|| {
|
||||
self.rc
|
||||
.accounts
|
||||
.accounts_db
|
||||
.epoch_accounts_hash_manager
|
||||
.wait_get_epoch_accounts_hash()
|
||||
})
|
||||
);
|
||||
}));
|
||||
|
||||
datapoint_info!(
|
||||
"bank-get_epoch_accounts_hash_to_serialize",
|
||||
|
|
|
@ -618,6 +618,10 @@ impl BankForks {
|
|||
return false;
|
||||
}
|
||||
|
||||
if !epoch_accounts_hash::is_enabled_this_epoch(bank) {
|
||||
return false;
|
||||
}
|
||||
|
||||
let start_slot = epoch_accounts_hash::calculation_start(bank);
|
||||
bank.slot() > self.last_accounts_hash_slot
|
||||
&& bank.parent_slot() < start_slot
|
||||
|
|
|
@ -2,9 +2,36 @@
|
|||
|
||||
use {
|
||||
crate::bank::Bank,
|
||||
solana_sdk::clock::{Epoch, Slot},
|
||||
solana_sdk::{
|
||||
clock::{Epoch, Slot},
|
||||
vote::state::MAX_LOCKOUT_HISTORY,
|
||||
},
|
||||
};
|
||||
|
||||
/// Is the EAH enabled this Epoch?
|
||||
#[must_use]
|
||||
pub fn is_enabled_this_epoch(bank: &Bank) -> bool {
|
||||
// The EAH calculation "start" is based on when a bank is *rooted*, and "stop" is based on when a
|
||||
// bank is *frozen*. Banks are rooted after exceeding the maximum lockout, so there is a delay
|
||||
// of at least `maximum lockout` number of slots the EAH calculation must take into
|
||||
// consideration. To ensure an EAH calculation has started by the time that calculation is
|
||||
// needed, the calculation interval must be at least `maximum lockout` plus some buffer to
|
||||
// handle when banks are not rooted every single slot.
|
||||
const MINIMUM_CALCULATION_INTERVAL: u64 =
|
||||
(MAX_LOCKOUT_HISTORY as u64).saturating_add(CALCULATION_INTERVAL_BUFFER);
|
||||
// The calculation buffer is a best-attempt at median worst-case for how many bank ancestors can
|
||||
// accumulate before the bank is rooted.
|
||||
// [brooks] On Wed Oct 26 12:15:21 2022, over the previous 6 hour period against mainnet-beta,
|
||||
// I saw multiple validators reporting metrics in the 120s for `total_parent_banks`. The mean
|
||||
// is 2 to 3, but a number of nodes also reported values in the low 20s. A value of 150 should
|
||||
// capture the majority of validators, and will not be an issue for clusters running with
|
||||
// normal slots-per-epoch; this really will only affect tests and epoch schedule warmup.
|
||||
const CALCULATION_INTERVAL_BUFFER: u64 = 150;
|
||||
|
||||
let calculation_interval = calculation_interval(bank);
|
||||
calculation_interval >= MINIMUM_CALCULATION_INTERVAL
|
||||
}
|
||||
|
||||
/// Calculation of the EAH occurs once per epoch. All nodes in the cluster must agree on which
|
||||
/// slot the EAH is based on. This slot will be at an offset into the epoch, and referred to as
|
||||
/// the "start" slot for the EAH calculation.
|
||||
|
@ -38,13 +65,19 @@ pub fn calculation_stop(bank: &Bank) -> Slot {
|
|||
calculation_info(bank).calculation_stop
|
||||
}
|
||||
|
||||
/// Is this bank in the calculation window?
|
||||
/// Get the number of slots from EAH calculation start to stop; known as the calculation interval
|
||||
#[must_use]
|
||||
#[inline]
|
||||
pub fn calculation_interval(bank: &Bank) -> u64 {
|
||||
calculation_info(bank).calculation_interval
|
||||
}
|
||||
|
||||
/// Is this bank in the calculation window?
|
||||
#[must_use]
|
||||
pub fn is_in_calculation_window(bank: &Bank) -> bool {
|
||||
let bank_slot = bank.slot();
|
||||
let info = calculation_info(bank);
|
||||
bank_slot >= info.calculation_start && bank_slot < info.calculation_stop
|
||||
let range = info.calculation_start..info.calculation_stop;
|
||||
range.contains(&bank.slot())
|
||||
}
|
||||
|
||||
/// For the epoch that `bank` is in, get all the EAH calculation information
|
||||
|
@ -60,6 +93,7 @@ pub fn calculation_info(bank: &Bank) -> CalculationInfo {
|
|||
let last_slot_in_epoch = epoch_schedule.get_last_slot_in_epoch(epoch);
|
||||
let calculation_start = first_slot_in_epoch.saturating_add(calculation_offset_start);
|
||||
let calculation_stop = first_slot_in_epoch.saturating_add(calculation_offset_stop);
|
||||
let calculation_interval = calculation_offset_stop.saturating_sub(calculation_offset_start);
|
||||
|
||||
CalculationInfo {
|
||||
epoch,
|
||||
|
@ -70,6 +104,7 @@ pub fn calculation_info(bank: &Bank) -> CalculationInfo {
|
|||
calculation_offset_stop,
|
||||
calculation_start,
|
||||
calculation_stop,
|
||||
calculation_interval,
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -103,6 +138,8 @@ pub struct CalculationInfo {
|
|||
pub calculation_start: Slot,
|
||||
/// Absolute slot where the EAH calculation stops
|
||||
pub calculation_stop: Slot,
|
||||
/// Number of slots from EAH calculation start to stop
|
||||
pub calculation_interval: u64,
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
|
@ -110,8 +147,23 @@ mod tests {
|
|||
use {
|
||||
super::*,
|
||||
solana_sdk::{epoch_schedule::EpochSchedule, genesis_config::GenesisConfig},
|
||||
test_case::test_case,
|
||||
};
|
||||
|
||||
#[test_case( 32 => false)] // minimum slots per epoch
|
||||
#[test_case( 361 => false)] // below minimum slots per epoch *for EAH*
|
||||
#[test_case( 362 => false)] // minimum slots per epoch *for EAH*
|
||||
#[test_case( 8_192 => true)] // default dev slots per epoch
|
||||
#[test_case(432_000 => true)] // default slots per epoch
|
||||
fn test_is_enabled_this_epoch(slots_per_epoch: u64) -> bool {
|
||||
let genesis_config = GenesisConfig {
|
||||
epoch_schedule: EpochSchedule::custom(slots_per_epoch, slots_per_epoch, false),
|
||||
..GenesisConfig::default()
|
||||
};
|
||||
let bank = Bank::new_for_tests(&genesis_config);
|
||||
is_enabled_this_epoch(&bank)
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_calculation_offset_bounds() {
|
||||
let bank = Bank::default_for_tests();
|
||||
|
@ -130,7 +182,7 @@ mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_calculation_info() {
|
||||
for slots_per_epoch in [32, 100, 65_536, 432_000, 123_456_789] {
|
||||
for slots_per_epoch in [32, 361, 362, 8_192, 65_536, 432_000, 123_456_789] {
|
||||
for warmup in [false, true] {
|
||||
let genesis_config = GenesisConfig {
|
||||
epoch_schedule: EpochSchedule::custom(slots_per_epoch, slots_per_epoch, warmup),
|
||||
|
@ -140,9 +192,10 @@ mod tests {
|
|||
assert!(info.calculation_offset_start < info.calculation_offset_stop);
|
||||
assert!(info.calculation_offset_start < info.slots_per_epoch);
|
||||
assert!(info.calculation_offset_stop < info.slots_per_epoch);
|
||||
assert!(info.calculation_start < info.calculation_stop,);
|
||||
assert!(info.calculation_start > info.first_slot_in_epoch,);
|
||||
assert!(info.calculation_stop < info.last_slot_in_epoch,);
|
||||
assert!(info.calculation_start < info.calculation_stop);
|
||||
assert!(info.calculation_start > info.first_slot_in_epoch);
|
||||
assert!(info.calculation_stop < info.last_slot_in_epoch);
|
||||
assert!(info.calculation_interval > 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -223,7 +223,8 @@ fn test_bank_serialize_style(
|
|||
initial_epoch_accounts_hash: bool,
|
||||
) {
|
||||
solana_logger::setup();
|
||||
let (genesis_config, _) = create_genesis_config(500);
|
||||
let (mut genesis_config, _) = create_genesis_config(500);
|
||||
genesis_config.epoch_schedule = EpochSchedule::custom(400, 400, false);
|
||||
let bank0 = Arc::new(Bank::new_for_tests(&genesis_config));
|
||||
let eah_start_slot = epoch_accounts_hash::calculation_start(&bank0);
|
||||
let bank1 = Bank::new_from_parent(&bank0, &Pubkey::default(), 1);
|
||||
|
|
Loading…
Reference in New Issue