at startup, keep duplicates in in-memory index since they will be cleaned shortly (#30736)

at startup, keep duplicates in in-memory index since they will be cleaned soon
This commit is contained in:
Jeff Washington (jwash) 2023-03-22 10:33:39 -05:00 committed by GitHub
parent 81ef2a0d75
commit 9a1d5ea95d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 34 additions and 25 deletions

View File

@ -9115,7 +9115,7 @@ impl AccountsDb {
// get duplicate keys from acct idx. We have to wait until we've finished flushing. // get duplicate keys from acct idx. We have to wait until we've finished flushing.
for (slot, key) in self for (slot, key) in self
.accounts_index .accounts_index
.retrieve_duplicate_keys_from_startup() .populate_and_retrieve_duplicate_keys_from_startup()
.into_iter() .into_iter()
.flatten() .flatten()
{ {

View File

@ -1685,11 +1685,14 @@ impl<T: IndexValue, U: DiskIndexValue + From<T> + Into<T>> AccountsIndex<T, U> {
} }
/// return Vec<Vec<>> because the internal vecs are already allocated per bin /// return Vec<Vec<>> because the internal vecs are already allocated per bin
pub fn retrieve_duplicate_keys_from_startup(&self) -> Vec<Vec<(Slot, Pubkey)>> { pub(crate) fn populate_and_retrieve_duplicate_keys_from_startup(
&self,
) -> Vec<Vec<(Slot, Pubkey)>> {
(0..self.bins()) (0..self.bins())
.into_par_iter()
.map(|pubkey_bin| { .map(|pubkey_bin| {
let r_account_maps = &self.account_maps[pubkey_bin]; let r_account_maps = &self.account_maps[pubkey_bin];
r_account_maps.retrieve_duplicate_keys_from_startup() r_account_maps.populate_and_retrieve_duplicate_keys_from_startup()
}) })
.collect() .collect()
} }
@ -2699,6 +2702,7 @@ pub mod tests {
index.set_startup(Startup::Normal); index.set_startup(Startup::Normal);
} }
assert!(gc.is_empty()); assert!(gc.is_empty());
index.populate_and_retrieve_duplicate_keys_from_startup();
for lock in &[false, true] { for lock in &[false, true] {
let read_lock = if *lock { let read_lock = if *lock {

View File

@ -13,7 +13,7 @@ use {
solana_measure::measure::Measure, solana_measure::measure::Measure,
solana_sdk::{clock::Slot, pubkey::Pubkey}, solana_sdk::{clock::Slot, pubkey::Pubkey},
std::{ std::{
collections::{hash_map::Entry, HashMap}, collections::{hash_map::Entry, HashMap, HashSet},
fmt::Debug, fmt::Debug,
ops::{Bound, RangeBounds, RangeInclusive}, ops::{Bound, RangeBounds, RangeInclusive},
sync::{ sync::{
@ -128,8 +128,11 @@ pub enum InsertNewEntryResults {
struct StartupInfo<T: IndexValue> { struct StartupInfo<T: IndexValue> {
/// entries to add next time we are flushing to disk /// entries to add next time we are flushing to disk
insert: Vec<(Slot, Pubkey, T)>, insert: Vec<(Slot, Pubkey, T)>,
/// pubkeys that were found to have duplicate index entries /// entries that were found to have duplicate index entries.
duplicates: Vec<(Slot, Pubkey)>, /// When all entries have been inserted, these can be resolved and held in memory.
duplicates: Vec<(Slot, Pubkey, T)>,
/// pubkeys that were already added to disk and later found to be duplicates,
duplicates_put_on_disk: HashSet<(Slot, Pubkey)>,
} }
#[derive(Default, Debug)] #[derive(Default, Debug)]
@ -1040,7 +1043,8 @@ impl<T: IndexValue, U: DiskIndexValue + From<T> + Into<T>> InMemAccountsIndex<T,
); );
drop(map_internal); drop(map_internal);
let mut duplicates = vec![]; // this fn should only be called from a single thread, so holding the lock is fine
let mut startup_info = self.startup_info.lock().unwrap();
// merge all items into the disk index now // merge all items into the disk index now
let disk = self.bucket.as_ref().unwrap(); let disk = self.bucket.as_ref().unwrap();
@ -1050,21 +1054,16 @@ impl<T: IndexValue, U: DiskIndexValue + From<T> + Into<T>> InMemAccountsIndex<T,
let new_ref_count = u64::from(!v.is_cached()); let new_ref_count = u64::from(!v.is_cached());
disk.update(&k, |current| { disk.update(&k, |current| {
match current { match current {
Some((current_slot_list, mut ref_count)) => { Some((current_slot_list, ref_count)) => {
// merge this in, mark as duplicate // already on disk, so remember the new (slot, info) for later
duplicates.push((slot, k)); startup_info.duplicates.push((slot, k, entry.1));
if current_slot_list.len() == 1 { if let Some((slot, _)) = current_slot_list.first() {
// accurately account for there being a duplicate for the first entry that was previously added to the disk index. // accurately account for there being a duplicate for the first entry that was previously added to the disk index.
// That entry could not have known yet that it was a duplicate. // That entry could not have known yet that it was a duplicate.
// It is important to capture each slot with a duplicate because of slot limits applied to clean. // It is important to capture each slot with a duplicate because of slot limits applied to clean.
let first_entry_slot = current_slot_list[0].0; startup_info.duplicates_put_on_disk.insert((*slot, k));
duplicates.push((first_entry_slot, k));
} }
let mut slot_list = Vec::with_capacity(current_slot_list.len() + 1); Some((current_slot_list.to_vec(), ref_count))
slot_list.extend_from_slice(current_slot_list);
slot_list.push((entry.0, entry.1.into())); // will never be from the same slot that already exists in the list
ref_count += new_ref_count;
Some((slot_list, ref_count))
} }
None => { None => {
count += 1; count += 1;
@ -1075,22 +1074,28 @@ impl<T: IndexValue, U: DiskIndexValue + From<T> + Into<T>> InMemAccountsIndex<T,
}); });
}); });
self.stats().inc_insert_count(count); self.stats().inc_insert_count(count);
self.startup_info
.lock()
.unwrap()
.duplicates
.append(&mut duplicates);
} }
/// pull out all duplicate pubkeys from 'startup_info' /// pull out all duplicate pubkeys from 'startup_info'
/// duplicate pubkeys have a slot list with len > 1 /// duplicate pubkeys have a slot list with len > 1
/// These were collected for this bin when we did batch inserts in the bg flush threads. /// These were collected for this bin when we did batch inserts in the bg flush threads.
pub fn retrieve_duplicate_keys_from_startup(&self) -> Vec<(Slot, Pubkey)> { /// Insert these into the in-mem index, then return the duplicate (Slot, Pubkey)
pub(crate) fn populate_and_retrieve_duplicate_keys_from_startup(&self) -> Vec<(Slot, Pubkey)> {
let mut write = self.startup_info.lock().unwrap(); let mut write = self.startup_info.lock().unwrap();
// in order to return accurate and complete duplicates, we must have nothing left remaining to insert // in order to return accurate and complete duplicates, we must have nothing left remaining to insert
assert!(write.insert.is_empty()); assert!(write.insert.is_empty());
std::mem::take(&mut write.duplicates) let duplicates = std::mem::take(&mut write.duplicates);
let duplicates_put_on_disk = std::mem::take(&mut write.duplicates_put_on_disk);
drop(write);
duplicates_put_on_disk
.into_iter()
.chain(duplicates.into_iter().map(|(slot, key, info)| {
let entry = PreAllocatedAccountMapEntry::new(slot, info, &self.storage, true);
self.insert_new_entry_if_missing_with_lock(key, entry);
(slot, key)
}))
.collect()
} }
/// synchronize the in-mem index with the disk index /// synchronize the in-mem index with the disk index