From 8051aea88e443f76d44f6ac4d6c0195f1ee26335 Mon Sep 17 00:00:00 2001 From: "Jeff Washington (jwash)" Date: Mon, 3 Apr 2023 08:51:59 -0500 Subject: [PATCH] disk index: store single slot list in index entry (#31002) --- bucket_map/src/bucket.rs | 171 ++++++++++++++++++++----------- bucket_map/src/bucket_storage.rs | 4 +- bucket_map/src/index_entry.rs | 152 +++++++++++++++++---------- 3 files changed, 209 insertions(+), 118 deletions(-) diff --git a/bucket_map/src/bucket.rs b/bucket_map/src/bucket.rs index 9a4c649db..be0d1a831 100644 --- a/bucket_map/src/bucket.rs +++ b/bucket_map/src/bucket.rs @@ -6,6 +6,7 @@ use { bucket_storage::{BucketOccupied, BucketStorage, DEFAULT_CAPACITY_POW2}, index_entry::{ DataBucket, IndexBucket, IndexEntry, IndexEntryPlaceInBucket, MultipleSlots, + OccupiedEnum, }, MaxSearch, RefCount, }, @@ -78,8 +79,14 @@ impl Reallocated { } } +/// when updating the index, this keeps track of the previous data entry which will need to be freed +struct DataFileEntryToFree { + bucket_ix: usize, + location: u64, +} + // >= 2 instances of BucketStorage per 'bucket' in the bucket map. 
1 for index, >= 1 for data -pub struct Bucket { +pub struct Bucket { drives: Arc>, //index pub index: BucketStorage>, @@ -263,7 +270,7 @@ impl<'b, T: Clone + Copy + 'static> Bucket { pub fn try_write( &mut self, key: &Pubkey, - data: impl Iterator, + mut data: impl Iterator, data_len: usize, ref_count: RefCount, ) -> Result<(), BucketMapError> { @@ -287,71 +294,115 @@ impl<'b, T: Clone + Copy + 'static> Bucket { }; elem.set_ref_count(&mut self.index, ref_count); - let current_multiple_slots = elem.get_multiple_slots(&self.index); - let bucket_ix = current_multiple_slots.data_bucket_ix(); let num_slots = data_len as u64; - if best_fit_bucket == bucket_ix && current_multiple_slots.num_slots() > 0 { - let current_bucket = &mut self.data[bucket_ix as usize]; - // in place update - let elem_loc = current_multiple_slots.data_loc(current_bucket); - assert!(!current_bucket.is_free(elem_loc)); - let slice: &mut [T] = current_bucket.get_mut_cell_slice(elem_loc, data_len as u64); - let current_multiple_slots = elem.get_multiple_slots_mut(&mut self.index); - current_multiple_slots.set_num_slots(num_slots); - - slice.iter_mut().zip(data).for_each(|(dest, src)| { - *dest = *src; - }); - Ok(()) - } else { - // need to move the allocation to a best fit spot - let best_bucket = &self.data[best_fit_bucket as usize]; - let current_bucket = &self.data[bucket_ix as usize]; - let cap_power = best_bucket.capacity_pow2; - let cap = best_bucket.capacity(); - let pos = thread_rng().gen_range(0, cap); - // max search is increased here by a lot for this search. The idea is that we just have to find an empty bucket somewhere. - // We don't mind waiting on a new write (by searching longer). Writing is done in the background only. - // Wasting space by doubling the bucket size is worse behavior. We expect more - // updates and fewer inserts, so we optimize for more compact data. - // We can accomplish this by increasing how many locations we're willing to search for an empty data cell. 
- // For the index bucket, it is more like a hash table and we have to exhaustively search 'max_search' to prove an item does not exist. - // And we do have to support the 'does not exist' case with good performance. So, it makes sense to grow the index bucket when it is too large. - // For data buckets, the offset is stored in the index, so it is directly looked up. So, the only search is on INSERT or update to a new sized value. - for i in pos..pos + (max_search * 10).min(cap) { - let ix = i % cap; - if best_bucket.is_free(ix) { - let elem_loc = current_multiple_slots.data_loc(current_bucket); - let old_slots = current_multiple_slots.num_slots(); - let multiple_slots = elem.get_multiple_slots_mut(&mut self.index); - multiple_slots.set_storage_offset(ix); - multiple_slots - .set_storage_capacity_when_created_pow2(best_bucket.capacity_pow2); - multiple_slots.set_num_slots(num_slots); - if old_slots > 0 { - let current_bucket = &mut self.data[bucket_ix as usize]; - current_bucket.free(elem_loc); - } - //debug!( "DATA ALLOC {:?} {} {} {}", key, elem.data_location, best_bucket.capacity, elem_uid ); - if num_slots > 0 { - let best_bucket = &mut self.data[best_fit_bucket as usize]; - best_bucket.occupy(ix, false).unwrap(); - let slice = best_bucket.get_mut_cell_slice(ix, num_slots); - slice.iter_mut().zip(data).for_each(|(dest, src)| { - *dest = *src; - }); - } - return Ok(()); - } + if num_slots <= 1 { + // new data stored should be stored in IndexEntry and NOT in data file + // new data len is 0 or 1 + if let OccupiedEnum::MultipleSlots(multiple_slots) = + elem.get_slot_count_enum(&self.index) + { + let bucket_ix = multiple_slots.data_bucket_ix() as usize; + // free the entry in the data bucket the data was previously stored in + let loc = multiple_slots.data_loc(&self.data[bucket_ix]); + self.data[bucket_ix].free(loc); } - Err(BucketMapError::DataNoSpace((best_fit_bucket, cap_power))) + elem.set_slot_count_enum_value( + &mut self.index, + if let Some(single_element) 
= data.next() { + OccupiedEnum::OneSlotInIndex(single_element) + } else { + OccupiedEnum::ZeroSlots + }, + ); + return Ok(()); } + + // storing the slot list requires using the data file + let mut old_data_entry_to_free = None; + // see if old elements were in a data file + if let Some(multiple_slots) = elem.get_multiple_slots_mut(&mut self.index) { + let bucket_ix = multiple_slots.data_bucket_ix() as usize; + let current_bucket = &mut self.data[bucket_ix]; + let elem_loc = multiple_slots.data_loc(current_bucket); + + if best_fit_bucket == bucket_ix as u64 { + // in place update in same data file + assert!(!current_bucket.is_free(elem_loc)); + let slice: &mut [T] = current_bucket.get_mut_cell_slice(elem_loc, data_len as u64); + multiple_slots.set_num_slots(num_slots); + + slice.iter_mut().zip(data).for_each(|(dest, src)| { + *dest = *src; + }); + return Ok(()); + } + + // not updating in place, so remember old entry to free + // Wait to free until we make sure we don't have to resize the best_fit_bucket + old_data_entry_to_free = Some(DataFileEntryToFree { + bucket_ix, + location: elem_loc, + }); + } + + // need to move the allocation to a best fit spot + let best_bucket = &self.data[best_fit_bucket as usize]; + let cap_power = best_bucket.capacity_pow2; + let cap = best_bucket.capacity(); + let pos = thread_rng().gen_range(0, cap); + let mut success = false; + // max search is increased here by a lot for this search. The idea is that we just have to find an empty bucket somewhere. + // We don't mind waiting on a new write (by searching longer). Writing is done in the background only. + // Wasting space by doubling the bucket size is worse behavior. We expect more + // updates and fewer inserts, so we optimize for more compact data. + // We can accomplish this by increasing how many locations we're willing to search for an empty data cell. 
+ // For the index bucket, it is more like a hash table and we have to exhaustively search 'max_search' to prove an item does not exist. + // And we do have to support the 'does not exist' case with good performance. So, it makes sense to grow the index bucket when it is too large. + // For data buckets, the offset is stored in the index, so it is directly looked up. So, the only search is on INSERT or update to a new sized value. + for i in pos..pos + (max_search * 10).min(cap) { + let ix = i % cap; + if best_bucket.is_free(ix) { + let mut multiple_slots = MultipleSlots::default(); + multiple_slots.set_storage_offset(ix); + multiple_slots.set_storage_capacity_when_created_pow2(best_bucket.capacity_pow2); + multiple_slots.set_num_slots(num_slots); + elem.set_slot_count_enum_value( + &mut self.index, + OccupiedEnum::MultipleSlots(&multiple_slots), + ); + //debug!( "DATA ALLOC {:?} {} {} {}", key, elem.data_location, best_bucket.capacity, elem_uid ); + if num_slots > 0 { + // copy slotlist into the data bucket + let best_bucket = &mut self.data[best_fit_bucket as usize]; + best_bucket.occupy(ix, false).unwrap(); + let slice = best_bucket.get_mut_cell_slice(ix, num_slots); + slice.iter_mut().zip(data).for_each(|(dest, src)| { + *dest = *src; + }); + } + success = true; + break; + } + } + if !success { + return Err(BucketMapError::DataNoSpace((best_fit_bucket, cap_power))); + } + if let Some(DataFileEntryToFree { + bucket_ix, + location, + }) = old_data_entry_to_free + { + // free the entry in the data bucket the data was previously stored in + self.data[bucket_ix].free(location); + } + Ok(()) } pub fn delete_key(&mut self, key: &Pubkey) { if let Some((elem, elem_ix)) = self.find_index_entry(key) { - let multiple_slots = elem.get_multiple_slots_mut(&mut self.index); - if multiple_slots.num_slots() > 0 { + if let OccupiedEnum::MultipleSlots(multiple_slots) = + elem.get_slot_count_enum(&self.index) + { let ix = multiple_slots.data_bucket_ix() as usize; let data_bucket = 
&self.data[ix]; let loc = multiple_slots.data_loc(data_bucket); diff --git a/bucket_map/src/bucket_storage.rs b/bucket_map/src/bucket_storage.rs index a8e9b953a..89a41fbee 100644 --- a/bucket_map/src/bucket_storage.rs +++ b/bucket_map/src/bucket_storage.rs @@ -190,13 +190,13 @@ impl BucketStorage { unsafe { slice.get_unchecked_mut(0) } } - pub(crate) fn get_mut_from_parts(item_slice: &mut [u8]) -> &mut T { + pub(crate) fn get_mut_from_parts(item_slice: &mut [u8]) -> &mut T { debug_assert!(std::mem::size_of::() <= item_slice.len()); let item = item_slice.as_mut_ptr() as *mut T; unsafe { &mut *item } } - pub(crate) fn get_from_parts(item_slice: &[u8]) -> &T { + pub(crate) fn get_from_parts(item_slice: &[u8]) -> &T { debug_assert!(std::mem::size_of::() <= item_slice.len()); let item = item_slice.as_ptr() as *const T; unsafe { &*item } diff --git a/bucket_map/src/index_entry.rs b/bucket_map/src/index_entry.rs index 1a68bb822..5f7157934 100644 --- a/bucket_map/src/index_entry.rs +++ b/bucket_map/src/index_entry.rs @@ -44,12 +44,12 @@ pub struct IndexBucketUsingRefCountBits { _phantom: PhantomData<&'static T>, } -impl BucketOccupied for IndexBucketUsingRefCountBits { +impl BucketOccupied for IndexBucketUsingRefCountBits { fn occupy(&mut self, element: &mut [u8], ix: usize) { assert!(self.is_free(element, ix)); let entry: &mut IndexEntry = BucketStorage::>::get_mut_from_parts(element); - entry.set_slot_count_enum_value(OccupiedEnum::Occupied); + entry.set_slot_count_enum_value(OccupiedEnum::ZeroSlots); } fn free(&mut self, element: &mut [u8], ix: usize) { assert!(!self.is_free(element, ix)); @@ -86,11 +86,11 @@ pub struct IndexEntryPlaceInBucket { #[derive(Copy, Clone)] /// one instance of this per item in the index /// stored in the index bucket -pub struct IndexEntry { +pub struct IndexEntry { pub(crate) key: Pubkey, // can this be smaller if we have reduced the keys into buckets already? 
packed_ref_count: PackedRefCount, - multiple_slots: MultipleSlots, - _phantom: PhantomData<&'static T>, + /// depends on the contents of ref_count.slot_count_enum + contents: SingleElementOrMultipleSlots, } /// 62 bits available for ref count @@ -173,32 +173,66 @@ impl MultipleSlots { } } -#[repr(u8)] -#[derive(Debug, Eq, PartialEq)] -pub(crate) enum OccupiedEnum { - /// this spot is free (ie. not occupied) - Free = 0, - /// this spot is occupied - Occupied = 1, +#[repr(C)] +#[derive(Copy, Clone)] +pub(crate) union SingleElementOrMultipleSlots { + /// the slot list contains a single element. No need for an entry in the data file. + /// The element itself is stored in place in the index entry + pub(crate) single_element: T, + /// the slot list contains more than one element. This contains the reference to the data file. + pub(crate) multiple_slots: MultipleSlots, } -impl IndexEntry { - /// enum value stored in 2 spare bits taken from ref_count - fn get_slot_count_enum(&self) -> OccupiedEnum { - match self.packed_ref_count.slot_count_enum() { - 0 => OccupiedEnum::Free, - 1 => OccupiedEnum::Occupied, - _ => { - panic!("unexpected value"); +#[repr(u8)] +#[derive(Debug, Eq, PartialEq)] +pub(crate) enum OccupiedEnum<'a, T> { + /// this spot is not occupied. + /// ALL other enum values ARE occupied. 
+ Free = 0, + /// zero slots in the slot list + ZeroSlots = 1, + /// one slot in the slot list, it is stored in the index + OneSlotInIndex(&'a T) = 2, + /// data is stored in data file + MultipleSlots(&'a MultipleSlots) = 3, +} + +impl IndexEntry { + pub(crate) fn get_slot_count_enum(&self) -> OccupiedEnum<'_, T> { + unsafe { + match self.packed_ref_count.slot_count_enum() { + 0 => OccupiedEnum::Free, + 1 => OccupiedEnum::ZeroSlots, + 2 => OccupiedEnum::OneSlotInIndex(&self.contents.single_element), + 3 => OccupiedEnum::MultipleSlots(&self.contents.multiple_slots), + _ => { + panic!("unexpected value"); + } } } } - /// enum value stored in 2 spare bits taken from ref_count - fn set_slot_count_enum_value(&mut self, value: OccupiedEnum) { + pub(crate) fn get_multiple_slots_mut(&mut self) -> Option<&mut MultipleSlots> { + unsafe { + match self.packed_ref_count.slot_count_enum() { + 3 => Some(&mut self.contents.multiple_slots), + _ => None, + } + } + } + + pub(crate) fn set_slot_count_enum_value<'a>(&'a mut self, value: OccupiedEnum<'a, T>) { self.packed_ref_count.set_slot_count_enum(match value { OccupiedEnum::Free => 0, - OccupiedEnum::Occupied => 1, + OccupiedEnum::ZeroSlots => 1, + OccupiedEnum::OneSlotInIndex(single_element) => { + self.contents.single_element = *single_element; + 2 + } + OccupiedEnum::MultipleSlots(multiple_slots) => { + self.contents.multiple_slots = *multiple_slots; + 3 + } }); } } @@ -212,42 +246,36 @@ struct PackedStorage { offset: B56, } -impl IndexEntryPlaceInBucket { +impl IndexEntryPlaceInBucket { pub fn init(&self, index_bucket: &mut BucketStorage>, pubkey: &Pubkey) { + self.set_slot_count_enum_value(index_bucket, OccupiedEnum::ZeroSlots); let index_entry = index_bucket.get_mut::>(self.ix); index_entry.key = *pubkey; index_entry.packed_ref_count.set_ref_count(0); - index_entry.multiple_slots = MultipleSlots::default(); - } - - pub(crate) fn get_multiple_slots<'a>( - &self, - index_bucket: &'a BucketStorage>, - ) -> &'a MultipleSlots { - 
&index_bucket.get::>(self.ix).multiple_slots } + /// return Some(MultipleSlots) if this item's data is stored in the data file pub(crate) fn get_multiple_slots_mut<'a>( &self, index_bucket: &'a mut BucketStorage>, - ) -> &'a mut MultipleSlots { - &mut index_bucket - .get_mut::>(self.ix) - .multiple_slots + ) -> Option<&'a mut MultipleSlots> { + let index_entry = index_bucket.get_mut::>(self.ix); + index_entry.get_multiple_slots_mut() } - pub(crate) fn get_slot_count_enum( + pub(crate) fn get_slot_count_enum<'a>( &self, - index_bucket: &BucketStorage>, - ) -> OccupiedEnum { + index_bucket: &'a BucketStorage>, + ) -> OccupiedEnum<'a, T> { let index_entry = index_bucket.get::>(self.ix); index_entry.get_slot_count_enum() } - pub(crate) fn set_slot_count_enum_value( + /// make this index entry reflect `value` + pub(crate) fn set_slot_count_enum_value<'a>( &self, - index_bucket: &mut BucketStorage>, - value: OccupiedEnum, + index_bucket: &'a mut BucketStorage>, + value: OccupiedEnum<'a, T>, ) { let index_entry = index_bucket.get_mut::>(self.ix); index_entry.set_slot_count_enum_value(value); @@ -260,22 +288,34 @@ impl IndexEntryPlaceInBucket { pub fn read_value<'a>( &self, - index_bucket: &BucketStorage>, + index_bucket: &'a BucketStorage>, data_buckets: &'a [BucketStorage], ) -> Option<(&'a [T], RefCount)> { - let multiple_slots = self.get_multiple_slots(index_bucket); - let num_slots = multiple_slots.num_slots(); - let slice = if num_slots > 0 { - let data_bucket_ix = multiple_slots.data_bucket_ix(); - let data_bucket = &data_buckets[data_bucket_ix as usize]; - let loc = multiple_slots.data_loc(data_bucket); - assert!(!data_bucket.is_free(loc)); - data_bucket.get_cell_slice(loc, num_slots) - } else { - // num_slots is 0. This means we don't have an actual allocation. - &[] - }; - Some((slice, self.ref_count(index_bucket))) + Some(( + match self.get_slot_count_enum(index_bucket) { + OccupiedEnum::ZeroSlots => { + // num_slots is 0. 
This means we don't have an actual allocation. + &[] + } + OccupiedEnum::OneSlotInIndex(single_element) => { + // only element is stored in the index entry + // Note that the lifetime comes from `index_bucket` here. + std::slice::from_ref(single_element) + } + OccupiedEnum::MultipleSlots(multiple_slots) => { + // data is in data file, so return a ref to that data + let data_bucket_ix = multiple_slots.data_bucket_ix(); + let data_bucket = &data_buckets[data_bucket_ix as usize]; + let loc = multiple_slots.data_loc(data_bucket); + assert!(!data_bucket.is_free(loc)); + data_bucket.get_cell_slice::(loc, multiple_slots.num_slots) + } + _ => { + panic!("trying to read data from a free entry"); + } + }, + self.ref_count(index_bucket), + )) } pub fn new(ix: u64) -> Self {