From 3dcb3827314517eeec6018cbe28e1ae535437f76 Mon Sep 17 00:00:00 2001
From: "Jeff Washington (jwash)"
Date: Mon, 31 Jul 2023 13:13:19 -0700
Subject: [PATCH] avoid giant memory allocation in hash calc (#32646)

* avoid giant memory allocation in hash calc

* update comment

* reorder to avoid clone

* simplify references

* update comment on get_item
---
 runtime/src/accounts_db.rs   |   9 +-
 runtime/src/accounts_hash.rs | 451 ++++++++++++++++++++---------------
 runtime/src/pubkey_bins.rs   |   9 +-
 3 files changed, 268 insertions(+), 201 deletions(-)

diff --git a/runtime/src/accounts_db.rs b/runtime/src/accounts_db.rs
index 11512958b0..ac62760c34 100644
--- a/runtime/src/accounts_db.rs
+++ b/runtime/src/accounts_db.rs
@@ -7654,16 +7654,9 @@ impl AccountsDb {
             .map(|d| d.get_cache_hash_data())
             .collect::<Vec<_>>();
 
-        // rework slices of data into bins for parallel processing and to match data shape expected by 'rest_of_hash_calculation'
-        let result = AccountsHasher::get_binned_data(
-            &cache_hash_intermediates,
-            PUBKEY_BINS_FOR_CALCULATING_HASHES,
-            &bounds,
-        );
-
         // turn raw data into merkle tree hashes and sum of lamports
         let (accounts_hash, capitalization) =
-            accounts_hasher.rest_of_hash_calculation(result, &mut stats);
+            accounts_hasher.rest_of_hash_calculation(&cache_hash_intermediates, &mut stats);
         let accounts_hash = match flavor {
             CalcAccountsHashFlavor::Full => AccountsHashEnum::Full(AccountsHash(accounts_hash)),
             CalcAccountsHashFlavor::Incremental => {
diff --git a/runtime/src/accounts_hash.rs b/runtime/src/accounts_hash.rs
index 8aeea1600e..9cf43f3a1f 100644
--- a/runtime/src/accounts_hash.rs
+++ b/runtime/src/accounts_hash.rs
@@ -2,13 +2,13 @@ use {
     crate::{
         accounts_db::{AccountStorageEntry, IncludeSlotInHash, PUBKEY_BINS_FOR_CALCULATING_HASHES},
         ancestors::Ancestors,
+        pubkey_bins::PubkeyBinCalculator24,
         rent_collector::RentCollector,
     },
-    core::ops::Range,
     log::*,
     memmap2::MmapMut,
     rayon::prelude::*,
-    solana_measure::measure::Measure,
+    solana_measure::{measure::Measure, measure_us},
     solana_sdk::{
         hash::{Hash, Hasher},
         pubkey::Pubkey,
@@ -30,9 +30,6 @@ use {
 };
 
 pub const MERKLE_FANOUT: usize = 16;
-
-/// the data passed through the processing functions
-pub type SortedDataByPubkey<'a> = Vec<&'a [CalculateHashIntermediate]>;
 
 /// 1 file containing account hashes sorted by pubkey, mapped into memory
 struct MmapAccountHashesFile {
     mmap: MmapMut,
 }
@@ -163,6 +160,7 @@ pub struct HashStats {
     pub longest_ancient_scan_us: AtomicU64,
     pub sum_ancient_scans_us: AtomicU64,
     pub count_ancient_scans: AtomicU64,
+    pub pubkey_bin_search_us: AtomicU64,
 }
 impl HashStats {
     pub fn calc_storage_size_quartiles(&mut self, storages: &[Arc<AccountStorageEntry>]) {
@@ -262,6 +260,11 @@ impl HashStats {
                     .load(Ordering::Relaxed),
                 i64
             ),
+            (
+                "pubkey_bin_search_us",
+                self.pubkey_bin_search_us.load(Ordering::Relaxed),
+                i64
+            ),
         );
     }
 }
@@ -767,61 +770,13 @@ impl AccountsHasher {
         })
     }
 
-    /// return references to cache hash data, grouped by bin, sourced from 'sorted_data_by_pubkey',
-    /// which is probably a mmapped file.
-    pub(crate) fn get_binned_data<'a>(
-        sorted_data_by_pubkey: &'a Vec<&'a [CalculateHashIntermediate]>,
-        bins: usize,
-        bin_range: &Range<usize>,
-    ) -> Vec<SortedDataByPubkey<'a>> {
-        // get slices per bin from each slice
-        use crate::pubkey_bins::PubkeyBinCalculator24;
-        let binner = PubkeyBinCalculator24::new(bins);
-        sorted_data_by_pubkey
-            .par_iter()
-            .map(|all_bins| {
-                let mut last_start_index = 0;
-                let mut result = Vec::with_capacity(bin_range.len());
-                let mut current_bin = bin_range.start;
-                let max_inclusive = all_bins.len();
-                for i in 0..=max_inclusive {
-                    let this_bin = if i != max_inclusive {
-                        let entry = &all_bins[i];
-                        let this_bin = binner.bin_from_pubkey(&entry.pubkey);
-                        if this_bin == current_bin {
-                            // this pk is in the same bin as we're currently investigating, so keep iterating
-                            continue;
-                        }
-                        this_bin
-                    } else {
-                        // we exhausted the source data, so 'this bin' is now the end (exclusive) bin
-                        // this case exists to handle the +1 case
-                        bin_range.end
-                    };
-                    // we found the first pubkey in the bin after the bin we were investigating
-                    // or we passed the end of the input list.
-                    // So, the bin we were investigating is now complete.
-                    result.push(&all_bins[last_start_index..i]);
-                    last_start_index = i;
-                    ((current_bin + 1)..this_bin).for_each(|_| {
-                        // the source data could contain a pubey from bin 1, then bin 5, skipping the bins in between.
-                        // In that case, fill in 2..5 with empty
-                        result.push(&all_bins[0..0]); // empty slice
-                    });
-                    current_bin = this_bin;
-                }
-                result
-            })
-            .collect::<Vec<_>>()
-    }
-
     /// returns:
     /// Vec, with one entry per bin
     /// for each entry, Vec<Hash> in pubkey order
     /// If return Vec was flattened, it would be all hashes, in pubkey order.
-    fn de_dup_accounts<'a>(
+    fn de_dup_accounts(
         &self,
-        sorted_data_by_pubkey: &'a [SortedDataByPubkey<'a>],
+        sorted_data_by_pubkey: &[&[CalculateHashIntermediate]],
         stats: &mut HashStats,
         max_bin: usize,
     ) -> (Vec<AccountHashesFile>, u64) {
@@ -836,17 +791,14 @@ impl AccountsHasher {
         let hashes: Vec<_> = (0..max_bin)
             .into_par_iter()
             .map(|bin| {
-                let (hashes_file, lamports_bin, unreduced_entries_count) =
-                    self.de_dup_accounts_in_parallel(sorted_data_by_pubkey, bin);
+                let (hashes_file, lamports_bin) =
+                    self.de_dup_accounts_in_parallel(sorted_data_by_pubkey, bin, max_bin, stats);
                 {
                     let mut lock = min_max_sum_entries_hashes.lock().unwrap();
-                    let (mut min, mut max, mut lamports_sum, mut entries, mut hash_total) = *lock;
-                    min = std::cmp::min(min, unreduced_entries_count);
-                    max = std::cmp::max(max, unreduced_entries_count);
+                    let (min, max, mut lamports_sum, entries, mut hash_total) = *lock;
                     lamports_sum = Self::checked_cast_for_capitalization(
                         lamports_sum as u128 + lamports_bin as u128,
                     );
-                    entries += unreduced_entries_count;
                     hash_total += hashes_file.count();
                     *lock = (min, max, lamports_sum, entries, hash_total);
                 }
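
This hunk is the heart of the PR: `de_dup_accounts_in_parallel` now receives the raw per-storage slices plus the bin count and locates each bin's sub-slice on demand, instead of consuming the pre-binned `Vec<SortedDataByPubkey>` that the deleted `get_binned_data` materialized up front (on the order of `storages x bins` slice headers, the "giant memory allocation" of the title). A minimal, self-contained sketch of the on-demand idea, with a hypothetical `bin_of` over single-byte keys standing in for `PubkeyBinCalculator24::bin_from_pubkey`, and `partition_point` standing in for the patch's custom binary search:

```rust
/// Stand-in for bin_from_pubkey: with 16 bins, the high nibble of a
/// 1-byte "pubkey" selects the bin.
fn bin_of(key: u8) -> usize {
    (key >> 4) as usize
}

/// Find the half-open range of `sorted` that falls in `bin`, lazily.
/// No per-storage, per-bin slice headers are ever materialized.
fn bin_range(sorted: &[u8], bin: usize) -> std::ops::Range<usize> {
    let start = sorted.partition_point(|&k| bin_of(k) < bin);
    let end = sorted.partition_point(|&k| bin_of(k) <= bin);
    start..end
}

fn main() {
    let storage = [0x01, 0x0f, 0x10, 0x12, 0x35, 0x36, 0xff];
    assert_eq!(bin_range(&storage, 0), 0..2); // keys 0x01, 0x0f
    assert_eq!(bin_range(&storage, 1), 2..4); // keys 0x10, 0x12
    assert_eq!(bin_range(&storage, 2), 4..4); // empty bin
    assert_eq!(bin_range(&storage, 3), 4..6); // keys 0x35, 0x36
}
```
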
@@ -864,42 +816,135 @@ impl AccountsHasher {
         (hashes, lamports_sum)
     }
 
-    // returns true if this vector was exhausted
-    fn get_item<'a, 'b>(
+    /// returns the item referenced by `min_index`
+    /// updates `indexes` to skip over the pubkey and its duplicates
+    /// updates `first_items` to point to the next pubkey
+    /// or removes the entire pubkey division entries (for `min_index`) if the referenced pubkey is the last entry in the same `bin`
+    /// removed from: `first_items`, `indexes`, and `first_item_pubkey_division`
+    fn get_item<'a>(
         min_index: usize,
         bin: usize,
-        first_items: &'a mut Vec<Pubkey>,
-        pubkey_division: &'b [SortedDataByPubkey<'b>],
-        indexes: &'a mut [usize],
-        first_item_to_pubkey_division: &'a mut Vec<usize>,
-    ) -> &'b CalculateHashIntermediate {
+        first_items: &mut Vec<Pubkey>,
+        sorted_data_by_pubkey: &[&'a [CalculateHashIntermediate]],
+        indexes: &mut Vec<usize>,
+        first_item_to_pubkey_division: &mut Vec<usize>,
+        binner: &PubkeyBinCalculator24,
+    ) -> &'a CalculateHashIntermediate {
         let first_item = first_items[min_index];
         let key = &first_item;
         let division_index = first_item_to_pubkey_division[min_index];
-        let bin = &pubkey_division[division_index][bin];
-        let mut index = indexes[division_index];
+        let division_data = &sorted_data_by_pubkey[division_index];
+        let mut index = indexes[min_index];
         index += 1;
-        while index < bin.len() {
+        let mut end;
+        loop {
+            end = index >= division_data.len();
+            if end {
+                break;
+            }
             // still more items where we found the previous key, so just increment the index for that slot group, skipping all pubkeys that are equal
-            if &bin[index].pubkey == key {
+            let next_key = &division_data[index].pubkey;
+            if next_key == key {
                 index += 1;
                 continue; // duplicate entries of same pubkey, so keep skipping
             }
+            if binner.bin_from_pubkey(next_key) > bin {
+                // the next pubkey is not in our bin
+                end = true;
+                break;
+            }
+
             // point to the next pubkey > key
-            first_items[min_index] = bin[index].pubkey;
-            indexes[division_index] = index;
+            first_items[min_index] = *next_key;
+            indexes[min_index] = index;
             break;
         }
-
-        if index >= bin.len() {
+        if end {
             // stop looking in this vector - we exhausted it
             first_items.remove(min_index);
             first_item_to_pubkey_division.remove(min_index);
+            indexes.remove(min_index);
         }
 
         // this is the previous first item that was requested
-        &bin[index - 1]
+        &division_data[index - 1]
     }
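
`get_item` is one step of a k-way merge: each storage contributes a cursor (`first_items`/`indexes`), the caller repeatedly consumes the minimum pubkey, and duplicates of that pubkey are skipped in the producing storage. A simplified standalone sketch of the same merge-and-dedup pattern, under assumptions not taken from the patch: hypothetical `(pubkey, slot, lamports)` tuples with u8 keys, storages ordered oldest to newest so the newest version wins on a tie, and zero-lamport accounts dropped, mirroring `de_dup_accounts_in_parallel`:

```rust
fn merge_dedup(storages: &[&[(u8, u64, u64)]]) -> Vec<(u8, u64)> {
    let mut cursors = vec![0usize; storages.len()];
    let mut out = Vec::new();
    loop {
        // find the smallest current pubkey; on ties prefer the newest storage
        let mut min: Option<(u8, usize)> = None;
        for (i, s) in storages.iter().enumerate() {
            if let Some(&(key, _, _)) = s.get(cursors[i]) {
                match min {
                    Some((mk, _)) if mk < key => {}
                    _ => min = Some((key, i)), // "<=" keeps the later storage
                }
            }
        }
        let Some((key, winner)) = min else { break };
        let (_, _slot, lamports) = storages[winner][cursors[winner]];
        if lamports > 0 {
            out.push((key, lamports)); // zero-lamport accounts are dropped
        }
        // advance every cursor past this pubkey (skip duplicates everywhere),
        // which is what get_item does one storage at a time
        for (i, s) in storages.iter().enumerate() {
            while s.get(cursors[i]).is_some_and(|&(k, _, _)| k == key) {
                cursors[i] += 1;
            }
        }
    }
    out
}

fn main() {
    let old = [(1u8, 10u64, 5u64), (2, 10, 0)];
    let new = [(2u8, 11u64, 7u64), (3, 11, 9)];
    // pubkey 2 appears in both storages: the slot-11 version wins and the
    // older zero-lamport duplicate is skipped entirely
    assert_eq!(merge_dedup(&[&old, &new]), vec![(1, 5), (2, 7), (3, 9)]);
}
```
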
+
+    /// `hash_data` must be sorted by `binner.bin_from_pubkey()`
+    /// return index in `hash_data` of first pubkey that is in `bin`, based on `binner`
+    fn binary_search_for_first_pubkey_in_bin(
+        hash_data: &[CalculateHashIntermediate],
+        bin: usize,
+        binner: &PubkeyBinCalculator24,
+    ) -> Option<usize> {
+        let potential_index = if bin == 0 {
+            // `bin` == 0 is special because there cannot be `bin`-1
+            // so either element[0] is in bin 0 or there is nothing in bin 0.
+            0
+        } else {
+            // search for the first pubkey that is in `bin`
+            // There could be many keys in a row with the same `bin`.
+            // So, for each pubkey, use calculated_bin * 2 + 1 as the bin of a given pubkey for binary search.
+            // And compare the bin of each pubkey with `bin` * 2.
+            // So all keys that are in `bin` will compare as `bin` * 2 + 1
+            // all keys that are in `bin`-1 will compare as ((`bin` - 1) * 2 + 1), which is (`bin` * 2 - 1)
+            // NO keys will compare as `bin` * 2 because we add 1.
+            // So, the binary search will NEVER return Ok(found_index), but will always return Err(index of first key in `bin`).
+            // Note that if NO key is in `bin`, then the key at the found index will be in a bin > `bin`, so return None.
+            let just_prior_to_desired_bin = bin * 2;
+            let search = hash_data.binary_search_by(|data| {
+                (1 + 2 * binner.bin_from_pubkey(&data.pubkey)).cmp(&just_prior_to_desired_bin)
+            });
+            // returns Err(index where item should be) since the desired item will never exist
+            search.expect_err("it is impossible to find a matching bin")
+        };
+        // note that `potential_index` could be == hash_data.len(). This indicates the first key in `bin` would be
+        // after the data we have. Thus, no key is in `bin`.
+        // This also handles the case where `hash_data` is empty, since len() will be 0 and `get` will return None.
+        hash_data.get(potential_index).and_then(|potential_data| {
+            (binner.bin_from_pubkey(&potential_data.pubkey) == bin).then_some(potential_index)
+        })
+    }
+
+    /// `hash_data` must be sorted by `binner.bin_from_pubkey()`
+    /// return index in `hash_data` of first pubkey that is in `bin`, based on `binner`
+    fn find_first_pubkey_in_bin(
+        hash_data: &[CalculateHashIntermediate],
+        bin: usize,
+        bins: usize,
+        binner: &PubkeyBinCalculator24,
+        stats: &HashStats,
+    ) -> Option<usize> {
+        if hash_data.is_empty() {
+            return None;
+        }
+        let (result, us) = measure_us!({
+            // assume uniform distribution of pubkeys and choose first guess based on bin we're looking for
+            let i = hash_data.len() * bin / bins;
+            let estimate = &hash_data[i];
+
+            let pubkey_bin = binner.bin_from_pubkey(&estimate.pubkey);
+            let range = if pubkey_bin >= bin {
+                // i pubkey matches or is too large, so look <= i for the first pubkey in the right bin
+                // i+1 could be the first pubkey in the right bin
+                0..(i + 1)
+            } else {
+                // i pubkey is too small, so look after i
+                (i + 1)..hash_data.len()
+            };
+            Some(
+                range.start +
+                    // binary search the subset
+                    Self::binary_search_for_first_pubkey_in_bin(
+                        &hash_data[range],
+                        bin,
+                        binner,
+                    )?,
+            )
+        });
+        stats.pubkey_bin_search_us.fetch_add(us, Ordering::Relaxed);
+        result
+    }
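
The trick in `binary_search_for_first_pubkey_in_bin` is worth a standalone illustration: every element compares as the odd value `2*bin + 1` while the probe is the even value `2*target`, so `binary_search_by` can never return `Ok`, and the `Err` index is exactly where the first key of `target` (or of any later bin) sits. A runnable sketch with hypothetical u8 keys and 16 bins, not the patch's types:

```rust
fn bin_of(key: u8) -> usize {
    (key >> 4) as usize // high nibble picks one of 16 bins
}

fn first_index_in_bin(sorted: &[u8], target: usize) -> Option<usize> {
    let idx = if target == 0 {
        0 // bin 0 can only start at element 0
    } else {
        sorted
            // elements map to odd values, the probe is even: never Equal
            .binary_search_by(|&k| (2 * bin_of(k) + 1).cmp(&(2 * target)))
            .expect_err("odd values never equal an even probe")
    };
    // idx may be sorted.len() (no key in or after `target`), and the key at
    // idx may belong to a later bin; both mean `target` is empty
    sorted
        .get(idx)
        .and_then(|&k| (bin_of(k) == target).then_some(idx))
}

fn main() {
    let sorted = [0x01, 0x0f, 0x10, 0x12, 0x35];
    assert_eq!(first_index_in_bin(&sorted, 0), Some(0));
    assert_eq!(first_index_in_bin(&sorted, 1), Some(2));
    assert_eq!(first_index_in_bin(&sorted, 2), None); // bin 2 is empty
    assert_eq!(first_index_in_bin(&sorted, 3), Some(4));
}
```
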
 
     // go through: [..][pubkey_bin][..] and return hashes and lamport sum
     //   1. eliminate zero lamport accounts
     //   2. pick the highest slot or (slot = and highest version) of each pubkey
     //   3. produce this output:
     //      a. AccountHashesFile: individual account hashes in pubkey order
     //      b. lamport sum
-    //      c. unreduced count (ie. including duplicates and zero lamport)
-    fn de_dup_accounts_in_parallel<'a>(
+    fn de_dup_accounts_in_parallel(
         &self,
-        pubkey_division: &'a [SortedDataByPubkey<'a>],
+        sorted_data_by_pubkey: &[&[CalculateHashIntermediate]],
         pubkey_bin: usize,
-    ) -> (AccountHashesFile, u64, usize) {
-        let len = pubkey_division.len();
-        let mut unreduced_count = 0;
-        let mut indexes = vec![0; len];
+        bins: usize,
+        stats: &HashStats,
+    ) -> (AccountHashesFile, u64) {
+        let binner = PubkeyBinCalculator24::new(bins);
+
+        let len = sorted_data_by_pubkey.len();
+        let mut indexes = Vec::with_capacity(len);
         let mut first_items = Vec::with_capacity(len);
-        // map from index of an item in first_items[] to index of the corresponding item in pubkey_division[]
-        // this will change as items in pubkey_division[] are exhausted
+        // map from index of an item in first_items[] to index of the corresponding item in sorted_data_by_pubkey[]
+        // this will change as items in sorted_data_by_pubkey[] are exhausted
         let mut first_item_to_pubkey_division = Vec::with_capacity(len);
         let mut hashes = AccountHashesFile {
             count_and_writer: None,
             dir_for_temp_cache_files: self.dir_for_temp_cache_files.clone(),
         };
         // initialize 'first_items', which holds the current lowest item in each slot group
-        pubkey_division.iter().enumerate().for_each(|(i, bins)| {
-            // check to make sure we can do bins[pubkey_bin]
-            if bins.len() > pubkey_bin {
-                let sub = bins[pubkey_bin];
-                if !sub.is_empty() {
-                    unreduced_count += bins[pubkey_bin].len(); // sum for metrics
-                    first_items.push(bins[pubkey_bin][0].pubkey);
+        sorted_data_by_pubkey
+            .iter()
+            .enumerate()
+            .for_each(|(i, hash_data)| {
+                let first_pubkey_in_bin =
+                    Self::find_first_pubkey_in_bin(hash_data, pubkey_bin, bins, &binner, stats);
+                if let Some(first_pubkey_in_bin) = first_pubkey_in_bin {
+                    let k = hash_data[first_pubkey_in_bin].pubkey;
+                    first_items.push(k);
                     first_item_to_pubkey_division.push(i);
+                    indexes.push(first_pubkey_in_bin);
                 }
-            }
-        });
+            });
         let mut overall_sum = 0;
         let mut duplicate_pubkey_indexes = Vec::with_capacity(len);
         let filler_accounts_enabled = self.filler_accounts_enabled();
@@ -976,9 +1025,10 @@ impl AccountsHasher {
                     min_index,
                     pubkey_bin,
                     &mut first_items,
-                    pubkey_division,
+                    sorted_data_by_pubkey,
                     &mut indexes,
                     &mut first_item_to_pubkey_division,
+                    &binner,
                 );
 
                 // add lamports and get hash
@@ -1010,15 +1060,17 @@ impl AccountsHasher {
                             *i,
                             pubkey_bin,
                             &mut first_items,
-                            pubkey_division,
+                            sorted_data_by_pubkey,
                             &mut indexes,
                             &mut first_item_to_pubkey_division,
+                            &binner,
                         );
                     });
                     duplicate_pubkey_indexes.clear();
                 }
             }
-        (hashes, overall_sum, unreduced_count)
+
+        (hashes, overall_sum)
     }
 
     fn is_filler_account(&self, pubkey: &Pubkey) -> bool {
@@ -1030,15 +1082,14 @@ impl AccountsHasher {
 
     // input:
     // vec: group of slot data, ordered by Slot (low to high)
-    //   vec: [0..bins] - where bins are pubkey ranges (these are ordered by Pubkey range)
-    //     vec: [..] - items which fit in the containing bin. Sorted by: Pubkey, higher Slot, higher Write version (if pubkey =)
+    //   vec: [..] - items which fit in the containing bin. Sorted by: Pubkey, higher Slot, higher Write version (if pubkey =)
     pub fn rest_of_hash_calculation(
         &self,
-        data_sections_by_pubkey: Vec<SortedDataByPubkey>,
+        sorted_data_by_pubkey: &[&[CalculateHashIntermediate]],
         stats: &mut HashStats,
     ) -> (Hash, u64) {
         let (hashes, total_lamports) = self.de_dup_accounts(
-            &data_sections_by_pubkey,
+            sorted_data_by_pubkey,
             stats,
             PUBKEY_BINS_FOR_CALCULATING_HASHES,
         );
@@ -1105,7 +1156,7 @@ pub struct AccountsDeltaHash(pub Hash);
 
 #[cfg(test)]
 pub mod tests {
-    use {super::*, std::str::FromStr, tempfile::tempdir};
+    use {super::*, itertools::Itertools, std::str::FromStr, tempfile::tempdir};
 
     impl AccountsHasher {
         fn new(dir_for_temp_cache_files: PathBuf) -> Self {
@@ -1126,6 +1177,59 @@ pub mod tests {
         }
     }
 
+    #[test]
+    fn test_find_first_pubkey_in_bin() {
+        let stats = HashStats::default();
+        for (bins, expected_count) in [1, 2, 4].into_iter().zip([5, 20, 120]) {
+            let bins: usize = bins;
+            let binner = PubkeyBinCalculator24::new(bins);
+
+            let mut count = 0usize;
+            // # pubkeys in each bin are permutations of these
+            // 0 means none in this bin
+            // large number (20) means the found key will be well before or after the expected index based on an assumption of uniform distribution
+            for counts in [0, 1, 2, 20, 0].into_iter().permutations(bins) {
+                count += 1;
+                let hash_data = counts
+                    .iter()
+                    .enumerate()
+                    .flat_map(|(bin, count)| {
+                        (0..*count).map(move |_| {
+                            let binner = PubkeyBinCalculator24::new(bins);
+                            CalculateHashIntermediate::new(
+                                Hash::default(),
+                                0,
+                                binner.lowest_pubkey_from_bin(bin, bins),
+                            )
+                        })
+                    })
+                    .collect::<Vec<_>>();
+                // look for the first pubkey in each bin
+                for (bin, count_in_bin) in counts.iter().enumerate().take(bins) {
+                    let first = AccountsHasher::find_first_pubkey_in_bin(
+                        &hash_data, bin, bins, &binner, &stats,
+                    );
+                    // test both functions
+                    let first_again = AccountsHasher::binary_search_for_first_pubkey_in_bin(
+                        &hash_data, bin, &binner,
+                    );
+                    assert_eq!(first, first_again);
+                    assert_eq!(first.is_none(), count_in_bin == &0);
+                    if let Some(first) = first {
+                        assert_eq!(binner.bin_from_pubkey(&hash_data[first].pubkey), bin);
+                        if first > 0 {
+                            assert!(binner.bin_from_pubkey(&hash_data[first - 1].pubkey) < bin);
+                        }
+                    }
+                }
+            }
+            assert_eq!(
+                count, expected_count,
bins: {bins}" + ); + } + } + #[test] fn test_account_hashes_file() { let dir_for_temp_cache_files = tempdir().unwrap(); @@ -1234,8 +1338,8 @@ pub mod tests { assert_eq!(AccountsHasher::div_ceil(10, 0), 0); } - fn for_rest(original: &[CalculateHashIntermediate]) -> Vec> { - vec![vec![original]] + fn for_rest(original: &[CalculateHashIntermediate]) -> Vec<&[CalculateHashIntermediate]> { + vec![original] } #[test] @@ -1258,7 +1362,7 @@ pub mod tests { let dir_for_temp_cache_files = tempdir().unwrap(); let accounts_hash = AccountsHasher::new(dir_for_temp_cache_files.path().to_path_buf()); let result = accounts_hash - .rest_of_hash_calculation(for_rest(&account_maps), &mut HashStats::default()); + .rest_of_hash_calculation(&for_rest(&account_maps), &mut HashStats::default()); let expected_hash = Hash::from_str("8j9ARGFv4W2GfML7d3sVJK2MePwrikqYnu6yqer28cCa").unwrap(); assert_eq!((result.0, result.1), (expected_hash, 88)); @@ -1269,7 +1373,7 @@ pub mod tests { account_maps.insert(0, val); let result = accounts_hash - .rest_of_hash_calculation(for_rest(&account_maps), &mut HashStats::default()); + .rest_of_hash_calculation(&for_rest(&account_maps), &mut HashStats::default()); let expected_hash = Hash::from_str("EHv9C5vX7xQjjMpsJMzudnDTzoTSRwYkqLzY8tVMihGj").unwrap(); assert_eq!((result.0, result.1), (expected_hash, 108)); @@ -1280,7 +1384,7 @@ pub mod tests { account_maps.insert(1, val); let result = accounts_hash - .rest_of_hash_calculation(for_rest(&account_maps), &mut HashStats::default()); + .rest_of_hash_calculation(&for_rest(&account_maps), &mut HashStats::default()); let expected_hash = Hash::from_str("7NNPg5A8Xsg1uv4UFm6KZNwsipyyUnmgCrznP6MBWoBZ").unwrap(); assert_eq!((result.0, result.1), (expected_hash, 118)); } @@ -1295,15 +1399,16 @@ pub mod tests { #[test] fn test_accountsdb_de_dup_accounts_zero_chunks() { - let vec = [vec![vec![CalculateHashIntermediate { + let vec = vec![vec![CalculateHashIntermediate { lamports: 1, ..CalculateHashIntermediate::default() - }]]]; + }]]; let temp_vec = vec.to_vec(); - let slice = convert_to_slice2(&temp_vec); + let slice = convert_to_slice(&temp_vec); let dir_for_temp_cache_files = tempdir().unwrap(); let accounts_hasher = AccountsHasher::new(dir_for_temp_cache_files.path().to_path_buf()); - let (mut hashes, lamports, _) = accounts_hasher.de_dup_accounts_in_parallel(&slice, 0); + let (mut hashes, lamports) = + accounts_hasher.de_dup_accounts_in_parallel(&slice, 0, 1, &HashStats::default()); assert_eq!(&[Hash::default()], hashes.get_reader().unwrap().1.read(0)); assert_eq!(lamports, 1); } @@ -1324,9 +1429,10 @@ pub mod tests { let dir_for_temp_cache_files = tempdir().unwrap(); let accounts_hash = AccountsHasher::new(dir_for_temp_cache_files.path().to_path_buf()); - let vec = vec![vec![], vec![]]; + let empty = []; + let vec = ∅ let (hashes, lamports) = - accounts_hash.de_dup_accounts(&vec, &mut HashStats::default(), one_range()); + accounts_hash.de_dup_accounts(vec, &mut HashStats::default(), one_range()); assert_eq!( vec![Hash::default(); 0], get_vec_vec(hashes) @@ -1342,11 +1448,13 @@ pub mod tests { assert_eq!(empty, get_vec_vec(hashes)); assert_eq!(lamports, 0); - let (hashes, lamports, _) = accounts_hash.de_dup_accounts_in_parallel(&[], 1); + let (hashes, lamports) = + accounts_hash.de_dup_accounts_in_parallel(&[], 1, 1, &HashStats::default()); assert_eq!(vec![Hash::default(); 0], get_vec(hashes)); assert_eq!(lamports, 0); - let (hashes, lamports, _) = accounts_hash.de_dup_accounts_in_parallel(&[], 2); + let (hashes, lamports) = + 
accounts_hash.de_dup_accounts_in_parallel(&[], 2, 1, &HashStats::default()); assert_eq!(vec![Hash::default(); 0], get_vec(hashes)); assert_eq!(lamports, 0); } @@ -1427,24 +1535,31 @@ pub mod tests { let accounts = accounts.clone(); let slice = &accounts[start..end]; - let slice2 = vec![vec![slice.to_vec()]]; + let slice2 = vec![slice.to_vec()]; let slice = &slice2[..]; - let slice_temp = convert_to_slice2(&slice2); - let (hashes2, lamports2, _) = hash.de_dup_accounts_in_parallel(&slice_temp, 0); - let slice3 = convert_to_slice2(&slice2); - let (hashes3, lamports3, _) = hash.de_dup_accounts_in_parallel(&slice3, 0); + let slice_temp = convert_to_slice(&slice2); + let (hashes2, lamports2) = + hash.de_dup_accounts_in_parallel(&slice_temp, 0, 1, &HashStats::default()); + let slice3 = convert_to_slice(&slice2); + let (hashes3, lamports3) = + hash.de_dup_accounts_in_parallel(&slice3, 0, 1, &HashStats::default()); let vec = slice.to_vec(); - let slice4 = convert_to_slice2(&vec); + let slice4 = convert_to_slice(&vec); + let mut max_bin = end - start; + if !max_bin.is_power_of_two() { + max_bin = 1; + } + let (hashes4, lamports4) = - hash.de_dup_accounts(&slice4, &mut HashStats::default(), end - start); + hash.de_dup_accounts(&slice4, &mut HashStats::default(), max_bin); let vec = slice.to_vec(); - let slice5 = convert_to_slice2(&vec); + let slice5 = convert_to_slice(&vec); let (hashes5, lamports5) = - hash.de_dup_accounts(&slice5, &mut HashStats::default(), end - start); + hash.de_dup_accounts(&slice5, &mut HashStats::default(), max_bin); let vec = slice.to_vec(); - let slice5 = convert_to_slice2(&vec); + let slice5 = convert_to_slice(&vec); let (hashes6, lamports6) = - hash.de_dup_accounts(&slice5, &mut HashStats::default(), end - start); + hash.de_dup_accounts(&slice5, &mut HashStats::default(), max_bin); let hashes2 = get_vec(hashes2); let hashes3 = get_vec(hashes3); @@ -1473,7 +1588,7 @@ pub mod tests { assert_eq!(lamports2, lamports5); assert_eq!(lamports2, lamports6); - let human_readable = slice[0][0] + let human_readable = slice[0] .iter() .map(|v| { let mut s = (if v.pubkey == key_a { @@ -1549,11 +1664,11 @@ pub mod tests { } fn test_de_dup_accounts_in_parallel<'a>( - account_maps: &'a [SortedDataByPubkey<'a>], - ) -> (AccountHashesFile, u64, usize) { + account_maps: &'a [&'a [CalculateHashIntermediate]], + ) -> (AccountHashesFile, u64) { let dir_for_temp_cache_files = tempdir().unwrap(); let accounts_hasher = AccountsHasher::new(dir_for_temp_cache_files.path().to_path_buf()); - accounts_hasher.de_dup_accounts_in_parallel(account_maps, 0) + accounts_hasher.de_dup_accounts_in_parallel(account_maps, 0, 1, &HashStats::default()) } #[test] @@ -1566,22 +1681,22 @@ pub mod tests { let val = CalculateHashIntermediate::new(hash, 1, key); account_maps.push(val.clone()); - let vecs = vec![vec![account_maps.to_vec()]]; - let slice = convert_to_slice2(&vecs); - let (hashfile, lamports, count) = test_de_dup_accounts_in_parallel(&slice); + let vecs = vec![account_maps.to_vec()]; + let slice = convert_to_slice(&vecs); + let (hashfile, lamports) = test_de_dup_accounts_in_parallel(&slice); assert_eq!( - (get_vec(hashfile), lamports, count), - (vec![val.hash], val.lamports, 1) + (get_vec(hashfile), lamports), + (vec![val.hash], val.lamports) ); // zero original lamports, higher version let val = CalculateHashIntermediate::new(hash, 0, key); account_maps.push(val); // has to be after previous entry since account_maps are in slot order - let vecs = vec![vec![account_maps.to_vec()]]; - let slice = 
convert_to_slice2(&vecs);
-        let (hashfile, lamports, count) = test_de_dup_accounts_in_parallel(&slice);
-        assert_eq!((get_vec(hashfile), lamports, count), (vec![], 0, 2));
+        let vecs = vec![account_maps.to_vec()];
+        let slice = convert_to_slice(&vecs);
+        let (hashfile, lamports) = test_de_dup_accounts_in_parallel(&slice);
+        assert_eq!((get_vec(hashfile), lamports), (vec![], 0));
     }
 
     #[test]
@@ -1931,7 +2046,12 @@ pub mod tests {
         ];
         let dir_for_temp_cache_files = tempdir().unwrap();
         let accounts_hasher = AccountsHasher::new(dir_for_temp_cache_files.path().to_path_buf());
-        accounts_hasher.de_dup_accounts_in_parallel(&[convert_to_slice(&[input])], 0);
+        accounts_hasher.de_dup_accounts_in_parallel(
+            &convert_to_slice(&[input]),
+            0,
+            1,
+            &HashStats::default(),
+        );
     }
 
     fn convert_to_slice(
@@ -1940,15 +2060,6 @@ pub mod tests {
         input.iter().map(|v| &v[..]).collect::<Vec<_>>()
     }
 
-    fn convert_to_slice2(
-        input: &[Vec<Vec<CalculateHashIntermediate>>],
-    ) -> Vec<Vec<&[CalculateHashIntermediate]>> {
-        input
-            .iter()
-            .map(|v| v.iter().map(|v| &v[..]).collect::<Vec<_>>())
-            .collect::<Vec<_>>()
-    }
-
     #[test]
     #[should_panic(expected = "overflow is detected while summing capitalization")]
     fn test_accountsdb_lamport_overflow2() {
@@ -1970,47 +2081,9 @@ pub mod tests {
         let dir_for_temp_cache_files = tempdir().unwrap();
         let accounts_hasher = AccountsHasher::new(dir_for_temp_cache_files.path().to_path_buf());
         accounts_hasher.de_dup_accounts(
-            &[convert_to_slice(&input)],
+            &convert_to_slice(&input),
             &mut HashStats::default(),
             2, // accounts above are in 2 groups
         );
     }
-
-    #[test]
-    fn test_get_binned_data() {
-        let data = [CalculateHashIntermediate::new(
-            Hash::default(),
-            1,
-            Pubkey::from([1u8; 32]),
-        )];
-        let data2 = vec![&data[..]];
-        let bins = 1;
-        let result = AccountsHasher::get_binned_data(&data2, bins, &(0..bins));
-        assert_eq!(result, vec![vec![&data[..]]]);
-        let bins = 2;
-        let result = AccountsHasher::get_binned_data(&data2, bins, &(0..bins));
-        assert_eq!(result, vec![vec![&data[..], &data[0..0]]]);
-        let data = [CalculateHashIntermediate::new(
-            Hash::default(),
-            1,
-            Pubkey::from([255u8; 32]),
-        )];
-        let data2 = vec![&data[..]];
-        let result = AccountsHasher::get_binned_data(&data2, bins, &(0..bins));
-        assert_eq!(result, vec![vec![&data[0..0], &data[..]]]);
-        let data = [
-            CalculateHashIntermediate::new(Hash::default(), 1, Pubkey::from([254u8; 32])),
-            CalculateHashIntermediate::new(Hash::default(), 1, Pubkey::from([255u8; 32])),
-        ];
-        let data2 = vec![&data[..]];
-        let result = AccountsHasher::get_binned_data(&data2, bins, &(0..bins));
-        assert_eq!(result, vec![vec![&data[0..0], &data[..]]]);
-        let data = [
-            CalculateHashIntermediate::new(Hash::default(), 1, Pubkey::from([1u8; 32])),
-            CalculateHashIntermediate::new(Hash::default(), 1, Pubkey::from([255u8; 32])),
-        ];
-        let data2 = vec![&data[..]];
-        let result = AccountsHasher::get_binned_data(&data2, bins, &(0..bins));
-        assert_eq!(result, vec![vec![&data[0..1], &data[1..2]]]);
-    }
 }
diff --git a/runtime/src/pubkey_bins.rs b/runtime/src/pubkey_bins.rs
index a3655a41be..ea982f047f 100644
--- a/runtime/src/pubkey_bins.rs
+++ b/runtime/src/pubkey_bins.rs
@@ -11,12 +11,12 @@ impl PubkeyBinCalculator24 {
         std::mem::size_of::<T>() * 8
     }
 
-    pub fn log_2(x: u32) -> u32 {
+    pub(crate) fn log_2(x: u32) -> u32 {
         assert!(x > 0);
         Self::num_bits::<u32>() as u32 - x.leading_zeros() - 1
     }
 
-    pub fn new(bins: usize) -> Self {
+    pub(crate) fn new(bins: usize) -> Self {
        const MAX_BITS: u32 = 24;
         assert!(bins > 0);
         let max_plus_1 = 1 << MAX_BITS;
@@ -28,13 +28,14 @@ impl PubkeyBinCalculator24 {
         }
     }
 
-    pub fn bin_from_pubkey(&self,
pubkey: &Pubkey) -> usize { + pub(crate) fn bin_from_pubkey(&self, pubkey: &Pubkey) -> usize { let as_ref = pubkey.as_ref(); ((as_ref[0] as usize * 256 + as_ref[1] as usize) * 256 + as_ref[2] as usize) >> self.shift_bits } - pub fn lowest_pubkey_from_bin(&self, mut bin: usize, bins: usize) -> Pubkey { + #[cfg(test)] + pub(crate) fn lowest_pubkey_from_bin(&self, mut bin: usize, bins: usize) -> Pubkey { assert!(bin < bins); bin <<= self.shift_bits; let mut pubkey = Pubkey::from([0; 32]);
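
For reference, the arithmetic in `bin_from_pubkey` treats the first 3 bytes of the pubkey as a 24-bit big-endian integer and shifts away the low bits so that only the high bits selecting one of `bins` buckets remain. A standalone sketch of that arithmetic, with a hypothetical `bin_from_key` over a raw `[u8; 32]` in place of `solana_sdk`'s `Pubkey`; the sketch asserts a power-of-two bin count, since a pure shift only partitions the keyspace evenly in that case:

```rust
fn bin_from_key(key: &[u8; 32], bins: usize) -> usize {
    const MAX_BITS: u32 = 24;
    assert!(bins.is_power_of_two() && bins <= 1 << MAX_BITS);
    // keep log2(bins) high bits of the first 3 bytes
    let shift_bits = MAX_BITS - bins.trailing_zeros();
    ((key[0] as usize * 256 + key[1] as usize) * 256 + key[2] as usize) >> shift_bits
}

fn main() {
    let mut key = [0u8; 32];
    key[0] = 0x80; // top bit set -> upper half of the keyspace
    assert_eq!(bin_from_key(&key, 2), 1);
    assert_eq!(bin_from_key(&key, 65536), 0x8000); // first 16 bits
    key[0] = 0x12;
    key[1] = 0x34;
    key[2] = 0x56;
    // with the full 2^24 bins, the bin is exactly the first 3 bytes
    assert_eq!(bin_from_key(&key, 1 << 24), 0x123456);
}
```
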