avoid giant memory allocation in hash calc (#32646)

* avoid giant memory allocation in hash calc

* update comment

* reorder to avoid clone

* simplify references

* update comment on get_item
Jeff Washington (jwash) 2023-07-31 13:13:19 -07:00 committed by GitHub
parent f4504d055a
commit 3dcb382731
3 changed files with 268 additions and 201 deletions

File: accounts_db.rs

@ -7654,16 +7654,9 @@ impl AccountsDb {
.map(|d| d.get_cache_hash_data())
.collect::<Vec<_>>();
// rework slices of data into bins for parallel processing and to match data shape expected by 'rest_of_hash_calculation'
let result = AccountsHasher::get_binned_data(
&cache_hash_intermediates,
PUBKEY_BINS_FOR_CALCULATING_HASHES,
&bounds,
);
// turn raw data into merkle tree hashes and sum of lamports
let (accounts_hash, capitalization) =
accounts_hasher.rest_of_hash_calculation(result, &mut stats);
accounts_hasher.rest_of_hash_calculation(&cache_hash_intermediates, &mut stats);
let accounts_hash = match flavor {
CalcAccountsHashFlavor::Full => AccountsHashEnum::Full(AccountsHash(accounts_hash)),
CalcAccountsHashFlavor::Incremental => {

File: accounts_hash.rs

@ -2,13 +2,13 @@ use {
crate::{
accounts_db::{AccountStorageEntry, IncludeSlotInHash, PUBKEY_BINS_FOR_CALCULATING_HASHES},
ancestors::Ancestors,
pubkey_bins::PubkeyBinCalculator24,
rent_collector::RentCollector,
},
core::ops::Range,
log::*,
memmap2::MmapMut,
rayon::prelude::*,
solana_measure::measure::Measure,
solana_measure::{measure::Measure, measure_us},
solana_sdk::{
hash::{Hash, Hasher},
pubkey::Pubkey,
@ -30,9 +30,6 @@ use {
};
pub const MERKLE_FANOUT: usize = 16;
/// the data passed through the processing functions
pub type SortedDataByPubkey<'a> = Vec<&'a [CalculateHashIntermediate]>;
/// 1 file containing account hashes sorted by pubkey, mapped into memory
struct MmapAccountHashesFile {
mmap: MmapMut,
@ -163,6 +160,7 @@ pub struct HashStats {
pub longest_ancient_scan_us: AtomicU64,
pub sum_ancient_scans_us: AtomicU64,
pub count_ancient_scans: AtomicU64,
pub pubkey_bin_search_us: AtomicU64,
}
impl HashStats {
pub fn calc_storage_size_quartiles(&mut self, storages: &[Arc<AccountStorageEntry>]) {
@ -262,6 +260,11 @@ impl HashStats {
.load(Ordering::Relaxed),
i64
),
(
"pubkey_bin_search_us",
self.pubkey_bin_search_us.load(Ordering::Relaxed),
i64
),
);
}
}
@ -767,61 +770,13 @@ impl AccountsHasher {
})
}
/// return references to cache hash data, grouped by bin, sourced from 'sorted_data_by_pubkey',
/// which is probably a mmapped file.
pub(crate) fn get_binned_data<'a>(
sorted_data_by_pubkey: &'a Vec<&'a [CalculateHashIntermediate]>,
bins: usize,
bin_range: &Range<usize>,
) -> Vec<Vec<&'a [CalculateHashIntermediate]>> {
// get slices per bin from each slice
use crate::pubkey_bins::PubkeyBinCalculator24;
let binner = PubkeyBinCalculator24::new(bins);
sorted_data_by_pubkey
.par_iter()
.map(|all_bins| {
let mut last_start_index = 0;
let mut result = Vec::with_capacity(bin_range.len());
let mut current_bin = bin_range.start;
let max_inclusive = all_bins.len();
for i in 0..=max_inclusive {
let this_bin = if i != max_inclusive {
let entry = &all_bins[i];
let this_bin = binner.bin_from_pubkey(&entry.pubkey);
if this_bin == current_bin {
// this pk is in the same bin as we're currently investigating, so keep iterating
continue;
}
this_bin
} else {
// we exhausted the source data, so 'this bin' is now the end (exclusive) bin
// this case exists to handle the +1 case
bin_range.end
};
// we found the first pubkey in the bin after the bin we were investigating
// or we passed the end of the input list.
// So, the bin we were investigating is now complete.
result.push(&all_bins[last_start_index..i]);
last_start_index = i;
((current_bin + 1)..this_bin).for_each(|_| {
// the source data could contain a pubkey from bin 1, then bin 5, skipping the bins in between.
// In that case, fill in 2..5 with empty
result.push(&all_bins[0..0]); // empty slice
});
current_bin = this_bin;
}
result
})
.collect::<Vec<_>>()
}
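For scale, the structure this removal gets rid of held one slice reference per (source, bin) pair: a Vec per source, each containing one &[CalculateHashIntermediate] per bin. On 64-bit targets a slice reference is a 16-byte fat pointer, so the headers alone grow as sources × bins × 16 bytes. A rough sizing sketch, with illustrative source and bin counts that are assumptions rather than figures from this commit:

    // rough sizing of the removed Vec<Vec<&[CalculateHashIntermediate]>> output
    // (source and bin counts below are illustrative assumptions)
    fn main() {
        let sources = 10_000usize; // e.g. one sorted slice per cache file
        let bins = 65_536usize;    // pubkey bins used by the hash calculation
        let fat_ptr = std::mem::size_of::<&[u8]>(); // 16 bytes on 64-bit targets
        let bytes = sources * bins * fat_ptr;
        println!("~{:.1} GiB of slice headers alone", bytes as f64 / (1u64 << 30) as f64);
    }

Under those assumptions the eager binning costs nearly 10 GiB of slice headers before any account data is touched, which is the kind of allocation the per-bin binary search introduced below sidesteps.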
/// returns:
/// Vec, with one entry per bin
/// for each entry, Vec<Hash> in pubkey order
/// If return Vec<AccountHashesFile> was flattened, it would be all hashes, in pubkey order.
fn de_dup_accounts<'a>(
fn de_dup_accounts(
&self,
sorted_data_by_pubkey: &'a [SortedDataByPubkey<'a>],
sorted_data_by_pubkey: &[&[CalculateHashIntermediate]],
stats: &mut HashStats,
max_bin: usize,
) -> (Vec<AccountHashesFile>, u64) {
@ -836,17 +791,14 @@ impl AccountsHasher {
let hashes: Vec<_> = (0..max_bin)
.into_par_iter()
.map(|bin| {
let (hashes_file, lamports_bin, unreduced_entries_count) =
self.de_dup_accounts_in_parallel(sorted_data_by_pubkey, bin);
let (hashes_file, lamports_bin) =
self.de_dup_accounts_in_parallel(sorted_data_by_pubkey, bin, max_bin, stats);
{
let mut lock = min_max_sum_entries_hashes.lock().unwrap();
let (mut min, mut max, mut lamports_sum, mut entries, mut hash_total) = *lock;
min = std::cmp::min(min, unreduced_entries_count);
max = std::cmp::max(max, unreduced_entries_count);
let (min, max, mut lamports_sum, entries, mut hash_total) = *lock;
lamports_sum = Self::checked_cast_for_capitalization(
lamports_sum as u128 + lamports_bin as u128,
);
entries += unreduced_entries_count;
hash_total += hashes_file.count();
*lock = (min, max, lamports_sum, entries, hash_total);
}
@ -864,42 +816,135 @@ impl AccountsHasher {
(hashes, lamports_sum)
}
// returns true if this vector was exhausted
fn get_item<'a, 'b>(
/// returns the item referenced by `min_index`
/// updates `indexes` to skip over the pubkey and its duplicates
/// updates `first_items` to point to the next pubkey
/// or, if the referenced pubkey was the last entry in `bin` for its division, removes that
/// division's entries from `first_items`, `indexes`, and `first_item_to_pubkey_division`
fn get_item<'a>(
min_index: usize,
bin: usize,
first_items: &'a mut Vec<Pubkey>,
pubkey_division: &'b [SortedDataByPubkey<'b>],
indexes: &'a mut [usize],
first_item_to_pubkey_division: &'a mut Vec<usize>,
) -> &'b CalculateHashIntermediate {
first_items: &mut Vec<Pubkey>,
sorted_data_by_pubkey: &[&'a [CalculateHashIntermediate]],
indexes: &mut Vec<usize>,
first_item_to_pubkey_division: &mut Vec<usize>,
binner: &PubkeyBinCalculator24,
) -> &'a CalculateHashIntermediate {
let first_item = first_items[min_index];
let key = &first_item;
let division_index = first_item_to_pubkey_division[min_index];
let bin = &pubkey_division[division_index][bin];
let mut index = indexes[division_index];
let division_data = &sorted_data_by_pubkey[division_index];
let mut index = indexes[min_index];
index += 1;
while index < bin.len() {
let mut end;
loop {
end = index >= division_data.len();
if end {
break;
}
// still more items where we found the previous key, so just increment the index for that slot group, skipping all pubkeys that are equal
if &bin[index].pubkey == key {
let next_key = &division_data[index].pubkey;
if next_key == key {
index += 1;
continue; // duplicate entries of same pubkey, so keep skipping
}
if binner.bin_from_pubkey(next_key) > bin {
// the next pubkey is not in our bin
end = true;
break;
}
// point to the next pubkey > key
first_items[min_index] = bin[index].pubkey;
indexes[division_index] = index;
first_items[min_index] = *next_key;
indexes[min_index] = index;
break;
}
if index >= bin.len() {
if end {
// stop looking in this vector - we exhausted it
first_items.remove(min_index);
first_item_to_pubkey_division.remove(min_index);
indexes.remove(min_index);
}
// this is the previous first item that was requested
&bin[index - 1]
&division_data[index - 1]
}
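The doc comment on `get_item` compresses several cases into a few lines. A minimal sketch of the cursor-advance rule in isolation, with integers standing in for pubkeys (all names here are illustrative, not from this commit):

    // advance one source's cursor past `key` and its duplicates;
    // None means the source has no more entries in `bin` (the `end` case),
    // which is when get_item removes the source's bookkeeping entries
    fn advance(
        data: &[u32],
        mut index: usize,
        key: u32,
        bin: usize,
        bin_of: impl Fn(u32) -> usize,
    ) -> Option<usize> {
        index += 1;
        while index < data.len() && data[index] == key {
            index += 1; // duplicate of the same key: keep skipping
        }
        (index < data.len() && bin_of(data[index]) == bin).then_some(index)
    }

A None here corresponds to `end == true` above, where `first_items`, `indexes`, and `first_item_to_pubkey_division` all drop the entry for `min_index`.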
/// `hash_data` must be sorted by `binner.bin_from_pubkey()`
/// return index in `hash_data` of first pubkey that is in `bin`, based on `binner`
fn binary_search_for_first_pubkey_in_bin(
hash_data: &[CalculateHashIntermediate],
bin: usize,
binner: &PubkeyBinCalculator24,
) -> Option<usize> {
let potential_index = if bin == 0 {
// `bin` == 0 is special because there cannot be `bin`-1
// so either element[0] is in bin 0 or there is nothing in bin 0.
0
} else {
// search for the first pubkey that is in `bin`
// There could be many keys in a row with the same `bin`.
// So, for each pubkey, use calculated_bin * 2 + 1 as the bin of a given pubkey for binary search.
// And compare the bin of each pubkey with `bin` * 2.
// So all keys that are in `bin` will compare as `bin` * 2 + 1
// all keys that are in `bin`-1 will compare as ((`bin` - 1) * 2 + 1), which is (`bin` * 2 - 1)
// NO keys will compare as `bin` * 2 because we add 1.
// So, the binary search will NEVER return Ok(found_index), but will always return Err(index of first key in `bin`).
// Note that if NO key is in `bin`, then the key at the found index will be in a bin > `bin`, so return None.
let just_prior_to_desired_bin = bin * 2;
let search = hash_data.binary_search_by(|data| {
(1 + 2 * binner.bin_from_pubkey(&data.pubkey)).cmp(&just_prior_to_desired_bin)
});
// returns Err(index where item should be) since the desired item will never exist
search.expect_err("it is impossible to find a matching bin")
};
// note that `potential_index` could be == hash_data.len(). This indicates the first key in `bin` would be
// after the data we have. Thus, no key is in `bin`.
// This also handles the case where `hash_data` is empty, since len() will be 0 and `get` will return None.
hash_data.get(potential_index).and_then(|potential_data| {
(binner.bin_from_pubkey(&potential_data.pubkey) == bin).then_some(potential_index)
})
}
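The comment block above is the crux: mapping each element's bin to 2·bin + 1 and comparing against 2× the target bin guarantees the comparator never returns Equal, so `binary_search_by` always returns Err with the insertion point, which is exactly the index of the first element in the target bin. A standalone sketch of the same trick on integers (`bin_of` stands in for `binner.bin_from_pubkey`; everything here is illustrative):

    // find the index of the first element whose bin == target, in a slice
    // sorted by bin; None if the bin is empty. Uses the Err-only comparator.
    fn first_in_bin(data: &[u32], target: usize, bin_of: impl Fn(u32) -> usize) -> Option<usize> {
        let idx = if target == 0 {
            0
        } else {
            data.binary_search_by(|&x| (2 * bin_of(x) + 1).cmp(&(2 * target)))
                .expect_err("comparator never returns Equal")
        };
        data.get(idx)
            .and_then(|&x| (bin_of(x) == target).then_some(idx))
    }

    fn main() {
        let bin_of = |x: u32| (x / 100) as usize; // 0..100 -> bin 0, 100..200 -> bin 1, ...
        let data = [5, 7, 210, 211, 305];
        assert_eq!(first_in_bin(&data, 0, bin_of), Some(0));
        assert_eq!(first_in_bin(&data, 1, bin_of), None); // bin 1 is empty
        assert_eq!(first_in_bin(&data, 2, bin_of), Some(2));
        assert_eq!(first_in_bin(&data, 3, bin_of), Some(4));
    }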
/// `hash_data` must be sorted by `binner.bin_from_pubkey()`
/// return index in `hash_data` of first pubkey that is in `bin`, based on `binner`
fn find_first_pubkey_in_bin(
hash_data: &[CalculateHashIntermediate],
bin: usize,
bins: usize,
binner: &PubkeyBinCalculator24,
stats: &HashStats,
) -> Option<usize> {
if hash_data.is_empty() {
return None;
}
let (result, us) = measure_us!({
// assume uniform distribution of pubkeys and choose first guess based on bin we're looking for
let i = hash_data.len() * bin / bins;
let estimate = &hash_data[i];
let pubkey_bin = binner.bin_from_pubkey(&estimate.pubkey);
let range = if pubkey_bin >= bin {
// the pubkey at i matches or is too large, so look at indexes <= i for the first pubkey in the right bin
// (index i itself could be that first pubkey, so the range includes it)
0..(i + 1)
} else {
// i pubkey is too small, so look after i
(i + 1)..hash_data.len()
};
Some(
range.start +
// binary search the subset
Self::binary_search_for_first_pubkey_in_bin(
&hash_data[range],
bin,
binner,
)?,
)
});
stats.pubkey_bin_search_us.fetch_add(us, Ordering::Relaxed);
result
}
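A worked instance of the first-guess arithmetic, with illustrative numbers:

    // the uniform-distribution guess and the range split it produces
    fn main() {
        let (len, bins, bin) = (1_000usize, 4usize, 2usize);
        let i = len * bin / bins; // where bin 2 starts if keys are uniform
        assert_eq!(i, 500);
        // if hash_data[500] is in bin >= 2, binary search 0..501
        // (index 500 itself may be the answer); otherwise search 501..1000
    }

Either branch discards roughly half the slice before the binary search begins, and with a near-uniform pubkey distribution the guess should land close to the exact bin boundary, keeping pubkey_bin_search_us small.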
// go through: [..][pubkey_bin][..] and return hashes and lamport sum
@ -909,35 +954,39 @@ impl AccountsHasher {
// 3. produce this output:
// a. AccountHashesFile: individual account hashes in pubkey order
// b. lamport sum
// c. unreduced count (ie. including duplicates and zero lamport)
fn de_dup_accounts_in_parallel<'a>(
fn de_dup_accounts_in_parallel(
&self,
pubkey_division: &'a [SortedDataByPubkey<'a>],
sorted_data_by_pubkey: &[&[CalculateHashIntermediate]],
pubkey_bin: usize,
) -> (AccountHashesFile, u64, usize) {
let len = pubkey_division.len();
let mut unreduced_count = 0;
let mut indexes = vec![0; len];
bins: usize,
stats: &HashStats,
) -> (AccountHashesFile, u64) {
let binner = PubkeyBinCalculator24::new(bins);
let len = sorted_data_by_pubkey.len();
let mut indexes = Vec::with_capacity(len);
let mut first_items = Vec::with_capacity(len);
// map from index of an item in first_items[] to index of the corresponding item in pubkey_division[]
// this will change as items in pubkey_division[] are exhausted
// map from index of an item in first_items[] to index of the corresponding item in sorted_data_by_pubkey[]
// this will change as items in sorted_data_by_pubkey[] are exhausted
let mut first_item_to_pubkey_division = Vec::with_capacity(len);
let mut hashes = AccountHashesFile {
count_and_writer: None,
dir_for_temp_cache_files: self.dir_for_temp_cache_files.clone(),
};
// initialize 'first_items', which holds the current lowest item in each slot group
pubkey_division.iter().enumerate().for_each(|(i, bins)| {
// check to make sure we can do bins[pubkey_bin]
if bins.len() > pubkey_bin {
let sub = bins[pubkey_bin];
if !sub.is_empty() {
unreduced_count += bins[pubkey_bin].len(); // sum for metrics
first_items.push(bins[pubkey_bin][0].pubkey);
sorted_data_by_pubkey
.iter()
.enumerate()
.for_each(|(i, hash_data)| {
let first_pubkey_in_bin =
Self::find_first_pubkey_in_bin(hash_data, pubkey_bin, bins, &binner, stats);
if let Some(first_pubkey_in_bin) = first_pubkey_in_bin {
let k = hash_data[first_pubkey_in_bin].pubkey;
first_items.push(k);
first_item_to_pubkey_division.push(i);
indexes.push(first_pubkey_in_bin);
}
}
});
});
let mut overall_sum = 0;
let mut duplicate_pubkey_indexes = Vec::with_capacity(len);
let filler_accounts_enabled = self.filler_accounts_enabled();
@ -976,9 +1025,10 @@ impl AccountsHasher {
min_index,
pubkey_bin,
&mut first_items,
pubkey_division,
sorted_data_by_pubkey,
&mut indexes,
&mut first_item_to_pubkey_division,
&binner,
);
// add lamports and get hash
@ -1010,15 +1060,17 @@ impl AccountsHasher {
*i,
pubkey_bin,
&mut first_items,
pubkey_division,
sorted_data_by_pubkey,
&mut indexes,
&mut first_item_to_pubkey_division,
&binner,
);
});
duplicate_pubkey_indexes.clear();
}
}
(hashes, overall_sum, unreduced_count)
(hashes, overall_sum)
}
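Putting the pieces together, a toy version of the per-bin streaming that de_dup_accounts_in_parallel now performs, with integers standing in for pubkeys and a linear scan standing in for find_first_pubkey_in_bin (all assumptions, kept minimal): each bin task positions one cursor per source, repeatedly takes the smallest current key, collapses duplicates, and advances, so peak bookkeeping is O(sources) per bin rather than sources × bins slice headers up front.

    // toy per-bin dedup (illustrative): which duplicate wins is simplified;
    // the real code resolves duplicates by slot order
    fn de_dup_bin(sources: &[&[u32]], bin: usize, bin_of: impl Fn(u32) -> usize + Copy) -> Vec<u32> {
        // one cursor per source, positioned at its first key in `bin`
        let mut cursors: Vec<(usize, usize)> = sources
            .iter()
            .enumerate()
            .filter_map(|(s, d)| d.iter().position(|&x| bin_of(x) == bin).map(|i| (s, i)))
            .collect();
        let mut out = Vec::new();
        // repeatedly take the smallest current key across all cursors
        while let Some(pos) = (0..cursors.len()).min_by_key(|&i| sources[cursors[i].0][cursors[i].1]) {
            let (src, idx) = cursors[pos];
            let key = sources[src][idx];
            if out.last() != Some(&key) {
                out.push(key); // cross-source duplicates collapse to one entry
            }
            // advance past `key` and its duplicates; drop the cursor at the bin boundary
            let mut next = idx + 1;
            while next < sources[src].len() && sources[src][next] == key {
                next += 1;
            }
            if next < sources[src].len() && bin_of(sources[src][next]) == bin {
                cursors[pos].1 = next;
            } else {
                cursors.remove(pos);
            }
        }
        out
    }

    fn main() {
        let bin_of = |x: u32| (x / 100) as usize;
        let a: &[u32] = &[5, 105, 105, 150];
        let b: &[u32] = &[105, 120, 220];
        // bin 1 holds 100..200; the three 105s collapse to one entry
        assert_eq!(de_dup_bin(&[a, b], 1, bin_of), vec![105, 120, 150]);
    }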
fn is_filler_account(&self, pubkey: &Pubkey) -> bool {
@ -1030,15 +1082,14 @@ impl AccountsHasher {
// input:
// vec: group of slot data, ordered by Slot (low to high)
// vec: [0..bins] - where bins are pubkey ranges (these are ordered by Pubkey range)
// vec: [..] - items which fit in the containing bin. Sorted by: Pubkey, higher Slot, higher Write version (if pubkey =)
pub fn rest_of_hash_calculation(
&self,
data_sections_by_pubkey: Vec<SortedDataByPubkey<'_>>,
sorted_data_by_pubkey: &[&[CalculateHashIntermediate]],
stats: &mut HashStats,
) -> (Hash, u64) {
let (hashes, total_lamports) = self.de_dup_accounts(
&data_sections_by_pubkey,
sorted_data_by_pubkey,
stats,
PUBKEY_BINS_FOR_CALCULATING_HASHES,
);
@ -1105,7 +1156,7 @@ pub struct AccountsDeltaHash(pub Hash);
#[cfg(test)]
pub mod tests {
use {super::*, std::str::FromStr, tempfile::tempdir};
use {super::*, itertools::Itertools, std::str::FromStr, tempfile::tempdir};
impl AccountsHasher {
fn new(dir_for_temp_cache_files: PathBuf) -> Self {
@ -1126,6 +1177,59 @@ pub mod tests {
}
}
#[test]
fn test_find_first_pubkey_in_bin() {
let stats = HashStats::default();
for (bins, expected_count) in [1, 2, 4].into_iter().zip([5, 20, 120]) {
let bins: usize = bins;
let binner = PubkeyBinCalculator24::new(bins);
let mut count = 0usize;
// # pubkeys in each bin are permutations of these
// 0 means none in this bin
// large number (20) means the found key will be well before or after the expected index based on an assumption of uniform distribution
for counts in [0, 1, 2, 20, 0].into_iter().permutations(bins) {
count += 1;
let hash_data = counts
.iter()
.enumerate()
.flat_map(|(bin, count)| {
(0..*count).map(move |_| {
let binner = PubkeyBinCalculator24::new(bins);
CalculateHashIntermediate::new(
Hash::default(),
0,
binner.lowest_pubkey_from_bin(bin, bins),
)
})
})
.collect::<Vec<_>>();
// look for the first pubkey in each bin
for (bin, count_in_bin) in counts.iter().enumerate().take(bins) {
let first = AccountsHasher::find_first_pubkey_in_bin(
&hash_data, bin, bins, &binner, &stats,
);
// test both functions
let first_again = AccountsHasher::binary_search_for_first_pubkey_in_bin(
&hash_data, bin, &binner,
);
assert_eq!(first, first_again);
assert_eq!(first.is_none(), count_in_bin == &0);
if let Some(first) = first {
assert_eq!(binner.bin_from_pubkey(&hash_data[first].pubkey), bin);
if first > 0 {
assert!(binner.bin_from_pubkey(&hash_data[first - 1].pubkey) < bin);
}
}
}
}
assert_eq!(
count, expected_count,
"too few iterations in test. bins: {bins}"
);
}
}
#[test]
fn test_account_hashes_file() {
let dir_for_temp_cache_files = tempdir().unwrap();
@ -1234,8 +1338,8 @@ pub mod tests {
assert_eq!(AccountsHasher::div_ceil(10, 0), 0);
}
fn for_rest(original: &[CalculateHashIntermediate]) -> Vec<SortedDataByPubkey<'_>> {
vec![vec![original]]
fn for_rest(original: &[CalculateHashIntermediate]) -> Vec<&[CalculateHashIntermediate]> {
vec![original]
}
#[test]
@ -1258,7 +1362,7 @@ pub mod tests {
let dir_for_temp_cache_files = tempdir().unwrap();
let accounts_hash = AccountsHasher::new(dir_for_temp_cache_files.path().to_path_buf());
let result = accounts_hash
.rest_of_hash_calculation(for_rest(&account_maps), &mut HashStats::default());
.rest_of_hash_calculation(&for_rest(&account_maps), &mut HashStats::default());
let expected_hash = Hash::from_str("8j9ARGFv4W2GfML7d3sVJK2MePwrikqYnu6yqer28cCa").unwrap();
assert_eq!((result.0, result.1), (expected_hash, 88));
@ -1269,7 +1373,7 @@ pub mod tests {
account_maps.insert(0, val);
let result = accounts_hash
.rest_of_hash_calculation(for_rest(&account_maps), &mut HashStats::default());
.rest_of_hash_calculation(&for_rest(&account_maps), &mut HashStats::default());
let expected_hash = Hash::from_str("EHv9C5vX7xQjjMpsJMzudnDTzoTSRwYkqLzY8tVMihGj").unwrap();
assert_eq!((result.0, result.1), (expected_hash, 108));
@ -1280,7 +1384,7 @@ pub mod tests {
account_maps.insert(1, val);
let result = accounts_hash
.rest_of_hash_calculation(for_rest(&account_maps), &mut HashStats::default());
.rest_of_hash_calculation(&for_rest(&account_maps), &mut HashStats::default());
let expected_hash = Hash::from_str("7NNPg5A8Xsg1uv4UFm6KZNwsipyyUnmgCrznP6MBWoBZ").unwrap();
assert_eq!((result.0, result.1), (expected_hash, 118));
}
@ -1295,15 +1399,16 @@ pub mod tests {
#[test]
fn test_accountsdb_de_dup_accounts_zero_chunks() {
let vec = [vec![vec![CalculateHashIntermediate {
let vec = vec![vec![CalculateHashIntermediate {
lamports: 1,
..CalculateHashIntermediate::default()
}]]];
}]];
let temp_vec = vec.to_vec();
let slice = convert_to_slice2(&temp_vec);
let slice = convert_to_slice(&temp_vec);
let dir_for_temp_cache_files = tempdir().unwrap();
let accounts_hasher = AccountsHasher::new(dir_for_temp_cache_files.path().to_path_buf());
let (mut hashes, lamports, _) = accounts_hasher.de_dup_accounts_in_parallel(&slice, 0);
let (mut hashes, lamports) =
accounts_hasher.de_dup_accounts_in_parallel(&slice, 0, 1, &HashStats::default());
assert_eq!(&[Hash::default()], hashes.get_reader().unwrap().1.read(0));
assert_eq!(lamports, 1);
}
@ -1324,9 +1429,10 @@ pub mod tests {
let dir_for_temp_cache_files = tempdir().unwrap();
let accounts_hash = AccountsHasher::new(dir_for_temp_cache_files.path().to_path_buf());
let vec = vec![vec![], vec![]];
let empty = [];
let vec = &empty;
let (hashes, lamports) =
accounts_hash.de_dup_accounts(&vec, &mut HashStats::default(), one_range());
accounts_hash.de_dup_accounts(vec, &mut HashStats::default(), one_range());
assert_eq!(
vec![Hash::default(); 0],
get_vec_vec(hashes)
@ -1342,11 +1448,13 @@ pub mod tests {
assert_eq!(empty, get_vec_vec(hashes));
assert_eq!(lamports, 0);
let (hashes, lamports, _) = accounts_hash.de_dup_accounts_in_parallel(&[], 1);
let (hashes, lamports) =
accounts_hash.de_dup_accounts_in_parallel(&[], 1, 1, &HashStats::default());
assert_eq!(vec![Hash::default(); 0], get_vec(hashes));
assert_eq!(lamports, 0);
let (hashes, lamports, _) = accounts_hash.de_dup_accounts_in_parallel(&[], 2);
let (hashes, lamports) =
accounts_hash.de_dup_accounts_in_parallel(&[], 2, 1, &HashStats::default());
assert_eq!(vec![Hash::default(); 0], get_vec(hashes));
assert_eq!(lamports, 0);
}
@ -1427,24 +1535,31 @@ pub mod tests {
let accounts = accounts.clone();
let slice = &accounts[start..end];
let slice2 = vec![vec![slice.to_vec()]];
let slice2 = vec![slice.to_vec()];
let slice = &slice2[..];
let slice_temp = convert_to_slice2(&slice2);
let (hashes2, lamports2, _) = hash.de_dup_accounts_in_parallel(&slice_temp, 0);
let slice3 = convert_to_slice2(&slice2);
let (hashes3, lamports3, _) = hash.de_dup_accounts_in_parallel(&slice3, 0);
let slice_temp = convert_to_slice(&slice2);
let (hashes2, lamports2) =
hash.de_dup_accounts_in_parallel(&slice_temp, 0, 1, &HashStats::default());
let slice3 = convert_to_slice(&slice2);
let (hashes3, lamports3) =
hash.de_dup_accounts_in_parallel(&slice3, 0, 1, &HashStats::default());
let vec = slice.to_vec();
let slice4 = convert_to_slice2(&vec);
let slice4 = convert_to_slice(&vec);
let mut max_bin = end - start;
if !max_bin.is_power_of_two() {
max_bin = 1;
}
let (hashes4, lamports4) =
hash.de_dup_accounts(&slice4, &mut HashStats::default(), end - start);
hash.de_dup_accounts(&slice4, &mut HashStats::default(), max_bin);
let vec = slice.to_vec();
let slice5 = convert_to_slice2(&vec);
let slice5 = convert_to_slice(&vec);
let (hashes5, lamports5) =
hash.de_dup_accounts(&slice5, &mut HashStats::default(), end - start);
hash.de_dup_accounts(&slice5, &mut HashStats::default(), max_bin);
let vec = slice.to_vec();
let slice5 = convert_to_slice2(&vec);
let slice5 = convert_to_slice(&vec);
let (hashes6, lamports6) =
hash.de_dup_accounts(&slice5, &mut HashStats::default(), end - start);
hash.de_dup_accounts(&slice5, &mut HashStats::default(), max_bin);
let hashes2 = get_vec(hashes2);
let hashes3 = get_vec(hashes3);
@ -1473,7 +1588,7 @@ pub mod tests {
assert_eq!(lamports2, lamports5);
assert_eq!(lamports2, lamports6);
let human_readable = slice[0][0]
let human_readable = slice[0]
.iter()
.map(|v| {
let mut s = (if v.pubkey == key_a {
@ -1549,11 +1664,11 @@ pub mod tests {
}
fn test_de_dup_accounts_in_parallel<'a>(
account_maps: &'a [SortedDataByPubkey<'a>],
) -> (AccountHashesFile, u64, usize) {
account_maps: &'a [&'a [CalculateHashIntermediate]],
) -> (AccountHashesFile, u64) {
let dir_for_temp_cache_files = tempdir().unwrap();
let accounts_hasher = AccountsHasher::new(dir_for_temp_cache_files.path().to_path_buf());
accounts_hasher.de_dup_accounts_in_parallel(account_maps, 0)
accounts_hasher.de_dup_accounts_in_parallel(account_maps, 0, 1, &HashStats::default())
}
#[test]
@ -1566,22 +1681,22 @@ pub mod tests {
let val = CalculateHashIntermediate::new(hash, 1, key);
account_maps.push(val.clone());
let vecs = vec![vec![account_maps.to_vec()]];
let slice = convert_to_slice2(&vecs);
let (hashfile, lamports, count) = test_de_dup_accounts_in_parallel(&slice);
let vecs = vec![account_maps.to_vec()];
let slice = convert_to_slice(&vecs);
let (hashfile, lamports) = test_de_dup_accounts_in_parallel(&slice);
assert_eq!(
(get_vec(hashfile), lamports, count),
(vec![val.hash], val.lamports, 1)
(get_vec(hashfile), lamports),
(vec![val.hash], val.lamports)
);
// zero original lamports, higher version
let val = CalculateHashIntermediate::new(hash, 0, key);
account_maps.push(val); // has to be after previous entry since account_maps are in slot order
let vecs = vec![vec![account_maps.to_vec()]];
let slice = convert_to_slice2(&vecs);
let (hashfile, lamports, count) = test_de_dup_accounts_in_parallel(&slice);
assert_eq!((get_vec(hashfile), lamports, count), (vec![], 0, 2));
let vecs = vec![account_maps.to_vec()];
let slice = convert_to_slice(&vecs);
let (hashfile, lamports) = test_de_dup_accounts_in_parallel(&slice);
assert_eq!((get_vec(hashfile), lamports), (vec![], 0));
}
#[test]
@ -1931,7 +2046,12 @@ pub mod tests {
];
let dir_for_temp_cache_files = tempdir().unwrap();
let accounts_hasher = AccountsHasher::new(dir_for_temp_cache_files.path().to_path_buf());
accounts_hasher.de_dup_accounts_in_parallel(&[convert_to_slice(&[input])], 0);
accounts_hasher.de_dup_accounts_in_parallel(
&convert_to_slice(&[input]),
0,
1,
&HashStats::default(),
);
}
fn convert_to_slice(
@ -1940,15 +2060,6 @@ pub mod tests {
input.iter().map(|v| &v[..]).collect::<Vec<_>>()
}
fn convert_to_slice2(
input: &[Vec<Vec<CalculateHashIntermediate>>],
) -> Vec<Vec<&[CalculateHashIntermediate]>> {
input
.iter()
.map(|v| v.iter().map(|v| &v[..]).collect::<Vec<_>>())
.collect::<Vec<_>>()
}
#[test]
#[should_panic(expected = "overflow is detected while summing capitalization")]
fn test_accountsdb_lamport_overflow2() {
@ -1970,47 +2081,9 @@ pub mod tests {
let dir_for_temp_cache_files = tempdir().unwrap();
let accounts_hasher = AccountsHasher::new(dir_for_temp_cache_files.path().to_path_buf());
accounts_hasher.de_dup_accounts(
&[convert_to_slice(&input)],
&convert_to_slice(&input),
&mut HashStats::default(),
2, // accounts above are in 2 groups
);
}
#[test]
fn test_get_binned_data() {
let data = [CalculateHashIntermediate::new(
Hash::default(),
1,
Pubkey::from([1u8; 32]),
)];
let data2 = vec![&data[..]];
let bins = 1;
let result = AccountsHasher::get_binned_data(&data2, bins, &(0..bins));
assert_eq!(result, vec![vec![&data[..]]]);
let bins = 2;
let result = AccountsHasher::get_binned_data(&data2, bins, &(0..bins));
assert_eq!(result, vec![vec![&data[..], &data[0..0]]]);
let data = [CalculateHashIntermediate::new(
Hash::default(),
1,
Pubkey::from([255u8; 32]),
)];
let data2 = vec![&data[..]];
let result = AccountsHasher::get_binned_data(&data2, bins, &(0..bins));
assert_eq!(result, vec![vec![&data[0..0], &data[..]]]);
let data = [
CalculateHashIntermediate::new(Hash::default(), 1, Pubkey::from([254u8; 32])),
CalculateHashIntermediate::new(Hash::default(), 1, Pubkey::from([255u8; 32])),
];
let data2 = vec![&data[..]];
let result = AccountsHasher::get_binned_data(&data2, bins, &(0..bins));
assert_eq!(result, vec![vec![&data[0..0], &data[..]]]);
let data = [
CalculateHashIntermediate::new(Hash::default(), 1, Pubkey::from([1u8; 32])),
CalculateHashIntermediate::new(Hash::default(), 1, Pubkey::from([255u8; 32])),
];
let data2 = vec![&data[..]];
let result = AccountsHasher::get_binned_data(&data2, bins, &(0..bins));
assert_eq!(result, vec![vec![&data[0..1], &data[1..2]]]);
}
}

File: pubkey_bins.rs

@ -11,12 +11,12 @@ impl PubkeyBinCalculator24 {
std::mem::size_of::<T>() * 8
}
pub fn log_2(x: u32) -> u32 {
pub(crate) fn log_2(x: u32) -> u32 {
assert!(x > 0);
Self::num_bits::<u32>() as u32 - x.leading_zeros() - 1
}
pub fn new(bins: usize) -> Self {
pub(crate) fn new(bins: usize) -> Self {
const MAX_BITS: u32 = 24;
assert!(bins > 0);
let max_plus_1 = 1 << MAX_BITS;
@ -28,13 +28,14 @@ impl PubkeyBinCalculator24 {
}
}
pub fn bin_from_pubkey(&self, pubkey: &Pubkey) -> usize {
pub(crate) fn bin_from_pubkey(&self, pubkey: &Pubkey) -> usize {
let as_ref = pubkey.as_ref();
((as_ref[0] as usize * 256 + as_ref[1] as usize) * 256 + as_ref[2] as usize)
>> self.shift_bits
}
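`bin_from_pubkey` reads the top three bytes of the pubkey as a 24-bit integer and shifts off the bits not needed for the configured bin count. A worked check, assuming a power-of-two bin count (the count below is illustrative; `trailing_zeros` stands in for `log_2` since the count is a power of two):

    // worked example of the 3-byte bin computation
    fn main() {
        let bins = 65_536u32;              // 2^16 bins -> shift_bits = 24 - 16 = 8
        let shift_bits = 24 - bins.trailing_zeros();
        let first3 = [0x12u8, 0x34, 0x56]; // leading bytes of some pubkey
        let v = (first3[0] as usize * 256 + first3[1] as usize) * 256 + first3[2] as usize;
        assert_eq!(v, 0x123456);
        assert_eq!(v >> shift_bits, 0x1234); // bin = top 16 bits of the pubkey
    }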
pub fn lowest_pubkey_from_bin(&self, mut bin: usize, bins: usize) -> Pubkey {
#[cfg(test)]
pub(crate) fn lowest_pubkey_from_bin(&self, mut bin: usize, bins: usize) -> Pubkey {
assert!(bin < bins);
bin <<= self.shift_bits;
let mut pubkey = Pubkey::from([0; 32]);