Deshred blocks in parallel (#6461)
* Deshred in parallel * Add tests for corrupt slots and parallel deshred * Rename load_blocktree_entries to load_blocktree_entries_with_shred_count
This commit is contained in:
parent
8319fa05d0
commit
b38bf90de7
|
@ -298,7 +298,7 @@ mod test {
|
|||
);
|
||||
|
||||
let blocktree = broadcast_service.blocktree;
|
||||
let (entries, _, _, _) = blocktree
|
||||
let (entries, _) = blocktree
|
||||
.get_slot_entries_with_shred_count(slot, 0)
|
||||
.expect("Expect entries to be present");
|
||||
assert_eq!(entries.len(), max_tick_height as usize);
|
||||
|
|
|
@ -415,25 +415,25 @@ impl ReplayStage {
|
|||
.entry(bank.slot())
|
||||
.or_insert_with(|| ForkProgress::new(bank.slot(), bank.last_blockhash()));
|
||||
let now = Instant::now();
|
||||
let load_result = Self::load_blocktree_entries(bank, blocktree, bank_progress);
|
||||
let load_result =
|
||||
Self::load_blocktree_entries_with_shred_count(bank, blocktree, bank_progress);
|
||||
let fetch_entries_elapsed = now.elapsed().as_micros();
|
||||
|
||||
if load_result.is_err() {
|
||||
bank_progress.stats.fetch_entries_fail_elapsed += fetch_entries_elapsed as u64;
|
||||
} else {
|
||||
bank_progress.stats.fetch_entries_elapsed += fetch_entries_elapsed as u64;
|
||||
}
|
||||
let replay_result =
|
||||
load_result.and_then(|(entries, num_shreds, useful_time, wasted_time)| {
|
||||
trace!(
|
||||
"Fetch entries for slot {}, {:?} entries, num shreds {:?}",
|
||||
bank.slot(),
|
||||
entries.len(),
|
||||
num_shreds
|
||||
);
|
||||
tx_count += entries.iter().map(|e| e.transactions.len()).sum::<usize>();
|
||||
bank_progress.stats.fetch_entries_elapsed += useful_time as u64;
|
||||
bank_progress.stats.fetch_entries_fail_elapsed += wasted_time as u64;
|
||||
Self::replay_entries_into_bank(bank, entries, bank_progress, num_shreds)
|
||||
});
|
||||
|
||||
let replay_result = load_result.and_then(|(entries, num_shreds)| {
|
||||
trace!(
|
||||
"Fetch entries for slot {}, {:?} entries, num shreds {:?}",
|
||||
bank.slot(),
|
||||
entries.len(),
|
||||
num_shreds
|
||||
);
|
||||
tx_count += entries.iter().map(|e| e.transactions.len()).sum::<usize>();
|
||||
Self::replay_entries_into_bank(bank, entries, bank_progress, num_shreds)
|
||||
});
|
||||
|
||||
if Self::is_replay_result_fatal(&replay_result) {
|
||||
warn!(
|
||||
|
@ -726,15 +726,15 @@ impl ReplayStage {
|
|||
});
|
||||
}
|
||||
|
||||
fn load_blocktree_entries(
|
||||
fn load_blocktree_entries_with_shred_count(
|
||||
bank: &Bank,
|
||||
blocktree: &Blocktree,
|
||||
bank_progress: &mut ForkProgress,
|
||||
) -> Result<(Vec<Entry>, usize, u64, u64)> {
|
||||
) -> Result<(Vec<Entry>, usize)> {
|
||||
let bank_slot = bank.slot();
|
||||
let entries_and_count = blocktree
|
||||
let entries_and_shred_count = blocktree
|
||||
.get_slot_entries_with_shred_count(bank_slot, bank_progress.num_shreds as u64)?;
|
||||
Ok(entries_and_count)
|
||||
Ok(entries_and_shred_count)
|
||||
}
|
||||
|
||||
fn replay_entries_into_bank(
|
||||
|
@ -766,6 +766,7 @@ impl ReplayStage {
|
|||
) -> Result<()> {
|
||||
let now = Instant::now();
|
||||
let last_entry = &bank_progress.last_entry;
|
||||
datapoint_info!("verify-batch-size", ("size", entries.len() as i64, i64));
|
||||
let verify_result = entries.verify(last_entry);
|
||||
let verify_entries_elapsed = now.elapsed().as_micros();
|
||||
bank_progress.stats.entry_verification_elapsed += verify_entries_elapsed as u64;
|
||||
|
|
|
@ -7,11 +7,14 @@ use crate::shred::{Shred, Shredder};
|
|||
|
||||
use bincode::deserialize;
|
||||
|
||||
use std::collections::HashMap;
|
||||
|
||||
use log::*;
|
||||
use rayon::iter::IntoParallelRefIterator;
|
||||
use rayon::iter::ParallelIterator;
|
||||
use rayon::ThreadPool;
|
||||
use rocksdb;
|
||||
|
||||
use solana_metrics::{datapoint_debug, datapoint_error};
|
||||
use solana_rayon_threadlimit::get_thread_count;
|
||||
|
||||
use solana_sdk::genesis_block::GenesisBlock;
|
||||
use solana_sdk::hash::Hash;
|
||||
|
@ -19,13 +22,13 @@ use solana_sdk::signature::{Keypair, KeypairUtil};
|
|||
|
||||
use std::cell::RefCell;
|
||||
use std::cmp;
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::rc::Rc;
|
||||
use std::result;
|
||||
use std::sync::mpsc::{sync_channel, Receiver, SyncSender, TrySendError};
|
||||
use std::sync::{Arc, RwLock};
|
||||
use std::time::Instant;
|
||||
|
||||
pub use self::meta::*;
|
||||
use crate::leader_schedule_cache::LeaderScheduleCache;
|
||||
|
@ -45,6 +48,11 @@ type BatchProcessor = db::BatchProcessor;
|
|||
|
||||
pub const BLOCKTREE_DIRECTORY: &str = "rocksdb";
|
||||
|
||||
thread_local!(static PAR_THREAD_POOL: RefCell<ThreadPool> = RefCell::new(rayon::ThreadPoolBuilder::new()
|
||||
.num_threads(get_thread_count())
|
||||
.build()
|
||||
.unwrap()));
|
||||
|
||||
pub const MAX_COMPLETED_SLOTS_IN_CHANNEL: usize = 100_000;
|
||||
|
||||
pub type SlotMetaWorkingSetEntry = (Rc<RefCell<SlotMeta>>, Option<SlotMeta>);
|
||||
|
@ -727,6 +735,13 @@ impl Blocktree {
|
|||
false
|
||||
};
|
||||
|
||||
let last_in_data = if shred.data_complete() {
|
||||
debug!("got last in data");
|
||||
true
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
if is_orphan(slot_meta) {
|
||||
slot_meta.parent_slot = parent;
|
||||
}
|
||||
|
@ -754,7 +769,13 @@ impl Blocktree {
|
|||
// Commit step: commit all changes to the mutable structures at once, or none at all.
|
||||
// We don't want only a subset of these changes going through.
|
||||
write_batch.put_bytes::<cf::ShredData>((slot, index), &shred.payload)?;
|
||||
update_slot_meta(last_in_slot, slot_meta, index, new_consumed);
|
||||
update_slot_meta(
|
||||
last_in_slot,
|
||||
last_in_data,
|
||||
slot_meta,
|
||||
index as u32,
|
||||
new_consumed,
|
||||
);
|
||||
data_index.set_present(index, true);
|
||||
trace!("inserted shred into slot {:?} and index {:?}", slot, index);
|
||||
Ok(())
|
||||
|
@ -991,81 +1012,111 @@ impl Blocktree {
|
|||
pub fn get_slot_entries_with_shred_count(
|
||||
&self,
|
||||
slot: u64,
|
||||
mut start_index: u64,
|
||||
) -> Result<(Vec<Entry>, usize, u64, u64)> {
|
||||
let mut useful_time = 0;
|
||||
let mut wasted_time = 0;
|
||||
|
||||
let mut all_entries = vec![];
|
||||
let mut num_shreds = 0;
|
||||
loop {
|
||||
let now = Instant::now();
|
||||
let mut res = self.get_entries_in_data_block(slot, &mut start_index);
|
||||
let elapsed = now.elapsed().as_micros();
|
||||
|
||||
if let Ok((ref mut entries, new_num_shreds)) = res {
|
||||
if !entries.is_empty() {
|
||||
all_entries.append(entries);
|
||||
num_shreds += new_num_shreds;
|
||||
useful_time += elapsed;
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
// All unsuccessful cases (errors, incomplete data blocks) will count as wasted work
|
||||
wasted_time += elapsed;
|
||||
res?;
|
||||
break;
|
||||
start_index: u64,
|
||||
) -> Result<(Vec<Entry>, usize)> {
|
||||
let slot_meta_cf = self.db.column::<cf::SlotMeta>();
|
||||
let slot_meta = slot_meta_cf.get(slot)?;
|
||||
if slot_meta.is_none() {
|
||||
return Ok((vec![], 0));
|
||||
}
|
||||
|
||||
trace!("Found {:?} entries", all_entries.len());
|
||||
Ok((
|
||||
all_entries,
|
||||
num_shreds,
|
||||
useful_time as u64,
|
||||
wasted_time as u64,
|
||||
))
|
||||
let slot_meta = slot_meta.unwrap();
|
||||
// Find all the ranges for the completed data blocks
|
||||
let completed_ranges = Self::get_completed_data_ranges(
|
||||
start_index as u32,
|
||||
&slot_meta.completed_data_indexes[..],
|
||||
slot_meta.consumed as u32,
|
||||
);
|
||||
if completed_ranges.is_empty() {
|
||||
return Ok((vec![], 0));
|
||||
}
|
||||
let num_shreds = completed_ranges
|
||||
.last()
|
||||
.map(|(_, end_index)| u64::from(*end_index) - start_index + 1);
|
||||
|
||||
let all_entries: Result<Vec<Vec<Entry>>> = PAR_THREAD_POOL.with(|thread_pool| {
|
||||
thread_pool.borrow().install(|| {
|
||||
completed_ranges
|
||||
.par_iter()
|
||||
.map(|(start_index, end_index)| {
|
||||
self.get_entries_in_data_block(slot, *start_index, *end_index)
|
||||
})
|
||||
.collect()
|
||||
})
|
||||
});
|
||||
|
||||
let all_entries: Vec<Entry> = all_entries?.into_iter().flatten().collect();
|
||||
Ok((all_entries, num_shreds.unwrap_or(0) as usize))
|
||||
}
|
||||
|
||||
pub fn get_entries_in_data_block(
|
||||
&self,
|
||||
slot: u64,
|
||||
start_index: &mut u64,
|
||||
) -> Result<(Vec<Entry>, usize)> {
|
||||
let mut shred_chunk: Vec<Shred> = vec![];
|
||||
let data_shred_cf = self.db.column::<cf::ShredData>();
|
||||
while let Some(serialized_shred) = data_shred_cf.get_bytes((slot, *start_index))? {
|
||||
*start_index += 1;
|
||||
let new_shred = Shred::new_from_serialized_shred(serialized_shred).ok();
|
||||
if let Some(shred) = new_shred {
|
||||
let is_complete = shred.data_complete() || shred.last_in_slot();
|
||||
shred_chunk.push(shred);
|
||||
if is_complete {
|
||||
if let Ok(deshred_payload) = Shredder::deshred(&shred_chunk) {
|
||||
debug!("{:?} shreds in last FEC set", shred_chunk.len(),);
|
||||
let entries: Vec<Entry> =
|
||||
bincode::deserialize(&deshred_payload).map_err(|_| {
|
||||
BlocktreeError::InvalidShredData(Box::new(
|
||||
bincode::ErrorKind::Custom(
|
||||
"could not construct entries".to_string(),
|
||||
),
|
||||
))
|
||||
})?;
|
||||
return Ok((entries, shred_chunk.len()));
|
||||
} else {
|
||||
debug!("Failed in deshredding shred payloads");
|
||||
break;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Didn't find a valid shred, this slot is dead.
|
||||
// TODO: Mark as dead, but have to carefully handle last shred of interrupted
|
||||
// slots.
|
||||
break;
|
||||
// Get the range of indexes [start_index, end_index] of every completed data block
|
||||
fn get_completed_data_ranges(
|
||||
mut start_index: u32,
|
||||
completed_data_end_indexes: &[u32],
|
||||
consumed: u32,
|
||||
) -> Vec<(u32, u32)> {
|
||||
let mut completed_data_ranges = vec![];
|
||||
let floor = completed_data_end_indexes
|
||||
.iter()
|
||||
.position(|i| *i >= start_index)
|
||||
.unwrap_or_else(|| completed_data_end_indexes.len());
|
||||
|
||||
for i in &completed_data_end_indexes[floor as usize..] {
|
||||
// `consumed` is the next missing shred index, but shred `i` existing in
|
||||
// completed_data_end_indexes implies it's not missing
|
||||
assert!(*i != consumed);
|
||||
|
||||
if *i < consumed {
|
||||
completed_data_ranges.push((start_index, *i));
|
||||
start_index = *i + 1;
|
||||
}
|
||||
}
|
||||
|
||||
Ok((vec![], 0))
|
||||
completed_data_ranges
|
||||
}
|
||||
|
||||
fn get_entries_in_data_block(
|
||||
&self,
|
||||
slot: u64,
|
||||
start_index: u32,
|
||||
end_index: u32,
|
||||
) -> Result<Vec<Entry>> {
|
||||
let data_shred_cf = self.db.column::<cf::ShredData>();
|
||||
|
||||
// Short circuit on first error
|
||||
let data_shreds: Result<Vec<Shred>> = (start_index..=end_index)
|
||||
.map(|i| {
|
||||
data_shred_cf
|
||||
.get_bytes((slot, u64::from(i)))
|
||||
.and_then(|serialized_shred| {
|
||||
Shred::new_from_serialized_shred(
|
||||
serialized_shred
|
||||
.expect("Shred must exist if shred index was included in a range"),
|
||||
)
|
||||
.map_err(|_| {
|
||||
BlocktreeError::InvalidShredData(Box::new(bincode::ErrorKind::Custom(
|
||||
"Could not reconstruct shred from shred payload".to_string(),
|
||||
)))
|
||||
})
|
||||
})
|
||||
})
|
||||
.collect();
|
||||
|
||||
let data_shreds = data_shreds?;
|
||||
assert!(data_shreds.last().unwrap().data_complete());
|
||||
|
||||
let deshred_payload = Shredder::deshred(&data_shreds).map_err(|_| {
|
||||
BlocktreeError::InvalidShredData(Box::new(bincode::ErrorKind::Custom(
|
||||
"Could not reconstruct data block from constituent shreds".to_string(),
|
||||
)))
|
||||
})?;
|
||||
|
||||
debug!("{:?} shreds in last FEC set", data_shreds.len(),);
|
||||
bincode::deserialize::<Vec<Entry>>(&deshred_payload).map_err(|_| {
|
||||
BlocktreeError::InvalidShredData(Box::new(bincode::ErrorKind::Custom(
|
||||
"could not reconstruct entries".to_string(),
|
||||
)))
|
||||
})
|
||||
}
|
||||
|
||||
// Returns slots connecting to any element of the list `slots`.
|
||||
|
@ -1198,20 +1249,21 @@ impl Blocktree {
|
|||
|
||||
fn update_slot_meta(
|
||||
is_last_in_slot: bool,
|
||||
is_last_in_data: bool,
|
||||
slot_meta: &mut SlotMeta,
|
||||
index: u64,
|
||||
index: u32,
|
||||
new_consumed: u64,
|
||||
) {
|
||||
// Index is zero-indexed, while the "received" height starts from 1,
|
||||
// so received = index + 1 for the same shred.
|
||||
slot_meta.received = cmp::max(index + 1, slot_meta.received);
|
||||
slot_meta.received = cmp::max((u64::from(index) + 1) as u64, slot_meta.received);
|
||||
slot_meta.consumed = new_consumed;
|
||||
slot_meta.last_index = {
|
||||
// If the last index in the slot hasn't been set before, then
|
||||
// set it to this shred index
|
||||
if slot_meta.last_index == std::u64::MAX {
|
||||
if is_last_in_slot {
|
||||
index
|
||||
u64::from(index)
|
||||
} else {
|
||||
std::u64::MAX
|
||||
}
|
||||
|
@ -1219,6 +1271,16 @@ fn update_slot_meta(
|
|||
slot_meta.last_index
|
||||
}
|
||||
};
|
||||
|
||||
if is_last_in_slot || is_last_in_data {
|
||||
let position = slot_meta
|
||||
.completed_data_indexes
|
||||
.iter()
|
||||
.position(|completed_data_index| *completed_data_index > index)
|
||||
.unwrap_or_else(|| slot_meta.completed_data_indexes.len());
|
||||
|
||||
slot_meta.completed_data_indexes.insert(position, index);
|
||||
}
|
||||
}
|
||||
|
||||
fn get_index_meta_entry<'a>(
|
||||
|
@ -2045,7 +2107,8 @@ pub mod tests {
|
|||
|
||||
#[test]
|
||||
fn test_insert_data_shreds_reverse() {
|
||||
let num_entries = 10;
|
||||
let num_shreds = 10;
|
||||
let num_entries = max_ticks_per_n_shreds(num_shreds);
|
||||
let (mut shreds, entries) = make_slot_entries(0, 0, num_entries);
|
||||
let num_shreds = shreds.len() as u64;
|
||||
|
||||
|
@ -2938,7 +3001,7 @@ pub mod tests {
|
|||
shred
|
||||
.iter_mut()
|
||||
.enumerate()
|
||||
.for_each(|(i, shred)| shred.set_index(slot as u32 + i as u32));
|
||||
.for_each(|(_, shred)| shred.set_index(0));
|
||||
shreds.extend(shred);
|
||||
entries.extend(entry);
|
||||
}
|
||||
|
@ -2956,16 +3019,16 @@ pub mod tests {
|
|||
|
||||
for i in 0..num_entries - 1 {
|
||||
assert_eq!(
|
||||
blocktree.get_slot_entries(i, i, None).unwrap()[0],
|
||||
blocktree.get_slot_entries(i, 0, None).unwrap()[0],
|
||||
entries[i as usize]
|
||||
);
|
||||
|
||||
let meta = blocktree.meta(i).unwrap().unwrap();
|
||||
assert_eq!(meta.received, i + num_shreds_per_slot);
|
||||
assert_eq!(meta.last_index, i + num_shreds_per_slot - 1);
|
||||
assert_eq!(meta.received, 1);
|
||||
assert_eq!(meta.last_index, 0);
|
||||
if i != 0 {
|
||||
assert_eq!(meta.parent_slot, i - 1);
|
||||
assert_eq!(meta.consumed, 0);
|
||||
assert_eq!(meta.consumed, 1);
|
||||
} else {
|
||||
assert_eq!(meta.parent_slot, 0);
|
||||
assert_eq!(meta.consumed, num_shreds_per_slot);
|
||||
|
@ -3567,4 +3630,102 @@ pub mod tests {
|
|||
drop(blocktree);
|
||||
Blocktree::destroy(&blocktree_path).expect("Expected successful database destruction");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_get_completed_data_ranges() {
|
||||
let completed_data_end_indexes = vec![2, 4, 9, 11];
|
||||
|
||||
// Consumed is 1, which means we're missing shred with index 1, should return empty
|
||||
let start_index = 0;
|
||||
let consumed = 1;
|
||||
assert_eq!(
|
||||
Blocktree::get_completed_data_ranges(
|
||||
start_index,
|
||||
&completed_data_end_indexes[..],
|
||||
consumed
|
||||
),
|
||||
vec![]
|
||||
);
|
||||
|
||||
let start_index = 0;
|
||||
let consumed = 3;
|
||||
assert_eq!(
|
||||
Blocktree::get_completed_data_ranges(
|
||||
start_index,
|
||||
&completed_data_end_indexes[..],
|
||||
consumed
|
||||
),
|
||||
vec![(0, 2)]
|
||||
);
|
||||
|
||||
// Test all possible ranges:
|
||||
//
|
||||
// `consumed == completed_data_end_indexes[j] + 1`, means we have all the shreds up to index
|
||||
// `completed_data_end_indexes[j] + 1`. Thus the completed data blocks is everything in the
|
||||
// range:
|
||||
// [start_index, completed_data_end_indexes[j]] ==
|
||||
// [completed_data_end_indexes[i], completed_data_end_indexes[j]],
|
||||
for i in 0..completed_data_end_indexes.len() {
|
||||
for j in i..completed_data_end_indexes.len() {
|
||||
let start_index = completed_data_end_indexes[i];
|
||||
let consumed = completed_data_end_indexes[j] + 1;
|
||||
// When start_index == completed_data_end_indexes[i], then that means
|
||||
// the shred with index == start_index is a single-shred data block,
|
||||
// so the start index is the end index for that data block.
|
||||
let mut expected = vec![(start_index, start_index)];
|
||||
expected.extend(
|
||||
completed_data_end_indexes[i..=j]
|
||||
.windows(2)
|
||||
.map(|end_indexes| (end_indexes[0] + 1, end_indexes[1])),
|
||||
);
|
||||
|
||||
assert_eq!(
|
||||
Blocktree::get_completed_data_ranges(
|
||||
start_index,
|
||||
&completed_data_end_indexes[..],
|
||||
consumed
|
||||
),
|
||||
expected
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_get_slot_entries_with_shred_count_corruption() {
|
||||
let blocktree_path =
|
||||
get_tmp_ledger_path("test_get_slot_entries_with_shred_count_corruption");
|
||||
{
|
||||
let blocktree = Blocktree::open(&blocktree_path).unwrap();
|
||||
let num_ticks = 8;
|
||||
let entries = create_ticks(num_ticks, Hash::default());
|
||||
let slot = 1;
|
||||
let shreds = entries_to_test_shreds(entries, slot, 0, false);
|
||||
let next_shred_index = shreds.len();
|
||||
blocktree
|
||||
.insert_shreds(shreds, None)
|
||||
.expect("Expected successful write of shreds");
|
||||
assert_eq!(
|
||||
blocktree.get_slot_entries(slot, 0, None).unwrap().len() as u64,
|
||||
num_ticks
|
||||
);
|
||||
|
||||
// Insert an empty shred that won't deshred into entries
|
||||
let shreds = vec![Shred::new_from_data(
|
||||
slot,
|
||||
next_shred_index as u32,
|
||||
1,
|
||||
None,
|
||||
true,
|
||||
true,
|
||||
)];
|
||||
|
||||
// With the corruption, nothing should be returned, even though an
|
||||
// earlier data block was valid
|
||||
blocktree
|
||||
.insert_shreds(shreds, None)
|
||||
.expect("Expected successful write of shreds");
|
||||
assert!(blocktree.get_slot_entries(slot, 0, None).is_err());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
|
@ -27,6 +27,8 @@ pub struct SlotMeta {
|
|||
// True if this slot is full (consumed == last_index + 1) and if every
|
||||
// slot that is a parent of this slot is also connected.
|
||||
pub is_connected: bool,
|
||||
// List of start indexes for completed data slots
|
||||
pub completed_data_indexes: Vec<u32>,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug, Default, Deserialize, Serialize, Eq, PartialEq)]
|
||||
|
@ -227,6 +229,7 @@ impl SlotMeta {
|
|||
next_slots: vec![],
|
||||
is_connected: slot == 0,
|
||||
last_index: std::u64::MAX,
|
||||
completed_data_indexes: vec![],
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue