Refactor blockstore recovery code (#10008)

This commit is contained in:
sakridge 2020-05-13 10:09:38 -07:00 committed by GitHub
parent 1e80044e93
commit 9575afc8fa
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 174 additions and 86 deletions

View File

// NOTE(review): This span is a side-by-side git diff rendered with both
// columns fused onto each line — the left half of a fused line is the
// pre-refactor code (old `try_shred_recovery`), the right half is the
// post-refactor code (new `get_recovery_data_shreds`). Lines appearing
// once are pure deletions or pure additions. Not compilable Rust as-is.
@ -547,49 +547,18 @@ impl Blockstore {
Ok(slot_iterator.map(move |(rooted_slot, _)| rooted_slot)) Ok(slot_iterator.map(move |(rooted_slot, _)| rooted_slot))
} }
// Old signature (left) vs. new extracted helper (right): the refactor
// splits the data-shred gathering out of the big recovery loop.
fn try_shred_recovery( fn get_recovery_data_shreds(
db: &Database, index: &mut Index,
erasure_metas: &HashMap<(u64, u64), ErasureMeta>, set_index: u64,
index_working_set: &mut HashMap<u64, IndexMetaWorkingSetEntry>, slot: Slot,
erasure_meta: &ErasureMeta,
available_shreds: &mut Vec<Shred>,
prev_inserted_datas: &mut HashMap<(u64, u64), Shred>, prev_inserted_datas: &mut HashMap<(u64, u64), Shred>,
prev_inserted_codes: &mut HashMap<(u64, u64), Shred>, data_cf: &LedgerColumn<cf::ShredData>,
) -> Vec<Shred> { ) {
let data_cf = db.column::<cf::ShredData>();
let code_cf = db.column::<cf::ShredCode>();
let mut recovered_data_shreds = vec![];
// Recovery rules:
// 1. Only try recovery around indexes for which new data or coding shreds are received
// 2. For new data shreds, check if an erasure set exists. If not, don't try recovery
// 3. Before trying recovery, check if enough number of shreds have been received
// 3a. Enough number of shreds = (#data + #coding shreds) > erasure.num_data
for (&(slot, set_index), erasure_meta) in erasure_metas.iter() {
// Old code: metrics were emitted through this closure, which the
// refactor replaces with the associated fn `submit_metrics` below.
let submit_metrics = |attempted: bool, status: String, recovered: usize| {
datapoint_debug!(
"blockstore-erasure",
("slot", slot as i64, i64),
("start_index", set_index as i64, i64),
(
"end_index",
(erasure_meta.set_index + erasure_meta.config.num_data() as u64) as i64,
i64
),
("recovery_attempted", attempted, bool),
("recovery_status", status, String),
("recovered", recovered as i64, i64),
);
};
let index_meta_entry = index_working_set.get_mut(&slot).expect("Index");
let index = &mut index_meta_entry.index;
match erasure_meta.status(&index) {
ErasureMetaStatus::CanRecover => {
// Find shreds for this erasure set and try recovery
let slot = index.slot;
let mut available_shreds = vec![];
// Fused columns below: both old and new versions walk the data-shred
// index range, preferring the just-inserted shred map over a DB read.
(set_index..set_index + erasure_meta.config.num_data() as u64).for_each(|i| { (set_index..set_index + erasure_meta.config.num_data() as u64).for_each(|i| {
if index.data().is_present(i) { if index.data().is_present(i) {
if let Some(shred) = if let Some(shred) = prev_inserted_datas.remove(&(slot, i)).or_else(|| {
prev_inserted_datas.remove(&(slot, i)).or_else(|| {
let some_data = data_cf let some_data = data_cf
.get_bytes((slot, i)) .get_bytes((slot, i))
.expect("Database failure, could not fetch data shred"); .expect("Database failure, could not fetch data shred");
// NOTE(review): diff hunk boundary — the lines between the
// `.expect(...)` above and the `warn!` fallback (presumably the
// Shred deserialization on the Some path) are elided by the diff;
// the closure body is incomplete in this view — do not reconstruct.
@ -599,15 +568,23 @@ impl Blockstore {
warn!("Data shred deleted while reading for recovery"); warn!("Data shred deleted while reading for recovery");
None None
} }
}) }) {
{
available_shreds.push(shred); available_shreds.push(shred);
} }
} }
}); });
}
// New helper extracted by the refactor: gathers coding shreds for one
// erasure set into `available_shreds`, preferring the just-inserted map
// (`prev_inserted_codes`) over a column-family read.
// NOTE(review): rendered from a fused side-by-side diff; a hunk boundary
// below elides part of this function's body — incomplete in this view.
fn get_recovery_coding_shreds(
index: &mut Index,
slot: Slot,
erasure_meta: &ErasureMeta,
available_shreds: &mut Vec<Shred>,
prev_inserted_codes: &mut HashMap<(u64, u64), Shred>,
code_cf: &LedgerColumn<cf::ShredCode>,
) {
// Coding shreds occupy [first_coding_index, first_coding_index + num_coding).
(erasure_meta.first_coding_index (erasure_meta.first_coding_index
..erasure_meta.first_coding_index ..erasure_meta.first_coding_index + erasure_meta.config.num_coding() as u64)
+ erasure_meta.config.num_coding() as u64)
.for_each(|i| {
if let Some(shred) = prev_inserted_codes
.remove(&(slot, i))
// NOTE(review): diff hunk boundary — the `or_else` body (the
// code_cf fallback read, mirroring the data-shred path above)
// is elided here; do not assume the closure is complete.
@ -638,6 +615,41 @@ impl Blockstore {
available_shreds.push(shred); available_shreds.push(shred);
} }
}); });
}
// New helper extracted by the refactor: collects the available data and
// coding shreds for one erasure set, runs Shredder::try_recovery, appends
// any recovered data shreds to `recovered_data_shreds`, and reports the
// outcome via `submit_metrics`.
// NOTE(review): rendered from a fused side-by-side diff; a hunk boundary
// below elides some of the `try_recovery` arguments — incomplete in view.
fn recover_shreds(
index: &mut Index,
set_index: u64,
erasure_meta: &ErasureMeta,
prev_inserted_datas: &mut HashMap<(u64, u64), Shred>,
prev_inserted_codes: &mut HashMap<(u64, u64), Shred>,
recovered_data_shreds: &mut Vec<Shred>,
data_cf: &LedgerColumn<cf::ShredData>,
code_cf: &LedgerColumn<cf::ShredCode>,
) {
// Find shreds for this erasure set and try recovery
let slot = index.slot;
let mut available_shreds = vec![];
Self::get_recovery_data_shreds(
index,
set_index,
slot,
erasure_meta,
&mut available_shreds,
prev_inserted_datas,
data_cf,
);
Self::get_recovery_coding_shreds(
index,
slot,
erasure_meta,
&mut available_shreds,
prev_inserted_codes,
code_cf,
);
// Fused columns: old code called the local `submit_metrics` closure,
// new code calls the associated fn `Self::submit_metrics`.
if let Ok(mut result) = Shredder::try_recovery( if let Ok(mut result) = Shredder::try_recovery(
available_shreds, available_shreds,
erasure_meta.config.num_data(), erasure_meta.config.num_data(),
// NOTE(review): diff hunk boundary — the argument lines between
// num_data() and first_coding_index (likely num_coding() and the
// data start index) are elided by the diff.
@ -646,12 +658,74 @@ impl Blockstore {
erasure_meta.first_coding_index as usize, erasure_meta.first_coding_index as usize,
slot, slot,
) { ) {
submit_metrics(true, "complete".into(), result.len()); Self::submit_metrics(
slot,
set_index,
erasure_meta,
true,
"complete".into(),
result.len(),
);
recovered_data_shreds.append(&mut result); recovered_data_shreds.append(&mut result);
} else { } else {
submit_metrics(true, "incomplete".into(), 0); Self::submit_metrics(slot, set_index, erasure_meta, true, "incomplete".into(), 0);
} }
} }
/// Report the outcome of one erasure-recovery attempt for a shred set as a
/// `blockstore-erasure` debug datapoint.
///
/// `set_index` is the first data-shred index of the set (reported as
/// `start_index`); the reported `end_index` is derived from the erasure
/// config's data-shred count. `status` is a free-form outcome string
/// ("complete", "incomplete", "still need: N").
fn submit_metrics(
    slot: Slot,
    set_index: u64,
    erasure_meta: &ErasureMeta,
    attempted: bool,
    status: String,
    recovered: usize,
) {
    // NOTE(review): end_index is computed from erasure_meta.set_index rather
    // than the set_index parameter — preserved exactly as in the original.
    let end_index =
        (erasure_meta.set_index + erasure_meta.config.num_data() as u64) as i64;
    datapoint_debug!(
        "blockstore-erasure",
        ("slot", slot as i64, i64),
        ("start_index", set_index as i64, i64),
        ("end_index", end_index, i64),
        ("recovery_attempted", attempted, bool),
        ("recovery_status", status, String),
        ("recovered", recovered as i64, i64),
    );
}
// Post-refactor `try_shred_recovery`: per-erasure-set recovery driver.
// The `CanRecover` arm is now a single call to `Self::recover_shreds`.
// NOTE(review): rendered from a fused side-by-side diff; the `DataFull`
// arm's loop body is elided at a hunk boundary, and the function's closing
// lines (loop end and return of `recovered_data_shreds`) run past the end
// of this view — incomplete as shown.
fn try_shred_recovery(
db: &Database,
erasure_metas: &HashMap<(u64, u64), ErasureMeta>,
index_working_set: &mut HashMap<u64, IndexMetaWorkingSetEntry>,
prev_inserted_datas: &mut HashMap<(u64, u64), Shred>,
prev_inserted_codes: &mut HashMap<(u64, u64), Shred>,
) -> Vec<Shred> {
let data_cf = db.column::<cf::ShredData>();
let code_cf = db.column::<cf::ShredCode>();
let mut recovered_data_shreds = vec![];
// Recovery rules:
// 1. Only try recovery around indexes for which new data or coding shreds are received
// 2. For new data shreds, check if an erasure set exists. If not, don't try recovery
// 3. Before trying recovery, check if enough number of shreds have been received
// 3a. Enough number of shreds = (#data + #coding shreds) > erasure.num_data
for (&(slot, set_index), erasure_meta) in erasure_metas.iter() {
let index_meta_entry = index_working_set.get_mut(&slot).expect("Index");
let index = &mut index_meta_entry.index;
match erasure_meta.status(&index) {
ErasureMetaStatus::CanRecover => {
Self::recover_shreds(
index,
set_index,
erasure_meta,
prev_inserted_datas,
prev_inserted_codes,
&mut recovered_data_shreds,
&data_cf,
&code_cf,
);
}
// Fused columns below: old left / new right. Both arms emit
// metrics; the new code routes through Self::submit_metrics.
ErasureMetaStatus::DataFull => { ErasureMetaStatus::DataFull => {
(set_index..set_index + erasure_meta.config.num_coding() as u64).for_each( (set_index..set_index + erasure_meta.config.num_coding() as u64).for_each(
|i| { |i| {
// NOTE(review): diff hunk boundary — the body of this closure
// (presumably purging now-unneeded coding shreds) is elided.
@ -665,10 +739,24 @@ impl Blockstore {
} }
}, },
); );
submit_metrics(false, "complete".into(), 0); Self::submit_metrics(
slot,
set_index,
erasure_meta,
false,
"complete".into(),
0,
);
} }
ErasureMetaStatus::StillNeed(needed) => { ErasureMetaStatus::StillNeed(needed) => {
submit_metrics(false, format!("still need: {}", needed), 0); Self::submit_metrics(
slot,
set_index,
erasure_meta,
false,
format!("still need: {}", needed),
0,
);
} }
}; };
} }