Enforce a 12MB limit on outbound repair (#26493)

carllin authored 2022-07-24 20:44:22 -05:00 (committed by GitHub)
parent a0e160b5aa
commit f6d5b253fb
2 changed files with 51 additions and 16 deletions
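
This change removes the adaptive `DynamicPacketToProcessThreshold` from the serve-repair listen path and replaces it with two simpler controls: inbound request batches are capped at a fixed `MAX_REQUESTS_PER_ITERATION` of 1024 per pass, and outbound responses are charged against a `DataBudget` that is refilled to at most 12 MB once per second, with responses that exceed the remaining budget dropped and counted in new `ServeRepairStats` fields. The banking-stage hunk rewrites its existing `10_000 * 1200` forwarding limit as the equivalent literal `12_000_000` with a clarifying comment.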

core/src/banking_stage.rs (file path inferred from the `impl BankingStage` hunk below)

@@ -579,7 +579,8 @@ impl BankingStage {
         };
         const INTERVAL_MS: u64 = 100;
-        const MAX_BYTES_PER_SECOND: usize = 10_000 * 1200;
+        // 12 MB outbound limit per second
+        const MAX_BYTES_PER_SECOND: usize = 12_000_000;
         const MAX_BYTES_PER_INTERVAL: usize = MAX_BYTES_PER_SECOND * INTERVAL_MS as usize / 1000;
         const MAX_BYTES_BUDGET: usize = MAX_BYTES_PER_INTERVAL * 5;
         data_budget.update(INTERVAL_MS, |bytes| {
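
The old spelling `10_000 * 1200` already equals 12,000,000, so the hunk above changes readability, not the effective forwarding rate. A quick standalone check of the derived constants (not part of the commit; the names mirror the hunk):

```rust
fn main() {
    // Mirrors the constants in the banking-stage hunk above.
    const INTERVAL_MS: u64 = 100;
    const MAX_BYTES_PER_SECOND: usize = 12_000_000;
    const MAX_BYTES_PER_INTERVAL: usize = MAX_BYTES_PER_SECOND * INTERVAL_MS as usize / 1000;
    const MAX_BYTES_BUDGET: usize = MAX_BYTES_PER_INTERVAL * 5;

    assert_eq!(MAX_BYTES_PER_SECOND, 10_000 * 1200); // same value as the old expression
    assert_eq!(MAX_BYTES_PER_INTERVAL, 1_200_000); // 1.2 MB refilled every 100 ms
    assert_eq!(MAX_BYTES_BUDGET, 6_000_000); // unused budget may bank up to 5 intervals
}
```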

core/src/serve_repair.rs (file path inferred from the `ServeRepair` code below)

@@ -2,7 +2,6 @@ use {
     crate::{
         cluster_slots::ClusterSlots,
         duplicate_repair_status::ANCESTOR_HASH_REPAIR_SAMPLE_SIZE,
-        packet_threshold::DynamicPacketToProcessThreshold,
         repair_response,
         repair_service::{OutstandingShredRepairs, RepairStats},
         request_response::RequestResponse,
@@ -25,7 +24,10 @@ use {
         shred::{Nonce, Shred, SIZE_OF_NONCE},
     },
     solana_metrics::inc_new_counter_debug,
-    solana_perf::packet::{PacketBatch, PacketBatchRecycler},
+    solana_perf::{
+        data_budget::DataBudget,
+        packet::{PacketBatch, PacketBatchRecycler},
+    },
     solana_sdk::{
         clock::Slot, hash::Hash, packet::PACKET_DATA_SIZE, pubkey::Pubkey, timing::duration_as_ms,
     },
@@ -144,7 +146,9 @@ impl RequestResponse for AncestorHashesRepairType {
 pub struct ServeRepairStats {
     pub total_requests: usize,
     pub dropped_requests: usize,
+    pub total_dropped_response_packets: usize,
     pub total_response_packets: usize,
+    pub total_response_bytes: usize,
     pub processed: usize,
     pub self_repair: usize,
     pub window_index: usize,
@@ -323,17 +327,18 @@ impl ServeRepair {
         requests_receiver: &PacketBatchReceiver,
         response_sender: &PacketBatchSender,
         stats: &mut ServeRepairStats,
-        packet_threshold: &mut DynamicPacketToProcessThreshold,
+        data_budget: &DataBudget,
     ) -> Result<()> {
         //TODO cache connections
         let timeout = Duration::new(1, 0);
         let mut reqs_v = vec![requests_receiver.recv_timeout(timeout)?];
+        const MAX_REQUESTS_PER_ITERATION: usize = 1024;
         let mut total_requests = reqs_v[0].len();
         let mut dropped_requests = 0;
         while let Ok(more) = requests_receiver.try_recv() {
             total_requests += more.len();
-            if packet_threshold.should_drop(total_requests) {
+            if total_requests > MAX_REQUESTS_PER_ITERATION {
                 dropped_requests += more.len();
             } else {
                 reqs_v.push(more);
@@ -343,11 +348,17 @@ impl ServeRepair {
         stats.dropped_requests += dropped_requests;
         stats.total_requests += total_requests;
-        let timer = Instant::now();
         for reqs in reqs_v {
-            Self::handle_packets(obj, recycler, blockstore, reqs, response_sender, stats);
+            Self::handle_packets(
+                obj,
+                recycler,
+                blockstore,
+                reqs,
+                response_sender,
+                stats,
+                data_budget,
+            );
         }
-        packet_threshold.update(total_requests, timer.elapsed());
         Ok(())
     }
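
`run_listen` now drains its channel behind a fixed cap instead of consulting the removed adaptive threshold (which also made the `timer`/`packet_threshold.update` bookkeeping unnecessary). A minimal, self-contained sketch of that drain pattern, using `std::sync::mpsc` with `Vec<()>` standing in for `PacketBatch` (illustrative names, not Solana's API):

```rust
use std::{
    sync::mpsc::{Receiver, RecvTimeoutError},
    time::Duration,
};

// Illustrative stand-in for PacketBatch; each inner element is one request.
type Batch = Vec<()>;

fn drain_with_cap(rx: &Receiver<Batch>) -> Result<Vec<Batch>, RecvTimeoutError> {
    const MAX_REQUESTS_PER_ITERATION: usize = 1024;
    // Block up to one second for the first batch, as run_listen does ...
    let mut batches = vec![rx.recv_timeout(Duration::new(1, 0))?];
    let mut total_requests = batches[0].len();
    let mut dropped_requests = 0;
    // ... then opportunistically drain whatever else is queued, dropping
    // whole batches once the fixed cap is exceeded.
    while let Ok(more) = rx.try_recv() {
        total_requests += more.len();
        if total_requests > MAX_REQUESTS_PER_ITERATION {
            dropped_requests += more.len();
        } else {
            batches.push(more);
        }
    }
    // The real code records these counts in ServeRepairStats.
    let _ = (total_requests, dropped_requests);
    Ok(batches)
}
```

Note that once the cap is crossed, later batches are counted but never processed, and the first batch is always kept even if it alone exceeds 1024 requests.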
@@ -365,7 +376,13 @@ impl ServeRepair {
             "serve_repair-requests_received",
             ("total_requests", stats.total_requests, i64),
             ("dropped_requests", stats.dropped_requests, i64),
+            (
+                "total_dropped_response_packets",
+                stats.total_dropped_response_packets,
+                i64
+            ),
             ("total_response_packets", stats.total_response_packets, i64),
+            ("total_response_bytes", stats.total_response_bytes, i64),
             ("self_repair", stats.self_repair, i64),
             ("window_index", stats.window_index, i64),
             (
@@ -391,6 +408,10 @@ impl ServeRepair {
         response_sender: PacketBatchSender,
         exit: &Arc<AtomicBool>,
     ) -> JoinHandle<()> {
+        const INTERVAL_MS: u64 = 1000;
+        const MAX_BYTES_PER_SECOND: usize = 12_000_000;
+        const MAX_BYTES_PER_INTERVAL: usize = MAX_BYTES_PER_SECOND * INTERVAL_MS as usize / 1000;
         let exit = exit.clone();
         let recycler = PacketBatchRecycler::default();
         Builder::new()
@@ -398,7 +419,7 @@ impl ServeRepair {
             .spawn(move || {
                 let mut last_print = Instant::now();
                 let mut stats = ServeRepairStats::default();
-                let mut packet_threshold = DynamicPacketToProcessThreshold::default();
+                let data_budget = DataBudget::default();
                 loop {
                     let result = Self::run_listen(
                         &me,
@@ -407,7 +428,7 @@ impl ServeRepair {
                         &requests_receiver,
                         &response_sender,
                         &mut stats,
-                        &mut packet_threshold,
+                        &data_budget,
                     );
                     match result {
                         Err(Error::RecvTimeout(_)) | Ok(_) => {}
@@ -420,6 +441,7 @@ impl ServeRepair {
                         Self::report_reset_stats(&me, &mut stats);
                         last_print = Instant::now();
                     }
+                    data_budget.update(INTERVAL_MS, |_bytes| MAX_BYTES_PER_INTERVAL);
                 }
             })
             .unwrap()
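
`data_budget.update(INTERVAL_MS, |_bytes| MAX_BYTES_PER_INTERVAL)` runs at the bottom of every listen iteration, but `DataBudget::update` only applies its closure once the interval has elapsed, so the budget is reset to at most 12 MB once per second (`MAX_BYTES_PER_INTERVAL` equals `MAX_BYTES_PER_SECOND` here because the interval is a full second). Because the closure ignores the current balance, unused budget does not roll over. Below is a simplified single-threaded model of the take/update semantics serve_repair relies on; the real `solana_perf::data_budget::DataBudget` is thread-safe and shared, so this is an assumption-level sketch, not its implementation:

```rust
use std::time::Instant;

// Single-threaded model of the take/update behavior used above.
struct DataBudgetModel {
    bytes: usize,         // remaining spendable bytes
    last_update: Instant, // when the budget was last recomputed
}

impl DataBudgetModel {
    // Spend `size` bytes if they fit; on failure the budget is left unchanged.
    fn take(&mut self, size: usize) -> bool {
        if size <= self.bytes {
            self.bytes -= size;
            true
        } else {
            false
        }
    }

    // Recompute the budget from its current value, at most once per interval.
    fn update(&mut self, interval_ms: u64, updater: impl Fn(usize) -> usize) {
        if self.last_update.elapsed().as_millis() as u64 >= interval_ms {
            self.bytes = updater(self.bytes);
            self.last_update = Instant::now();
        }
    }
}
```

Contrast the two call sites: serve_repair resets with `|_bytes| MAX_BYTES_PER_INTERVAL`, while the banking-stage call in the first hunk passes the current balance into its closure (the body is truncated there), and its separate `MAX_BYTES_BUDGET` cap suggests an accumulate-up-to-a-burst-limit policy rather than a reset.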
@@ -432,19 +454,31 @@ impl ServeRepair {
         packet_batch: PacketBatch,
         response_sender: &PacketBatchSender,
         stats: &mut ServeRepairStats,
+        data_budget: &DataBudget,
     ) {
         // iter over the packets
-        packet_batch.iter().for_each(|packet| {
+        for (i, packet) in packet_batch.iter().enumerate() {
             if let Ok(request) = packet.deserialize_slice(..) {
                 stats.processed += 1;
                 let from_addr = packet.meta.socket_addr();
-                let rsp = Self::handle_repair(me, recycler, &from_addr, blockstore, request, stats);
-                stats.total_response_packets += rsp.as_ref().map(PacketBatch::len).unwrap_or(0);
-                if let Some(rsp) = rsp {
-                    let _ignore_disconnect = response_sender.send(rsp);
+                let rsp =
+                    match Self::handle_repair(me, recycler, &from_addr, blockstore, request, stats)
+                    {
+                        None => continue,
+                        Some(rsp) => rsp,
+                    };
+                let num_response_packets = rsp.len();
+                let num_response_bytes = rsp.iter().map(|p| p.meta.size).sum();
+                if data_budget.take(num_response_bytes) && response_sender.send(rsp).is_ok() {
+                    stats.total_response_bytes += num_response_bytes;
+                    stats.total_response_packets += num_response_packets;
+                } else {
+                    stats.dropped_requests += packet_batch.len() - i;
+                    stats.total_dropped_response_packets += num_response_packets;
+                    break;
                 }
             }
-        });
+        }
     }

     fn window_index_request_bytes(
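
When `take` fails (or the response channel is closed), the remaining `packet_batch.len() - i` requests in the batch are counted as dropped and the loop exits early, so a saturated budget sheds the tail of the batch rather than queueing it. For scale: repair responses are at most `PACKET_DATA_SIZE` bytes each (1232 in `solana_sdk`, a value quoted from the SDK constant rather than from this diff), so the 12 MB/s ceiling admits roughly 9,700 full-size response packets per second:

```rust
fn main() {
    const MAX_BYTES_PER_SECOND: usize = 12_000_000;
    // solana_sdk::packet::PACKET_DATA_SIZE: a 1280-byte MTU minus 40 (IPv6)
    // and 8 (UDP) header bytes.
    const PACKET_DATA_SIZE: usize = 1232;
    assert_eq!(MAX_BYTES_PER_SECOND / PACKET_DATA_SIZE, 9_740); // ~9.7k shreds/sec
}
```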