Skip gossip requests with different shred version and split lock (#10240)

This commit is contained in:
sakridge 2020-05-28 11:38:13 -07:00 committed by GitHub
parent 9227874ada
commit 3f508b37fd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 271 additions and 60 deletions

View File

@ -218,7 +218,10 @@ struct GossipStats {
process_pull_response_count: Counter,
process_pull_response_len: Counter,
process_pull_response_timeout: Counter,
process_pull_response_fail: Counter,
process_pull_response_success: Counter,
process_pull_requests: Counter,
generate_pull_responses: Counter,
process_prune: Counter,
process_push_message: Counter,
prune_received_cache: Counter,
@ -228,6 +231,11 @@ struct GossipStats {
push_message: Counter,
new_pull_requests: Counter,
mark_pull_request: Counter,
skip_pull_response_shred_version: Counter,
skip_pull_shred_version: Counter,
skip_push_message_shred_version: Counter,
push_message_count: Counter,
push_message_value_count: Counter,
}
pub struct ClusterInfo {
@ -1526,12 +1534,17 @@ impl ClusterInfo {
if contact_info.id == me.id() {
warn!("PullRequest ignored, I'm talking to myself");
inc_new_counter_debug!("cluster_info-window-request-loopback", 1);
} else {
} else if contact_info.shred_version == 0
|| contact_info.shred_version == me.my_shred_version()
|| me.my_shred_version() == 0
{
gossip_pull_data.push(PullData {
from_addr,
caller,
filter,
});
} else {
me.stats.skip_pull_shred_version.add_relaxed(1);
}
}
datapoint_debug!(
@ -1620,6 +1633,26 @@ impl ClusterInfo {
}
}
fn update_data_budget(&self, stakes: &HashMap<Pubkey, u64>) {
let mut w_outbound_budget = self.outbound_budget.write().unwrap();
let now = timestamp();
const INTERVAL_MS: u64 = 100;
// allow 50kBps per staked validator, epoch slots + votes ~= 1.5kB/slot ~= 4kB/s
const BYTES_PER_INTERVAL: usize = 5000;
const MAX_BUDGET_MULTIPLE: usize = 5; // allow budget build-up to 5x the interval default
if now - w_outbound_budget.last_timestamp_ms > INTERVAL_MS {
let len = std::cmp::max(stakes.len(), 2);
w_outbound_budget.bytes += len * BYTES_PER_INTERVAL;
w_outbound_budget.bytes = std::cmp::min(
w_outbound_budget.bytes,
MAX_BUDGET_MULTIPLE * len * BYTES_PER_INTERVAL,
);
w_outbound_budget.last_timestamp_ms = now;
}
}
// Pull requests take an incoming bloom filter of contained entries from a node
// and tries to send back to them the values it detects are missing.
fn handle_pull_requests(
@ -1632,33 +1665,19 @@ impl ClusterInfo {
let mut caller_and_filters = vec![];
let mut addrs = vec![];
let mut time = Measure::start("handle_pull_requests");
{
let mut w_outbound_budget = me.outbound_budget.write().unwrap();
let now = timestamp();
const INTERVAL_MS: u64 = 100;
// allow 50kBps per staked validator, epoch slots + votes ~= 1.5kB/slot ~= 4kB/s
const BYTES_PER_INTERVAL: usize = 5000;
const MAX_BUDGET_MULTIPLE: usize = 5; // allow budget build-up to 5x the interval default
if now - w_outbound_budget.last_timestamp_ms > INTERVAL_MS {
let len = std::cmp::max(stakes.len(), 2);
w_outbound_budget.bytes += len * BYTES_PER_INTERVAL;
w_outbound_budget.bytes = std::cmp::min(
w_outbound_budget.bytes,
MAX_BUDGET_MULTIPLE * len * BYTES_PER_INTERVAL,
);
w_outbound_budget.last_timestamp_ms = now;
}
}
me.update_data_budget(stakes);
for pull_data in requests {
caller_and_filters.push((pull_data.caller, pull_data.filter));
addrs.push(pull_data.from_addr);
}
let now = timestamp();
let self_id = me.id();
let pull_responses = me
.time_gossip_write_lock("process_pull_reqs", &me.stats.process_pull_requests)
.time_gossip_read_lock("generate_pull_responses", &me.stats.generate_pull_responses)
.generate_pull_responses(&caller_and_filters);
me.time_gossip_write_lock("process_pull_reqs", &me.stats.process_pull_requests)
.process_pull_requests(caller_and_filters, now);
// Filter bad to addresses
@ -1755,37 +1774,94 @@ impl ClusterInfo {
Some(packets)
}
// Returns (failed, timeout, success)
fn handle_pull_response(
me: &Self,
from: &Pubkey,
data: Vec<CrdsValue>,
mut crds_values: Vec<CrdsValue>,
timeouts: &HashMap<Pubkey, u64>,
) {
let len = data.len();
) -> (usize, usize, usize) {
let len = crds_values.len();
trace!("PullResponse me: {} from: {} len={}", me.id, from, len);
let (_fail, timeout_count) = me
if let Some(shred_version) = me.lookup_contact_info(from, |ci| ci.shred_version) {
Self::filter_by_shred_version(
from,
&mut crds_values,
shred_version,
me.my_shred_version(),
);
}
let filtered_len = crds_values.len();
let (fail, timeout_count, success) = me
.time_gossip_write_lock("process_pull", &me.stats.process_pull_response)
.process_pull_response(from, timeouts, data, timestamp());
.process_pull_response(from, timeouts, crds_values, timestamp());
me.stats
.skip_pull_response_shred_version
.add_relaxed((len - filtered_len) as u64);
me.stats.process_pull_response_count.add_relaxed(1);
me.stats.process_pull_response_len.add_relaxed(len as u64);
me.stats
.process_pull_response_len
.add_relaxed(filtered_len as u64);
me.stats
.process_pull_response_timeout
.add_relaxed(timeout_count as u64);
me.stats.process_pull_response_fail.add_relaxed(fail as u64);
me.stats
.process_pull_response_success
.add_relaxed(success as u64);
(fail, timeout_count, success)
}
fn filter_by_shred_version(
from: &Pubkey,
crds_values: &mut Vec<CrdsValue>,
shred_version: u16,
my_shred_version: u16,
) {
if my_shred_version != 0 && shred_version != 0 && shred_version != my_shred_version {
// Allow someone to update their own ContactInfo so they
// can change shred versions if needed.
crds_values.retain(|crds_value| match &crds_value.data {
CrdsData::ContactInfo(contact_info) => contact_info.id == *from,
_ => false,
});
}
}
fn handle_push_message(
me: &Self,
recycler: &PacketsRecycler,
from: &Pubkey,
data: Vec<CrdsValue>,
mut crds_values: Vec<CrdsValue>,
stakes: &HashMap<Pubkey, u64>,
) -> Option<Packets> {
let self_id = me.id();
inc_new_counter_debug!("cluster_info-push_message", 1);
me.stats.push_message_count.add_relaxed(1);
let len = crds_values.len();
if let Some(shred_version) = me.lookup_contact_info(from, |ci| ci.shred_version) {
Self::filter_by_shred_version(
from,
&mut crds_values,
shred_version,
me.my_shred_version(),
);
}
let filtered_len = crds_values.len();
me.stats
.push_message_value_count
.add_relaxed(filtered_len as u64);
me.stats
.skip_push_message_shred_version
.add_relaxed((len - filtered_len) as u64);
let updated: Vec<_> = me
.time_gossip_write_lock("process_push", &me.stats.process_push_message)
.process_push_message(from, data, timestamp());
.process_push_message(from, crds_values, timestamp());
let updated_labels: Vec<_> = updated.into_iter().map(|u| u.value.label()).collect();
let prunes_map: HashMap<Pubkey, HashSet<Pubkey>> = me
@ -1945,6 +2021,11 @@ impl ClusterInfo {
self.stats.process_pull_requests.clear(),
i64
),
(
"generate_pull_responses",
self.stats.generate_pull_responses.clear(),
i64
),
("process_prune", self.stats.process_prune.clear(), i64),
(
"process_push_message",
@ -1974,6 +2055,34 @@ impl ClusterInfo {
i64
),
);
datapoint_info!(
"cluster_info_shred_version_skip",
(
"skip_push_message_shred_version",
self.stats.skip_push_message_shred_version.clear(),
i64
),
(
"skip_pull_response_shred_version",
self.stats.skip_pull_response_shred_version.clear(),
i64
),
(
"skip_pull_shred_version",
self.stats.skip_pull_shred_version.clear(),
i64
),
(
"push_message_count",
self.stats.push_message_count.clear(),
i64
),
(
"push_message_value_count",
self.stats.push_message_value_count.clear(),
i64
),
);
*last_print = Instant::now();
}
@ -2289,6 +2398,91 @@ mod tests {
assert!(ClusterInfo::is_spy_node(&node));
}
#[test]
fn test_handle_pull() {
let node = Node::new_localhost();
let cluster_info = Arc::new(ClusterInfo::new_with_invalid_keypair(node.info));
let entrypoint_pubkey = Pubkey::new_rand();
let data = test_crds_values(entrypoint_pubkey);
let timeouts = HashMap::new();
assert_eq!(
(0, 0, 1),
ClusterInfo::handle_pull_response(
&cluster_info,
&entrypoint_pubkey,
data.clone(),
&timeouts
)
);
let entrypoint_pubkey2 = Pubkey::new_rand();
assert_eq!(
(1, 0, 0),
ClusterInfo::handle_pull_response(&cluster_info, &entrypoint_pubkey2, data, &timeouts)
);
}
fn test_crds_values(pubkey: Pubkey) -> Vec<CrdsValue> {
let entrypoint = ContactInfo::new_localhost(&pubkey, timestamp());
let entrypoint_crdsvalue =
CrdsValue::new_unsigned(CrdsData::ContactInfo(entrypoint.clone()));
vec![entrypoint_crdsvalue]
}
#[test]
fn test_filter_shred_version() {
let from = Pubkey::new_rand();
let my_shred_version = 1;
let other_shred_version = 1;
// Allow same shred_version
let mut values = test_crds_values(from);
ClusterInfo::filter_by_shred_version(
&from,
&mut values,
other_shred_version,
my_shred_version,
);
assert_eq!(values.len(), 1);
// Allow shred_version=0.
let other_shred_version = 0;
ClusterInfo::filter_by_shred_version(
&from,
&mut values,
other_shred_version,
my_shred_version,
);
assert_eq!(values.len(), 1);
// Change to sender's ContactInfo version, allow that.
let other_shred_version = 2;
ClusterInfo::filter_by_shred_version(
&from,
&mut values,
other_shred_version,
my_shred_version,
);
assert_eq!(values.len(), 1);
let snapshot_hash_data = CrdsValue::new_unsigned(CrdsData::SnapshotHashes(SnapshotHash {
from: Pubkey::new_rand(),
hashes: vec![],
wallclock: 0,
}));
values.push(snapshot_hash_data);
// Change to sender's ContactInfo version, allow that.
let other_shred_version = 2;
ClusterInfo::filter_by_shred_version(
&from,
&mut values,
other_shred_version,
my_shred_version,
);
assert_eq!(values.len(), 1);
}
#[test]
fn test_cluster_spy_gossip() {
//check that gossip doesn't try to push to invalid addresses

View File

@ -158,14 +158,18 @@ impl CrdsGossip {
self.pull.mark_pull_request_creation_time(from, now)
}
/// process a pull request and create a response
pub fn process_pull_requests(
&mut self,
filters: Vec<(CrdsValue, CrdsFilter)>,
now: u64,
) -> Vec<Vec<CrdsValue>> {
pub fn process_pull_requests(&mut self, filters: Vec<(CrdsValue, CrdsFilter)>, now: u64) {
self.pull
.process_pull_requests(&mut self.crds, filters, now)
.process_pull_requests(&mut self.crds, filters, now);
}
pub fn generate_pull_responses(
&self,
filters: &[(CrdsValue, CrdsFilter)],
) -> Vec<Vec<CrdsValue>> {
self.pull.generate_pull_responses(&self.crds, filters)
}
/// process a pull response
pub fn process_pull_response(
&mut self,
@ -173,7 +177,7 @@ impl CrdsGossip {
timeouts: &HashMap<Pubkey, u64>,
response: Vec<CrdsValue>,
now: u64,
) -> (usize, usize) {
) -> (usize, usize, usize) {
self.pull
.process_pull_response(&mut self.crds, from, timeouts, response, now)
}

View File

@ -204,14 +204,13 @@ impl CrdsGossipPull {
self.purged_values.push_back((hash, timestamp))
}
/// process a pull request and create a response
/// process a pull request
pub fn process_pull_requests(
&mut self,
crds: &mut Crds,
requests: Vec<(CrdsValue, CrdsFilter)>,
now: u64,
) -> Vec<Vec<CrdsValue>> {
let rv = self.filter_crds_values(crds, &requests);
) {
requests.into_iter().for_each(|(caller, _)| {
let key = caller.label().pubkey();
let old = crds.insert(caller, now);
@ -221,8 +220,17 @@ impl CrdsGossipPull {
}
crds.update_record_timestamp(&key, now);
});
rv
}
/// Create gossip responses to pull requests
pub fn generate_pull_responses(
&self,
crds: &Crds,
requests: &[(CrdsValue, CrdsFilter)],
) -> Vec<Vec<CrdsValue>> {
self.filter_crds_values(crds, requests)
}
/// process a pull response
pub fn process_pull_response(
&mut self,
@ -231,9 +239,10 @@ impl CrdsGossipPull {
timeouts: &HashMap<Pubkey, u64>,
response: Vec<CrdsValue>,
now: u64,
) -> (usize, usize) {
) -> (usize, usize, usize) {
let mut failed = 0;
let mut timeout_count = 0;
let mut success = 0;
for r in response {
let owner = r.label().pubkey();
// Check if the crds value is older than the msg_timeout
@ -274,7 +283,11 @@ impl CrdsGossipPull {
}
}
let old = crds.insert(r, now);
failed += old.is_err() as usize;
if old.is_err() {
failed += 1;
} else {
success += 1;
}
old.ok().map(|opt| {
crds.update_record_timestamp(&owner, now);
opt.map(|val| {
@ -284,7 +297,7 @@ impl CrdsGossipPull {
});
}
crds.update_record_timestamp(from, now);
(failed, timeout_count)
(failed, timeout_count, success)
}
// build a set of filters of the current crds table
// num_filters - used to increase the likelyhood of a value in crds being added to some filter
@ -573,8 +586,9 @@ mod test {
let mut dest_crds = Crds::default();
let mut dest = CrdsGossipPull::default();
let (_, filters, caller) = req.unwrap();
let filters = filters.into_iter().map(|f| (caller.clone(), f)).collect();
let rsp = dest.process_pull_requests(&mut dest_crds, filters, 1);
let filters: Vec<_> = filters.into_iter().map(|f| (caller.clone(), f)).collect();
let rsp = dest.generate_pull_responses(&dest_crds, &filters);
dest.process_pull_requests(&mut dest_crds, filters, 1);
assert!(rsp.iter().all(|rsp| rsp.is_empty()));
assert!(dest_crds.lookup(&caller.label()).is_some());
assert_eq!(
@ -643,8 +657,9 @@ mod test {
PACKET_DATA_SIZE,
);
let (_, filters, caller) = req.unwrap();
let filters = filters.into_iter().map(|f| (caller.clone(), f)).collect();
let mut rsp = dest.process_pull_requests(&mut dest_crds, filters, 0);
let filters: Vec<_> = filters.into_iter().map(|f| (caller.clone(), f)).collect();
let mut rsp = dest.generate_pull_responses(&dest_crds, &filters);
dest.process_pull_requests(&mut dest_crds, filters, 0);
// if there is a false positive this is empty
// prob should be around 0.1 per iteration
if rsp.is_empty() {

View File

@ -426,23 +426,21 @@ fn network_run_pull(
.map(|f| f.filter.bits.len() as usize / 8)
.sum::<usize>();
bytes += serialized_size(&caller_info).unwrap() as usize;
let filters = filters
let filters: Vec<_> = filters
.into_iter()
.map(|f| (caller_info.clone(), f))
.collect();
let rsp = network
let rsp: Vec<_> = network
.get(&to)
.map(|node| {
let mut rsp = vec![];
rsp.append(
&mut node
.lock()
.unwrap()
.process_pull_requests(filters, now)
.into_iter()
.flatten()
.collect(),
);
let rsp = node
.lock()
.unwrap()
.generate_pull_responses(&filters)
.into_iter()
.flatten()
.collect();
node.lock().unwrap().process_pull_requests(filters, now);
rsp
})
.unwrap();