From 66b781eec3f3b3917830862dcb7c1bb7a8f86ed6 Mon Sep 17 00:00:00 2001 From: Michael Vines Date: Thu, 4 Mar 2021 21:18:08 -0800 Subject: [PATCH] Add 'unknown' health check state --- core/src/rpc.rs | 10 +++ core/src/rpc_health.rs | 94 +++++++++++----------- core/src/rpc_service.rs | 19 +++-- docs/src/developing/clients/jsonrpc-api.md | 7 +- validator/src/dashboard.rs | 2 +- 5 files changed, 74 insertions(+), 58 deletions(-) diff --git a/core/src/rpc.rs b/core/src/rpc.rs index 3bd9d7a5e9..13b0442e50 100644 --- a/core/src/rpc.rs +++ b/core/src/rpc.rs @@ -1866,6 +1866,10 @@ pub mod rpc_minimal { fn get_health(&self, meta: Self::Metadata) -> Result { match meta.health.check() { RpcHealthStatus::Ok => Ok("ok".to_string()), + RpcHealthStatus::Unknown => Err(RpcCustomError::NodeUnhealthy { + num_slots_behind: None, + } + .into()), RpcHealthStatus::Behind { num_slots } => Err(RpcCustomError::NodeUnhealthy { num_slots_behind: Some(num_slots), } @@ -2700,6 +2704,12 @@ pub mod rpc_full { match meta.health.check() { RpcHealthStatus::Ok => (), + RpcHealthStatus::Unknown => { + return Err(RpcCustomError::NodeUnhealthy { + num_slots_behind: None, + } + .into()); + } RpcHealthStatus::Behind { num_slots } => { return Err(RpcCustomError::NodeUnhealthy { num_slots_behind: Some(num_slots), diff --git a/core/src/rpc_health.rs b/core/src/rpc_health.rs index 51e35fca16..337cfd14e1 100644 --- a/core/src/rpc_health.rs +++ b/core/src/rpc_health.rs @@ -8,10 +8,11 @@ use { }, }; -#[derive(PartialEq, Clone, Copy)] +#[derive(PartialEq, Clone, Copy, Debug)] pub enum RpcHealthStatus { Ok, Behind { num_slots: Slot }, // Validator is behind its trusted validators + Unknown, } pub struct RpcHealth { @@ -51,52 +52,53 @@ impl RpcHealth { if self.override_health_check.load(Ordering::Relaxed) { RpcHealthStatus::Ok } else if let Some(trusted_validators) = &self.trusted_validators { - let (latest_account_hash_slot, latest_trusted_validator_account_hash_slot) = { + match ( + self.cluster_info + 
.get_accounts_hash_for_node(&self.cluster_info.id(), |hashes| { + hashes + .iter() + .max_by(|a, b| a.0.cmp(&b.0)) + .map(|slot_hash| slot_hash.0) + }) + .flatten(), + trusted_validators + .iter() + .filter_map(|trusted_validator| { + self.cluster_info + .get_accounts_hash_for_node(&trusted_validator, |hashes| { + hashes + .iter() + .max_by(|a, b| a.0.cmp(&b.0)) + .map(|slot_hash| slot_hash.0) + }) + .flatten() + }) + .max(), + ) { ( - self.cluster_info - .get_accounts_hash_for_node(&self.cluster_info.id(), |hashes| { - hashes - .iter() - .max_by(|a, b| a.0.cmp(&b.0)) - .map(|slot_hash| slot_hash.0) - }) - .flatten() - .unwrap_or(0), - trusted_validators - .iter() - .map(|trusted_validator| { - self.cluster_info - .get_accounts_hash_for_node(&trusted_validator, |hashes| { - hashes - .iter() - .max_by(|a, b| a.0.cmp(&b.0)) - .map(|slot_hash| slot_hash.0) - }) - .flatten() - .unwrap_or(0) - }) - .max() - .unwrap_or(0), - ) - }; - - // This validator is considered healthy if its latest account hash slot is within - // `health_check_slot_distance` of the latest trusted validator's account hash slot - if latest_account_hash_slot > 0 - && latest_trusted_validator_account_hash_slot > 0 - && latest_account_hash_slot - > latest_trusted_validator_account_hash_slot - .saturating_sub(self.health_check_slot_distance) - { - RpcHealthStatus::Ok - } else { - let num_slots = latest_trusted_validator_account_hash_slot - .saturating_sub(latest_account_hash_slot); - warn!( - "health check: behind by {} slots: me={}, latest trusted_validator={}", - num_slots, latest_account_hash_slot, latest_trusted_validator_account_hash_slot - ); - RpcHealthStatus::Behind { num_slots } + Some(latest_account_hash_slot), + Some(latest_trusted_validator_account_hash_slot), + ) => { + // The validator is considered healthy if its latest account hash slot is within + // `health_check_slot_distance` of the latest trusted validator's account hash slot + if latest_account_hash_slot + > 
latest_trusted_validator_account_hash_slot + .saturating_sub(self.health_check_slot_distance) + { + RpcHealthStatus::Ok + } else { + let num_slots = latest_trusted_validator_account_hash_slot + .saturating_sub(latest_account_hash_slot); + warn!( + "health check: behind by {} slots: me={}, latest trusted_validator={}", + num_slots, + latest_account_hash_slot, + latest_trusted_validator_account_hash_slot + ); + RpcHealthStatus::Behind { num_slots } + } + } + _ => RpcHealthStatus::Unknown, } } else { // No trusted validator point of reference available, so this validator is healthy diff --git a/core/src/rpc_service.rs b/core/src/rpc_service.rs index 870127a32b..86d0302933 100644 --- a/core/src/rpc_service.rs +++ b/core/src/rpc_service.rs @@ -178,7 +178,8 @@ impl RpcRequestMiddleware { fn health_check(&self) -> &'static str { let response = match self.health.check() { RpcHealthStatus::Ok => "ok", - RpcHealthStatus::Behind { num_slots: _ } => "behind", + RpcHealthStatus::Behind { .. } => "behind", + RpcHealthStatus::Unknown => "unknown", }; info!("health check: {}", response); response @@ -696,18 +697,20 @@ mod tests { let rm = RpcRequestMiddleware::new(PathBuf::from("/"), None, create_bank_forks(), health); - // No account hashes for this node or any trusted validators == "behind" - assert_eq!(rm.health_check(), "behind"); + // No account hashes for this node or any trusted validators + assert_eq!(rm.health_check(), "unknown"); - // No account hashes for any trusted validators == "behind" + // No account hashes for any trusted validators cluster_info.push_accounts_hashes(vec![(1000, Hash::default()), (900, Hash::default())]); cluster_info.flush_push_queue(); - assert_eq!(rm.health_check(), "behind"); + assert_eq!(rm.health_check(), "unknown"); + + // Override health check override_health_check.store(true, Ordering::Relaxed); assert_eq!(rm.health_check(), "ok"); override_health_check.store(false, Ordering::Relaxed); - // This node is ahead of the trusted validators == 
"ok" + // This node is ahead of the trusted validators cluster_info .gossip .write() @@ -727,7 +730,7 @@ mod tests { .unwrap(); assert_eq!(rm.health_check(), "ok"); - // Node is slightly behind the trusted validators == "ok" + // Node is slightly behind the trusted validators cluster_info .gossip .write() @@ -743,7 +746,7 @@ mod tests { .unwrap(); assert_eq!(rm.health_check(), "ok"); - // Node is far behind the trusted validators == "behind" + // Node is far behind the trusted validators cluster_info .gossip .write() diff --git a/docs/src/developing/clients/jsonrpc-api.md b/docs/src/developing/clients/jsonrpc-api.md index 21a8224213..6dbf1f488e 100644 --- a/docs/src/developing/clients/jsonrpc-api.md +++ b/docs/src/developing/clients/jsonrpc-api.md @@ -187,11 +187,12 @@ Many methods that take a commitment parameter return an RpcResponse JSON object Although not a JSON RPC API, a `GET /health` at the RPC HTTP Endpoint provides a health-check mechanism for use by load balancers or other network infrastructure. This request will always return a HTTP 200 OK response with a body of -"ok" or "behind" based on the following conditions: +"ok", "behind" or "unknown" based on the following conditions: 1. If one or more `--trusted-validator` arguments are provided to `solana-validator`, "ok" is returned - when the node has within `HEALTH_CHECK_SLOT_DISTANCE` slots of the highest trusted validator, - otherwise "behind" is returned. + when the node is within `HEALTH_CHECK_SLOT_DISTANCE` slots of the highest + trusted validator, otherwise "behind". "unknown" is returned when slot + information from trusted validators is not yet available. 2. "ok" is always returned if no trusted validators are provided. 
## JSON RPC API Reference diff --git a/validator/src/dashboard.rs b/validator/src/dashboard.rs index 2858cb9f80..f6f9ee3d53 100644 --- a/validator/src/dashboard.rs +++ b/validator/src/dashboard.rs @@ -266,7 +266,7 @@ fn get_validator_stats( { format!("{} slots behind", num_slots_behind) } else { - "unhealthy".to_string() + "health unknown".to_string() } } };