Add validator-identity argument to support monitoring a specific validator only

This commit is contained in:
Michael Vines 2019-12-16 10:06:08 -07:00
parent 862e7a410d
commit 844dddfee0
2 changed files with 55 additions and 8 deletions

View File

@ -4,6 +4,10 @@ count is advancing, new blockhashes are available, and no validators are
delinquent. Results are reported as InfluxDB metrics, with an optional delinquent. Results are reported as InfluxDB metrics, with an optional
Slack/Discord push notification on sanity failure. Slack/Discord push notification on sanity failure.
If you only care about the health of one specific validator, use the
`--validator-identity` command-line argument to restrict failure
notifications to issues that affect only that validator.
### Metrics ### Metrics
#### `watchtower-sanity` #### `watchtower-sanity`
On every iteration this data point will be emitted indicating the overall result On every iteration this data point will be emitted indicating the overall result

View File

@ -5,7 +5,10 @@ mod notifier;
use crate::notifier::Notifier; use crate::notifier::Notifier;
use clap::{crate_description, crate_name, value_t_or_exit, App, Arg}; use clap::{crate_description, crate_name, value_t_or_exit, App, Arg};
use log::*; use log::*;
use solana_clap_utils::input_validators::is_url; use solana_clap_utils::{
input_parsers::pubkey_of,
input_validators::{is_pubkey_or_keypair, is_url},
};
use solana_client::rpc_client::RpcClient; use solana_client::rpc_client::RpcClient;
use solana_metrics::{datapoint_error, datapoint_info}; use solana_metrics::{datapoint_error, datapoint_info};
use std::{error, io, thread::sleep, time::Duration}; use std::{error, io, thread::sleep, time::Duration};
@ -31,10 +34,19 @@ fn main() -> Result<(), Box<dyn error::Error>> {
.default_value("60") .default_value("60")
.help("Wait interval seconds between checking the cluster"), .help("Wait interval seconds between checking the cluster"),
) )
.arg(
Arg::with_name("validator_identity")
.long("validator-identity")
.value_name("VALIDATOR IDENTITY PUBKEY")
.takes_value(true)
.validator(is_pubkey_or_keypair)
.help("Monitor a specific validator only instead of the entire cluster"),
)
.get_matches(); .get_matches();
let interval = Duration::from_secs(value_t_or_exit!(matches, "interval", u64)); let interval = Duration::from_secs(value_t_or_exit!(matches, "interval", u64));
let json_rpc_url = value_t_or_exit!(matches, "json_rpc_url", String); let json_rpc_url = value_t_or_exit!(matches, "json_rpc_url", String);
let validator_identity = pubkey_of(&matches, "validator_identity").map(|i| i.to_string());
solana_logger::setup_with_filter("solana=info"); solana_logger::setup_with_filter("solana=info");
solana_metrics::set_panic_hook("watchtower"); solana_metrics::set_panic_hook("watchtower");
@ -96,13 +108,44 @@ fn main() -> Result<(), Box<dyn error::Error>> {
"Delinquent validator count: {}", "Delinquent validator count: {}",
vote_accounts.delinquent.len() vote_accounts.delinquent.len()
); );
if vote_accounts.delinquent.is_empty() {
Ok(true) match validator_identity.as_ref() {
} else { Some(validator_identity) => {
Err(io::Error::new( if vote_accounts
io::ErrorKind::Other, .current
format!("{} delinquent validators", vote_accounts.delinquent.len()), .iter()
)) .any(|vai| vai.node_pubkey == *validator_identity)
{
Ok(true)
} else if vote_accounts
.delinquent
.iter()
.any(|vai| vai.node_pubkey == *validator_identity)
{
Err(io::Error::new(
io::ErrorKind::Other,
format!("Validator {} is delinquent", validator_identity),
))
} else {
Err(io::Error::new(
io::ErrorKind::Other,
format!("Validator {} is missing", validator_identity),
))
}
}
None => {
if vote_accounts.delinquent.is_empty() {
Ok(true)
} else {
Err(io::Error::new(
io::ErrorKind::Other,
format!(
"{} delinquent validators",
vote_accounts.delinquent.len()
),
))
}
}
} }
}) })
.unwrap_or_else(|err| { .unwrap_or_else(|err| {