// solana/watchtower/src/main.rs

//! A command-line executable for monitoring the health of a cluster
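//!
//! Each interval it checks that the cluster transaction count is advancing,
//! that a new recent blockhash is available, that active stake is not
//! critically low, and that any monitored validator identities are neither
//! delinquent nor low on SOL, sending notifications on failure.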
use {
    clap::{crate_description, crate_name, value_t, value_t_or_exit, App, Arg},
    log::*,
    solana_clap_utils::{
        input_parsers::pubkeys_of,
        input_validators::{is_pubkey_or_keypair, is_url},
    },
    solana_cli_output::display::format_labeled_address,
    solana_client::{
        client_error::Result as ClientResult, rpc_client::RpcClient,
        rpc_response::RpcVoteAccountStatus,
    },
    solana_metrics::{datapoint_error, datapoint_info},
    solana_notifier::Notifier,
    solana_sdk::{hash::Hash, native_token::lamports_to_sol, pubkey::Pubkey},
    std::{
        collections::HashMap,
        error,
        str::FromStr,
        thread::sleep,
        time::{Duration, Instant},
    },
};

struct Config {
    interval: Duration,
    json_rpc_url: String,
    validator_identity_pubkeys: Vec<String>,
    no_duplicate_notifications: bool,
    monitor_active_stake: bool,
    address_labels: HashMap<String, String>,
}
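
/// Build a `Config` from command-line arguments, falling back to the CLI
/// config file for the JSON RPC URL and address labels.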
fn get_config() -> Config {
    let matches = App::new(crate_name!())
        .about(crate_description!())
        .version(solana_version::version!())
        .after_help("ADDITIONAL HELP:
        To receive a Slack, Discord and/or Telegram notification on sanity failure,
        define environment variables before running `solana-watchtower`:

        export SLACK_WEBHOOK=...
        export DISCORD_WEBHOOK=...

        Telegram requires the following two variables:

        export TELEGRAM_BOT_TOKEN=...
        export TELEGRAM_CHAT_ID=...

        To receive a Twilio SMS notification on failure, if you have a Twilio
        account and a sending number owned by that account, define this
        environment variable before running `solana-watchtower`:

        export TWILIO_CONFIG='ACCOUNT=<account>,TOKEN=<securityToken>,TO=<receivingNumber>,FROM=<sendingNumber>'")
        .arg({
            let arg = Arg::with_name("config_file")
                .short("C")
                .long("config")
                .value_name("PATH")
                .takes_value(true)
                .global(true)
                .help("Configuration file to use");
            if let Some(ref config_file) = *solana_cli_config::CONFIG_FILE {
                arg.default_value(&config_file)
            } else {
                arg
            }
        })
        .arg(
            Arg::with_name("json_rpc_url")
                .long("url")
                .value_name("URL")
                .takes_value(true)
                .validator(is_url)
                .help("JSON RPC URL for the cluster"),
        )
        .arg(
            Arg::with_name("interval")
                .long("interval")
                .value_name("SECONDS")
                .takes_value(true)
                .default_value("60")
                .help("Seconds to wait between cluster sanity checks"),
        )
        .arg(
            Arg::with_name("validator_identities")
                .long("validator-identity")
                .value_name("VALIDATOR IDENTITY PUBKEY")
                .takes_value(true)
                .validator(is_pubkey_or_keypair)
                .multiple(true)
                .help("Monitor a specific validator only instead of the entire cluster"),
        )
        .arg(
            Arg::with_name("no_duplicate_notifications")
                .long("no-duplicate-notifications")
                .takes_value(false)
                .help("Suppress subsequent identical notifications"),
        )
        .arg(
            Arg::with_name("monitor_active_stake")
                .long("monitor-active-stake")
                .takes_value(false)
                .help("Alert when the current stake for the cluster drops below 80%"),
        )
        .get_matches();

    let config = if let Some(config_file) = matches.value_of("config_file") {
        solana_cli_config::Config::load(config_file).unwrap_or_default()
    } else {
        solana_cli_config::Config::default()
    };

    let interval = Duration::from_secs(value_t_or_exit!(matches, "interval", u64));
    let json_rpc_url =
        value_t!(matches, "json_rpc_url", String).unwrap_or_else(|_| config.json_rpc_url.clone());
    let validator_identity_pubkeys: Vec<_> = pubkeys_of(&matches, "validator_identities")
        .unwrap_or_else(Vec::new)
        .into_iter()
        .map(|i| i.to_string())
        .collect();
    let no_duplicate_notifications = matches.is_present("no_duplicate_notifications");
    let monitor_active_stake = matches.is_present("monitor_active_stake");

    let config = Config {
        interval,
        json_rpc_url,
        validator_identity_pubkeys,
        no_duplicate_notifications,
        monitor_active_stake,
        address_labels: config.address_labels,
    };

    info!("RPC URL: {}", config.json_rpc_url);
    if !config.validator_identity_pubkeys.is_empty() {
        info!(
            "Monitored validators: {:?}",
            config.validator_identity_pubkeys
        );
    }
    config
}
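
/// Fetch the data the sanity checks consume: the cluster-wide transaction
/// count, the most recent blockhash, and the current vote account status.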
fn get_cluster_info(rpc_client: &RpcClient) -> ClientResult<(u64, Hash, RpcVoteAccountStatus)> {
    let transaction_count = rpc_client.get_transaction_count()?;
    let recent_blockhash = rpc_client.get_recent_blockhash()?.0;
    let vote_accounts = rpc_client.get_vote_accounts()?;
    Ok((transaction_count, recent_blockhash, vote_accounts))
}

fn main() -> Result<(), Box<dyn error::Error>> {
    let config = get_config();

    solana_logger::setup_with_default("solana=info");
    solana_metrics::set_panic_hook("watchtower");

    let rpc_client = RpcClient::new(config.json_rpc_url.clone());
    let notifier = Notifier::default();
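
    // State carried across loop iterations, used to detect a stalled cluster
    // and to suppress duplicate notifications.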
    let mut last_transaction_count = 0;
    let mut last_recent_blockhash = Hash::default();
    let mut last_notification_msg = "".into();
    let mut last_success = Instant::now();

    loop {
        let failure = match get_cluster_info(&rpc_client) {
            Ok((transaction_count, recent_blockhash, vote_accounts)) => {
                info!("Current transaction count: {}", transaction_count);
                info!("Recent blockhash: {}", recent_blockhash);
                info!("Current validator count: {}", vote_accounts.current.len());
                info!(
                    "Delinquent validator count: {}",
                    vote_accounts.delinquent.len()
                );
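
                // Accumulate sanity-check failures for this pass; each entry is
                // a (test name, error message) pair.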
                let mut failures = vec![];

                // Tally active vs. delinquent stake to compute the share of
                // total stake currently voting.
                let total_current_stake = vote_accounts
                    .current
                    .iter()
                    .map(|vote_account| vote_account.activated_stake)
                    .sum();
                let total_delinquent_stake = vote_accounts
                    .delinquent
                    .iter()
                    .map(|vote_account| vote_account.activated_stake)
                    .sum();

                let total_stake = total_current_stake + total_delinquent_stake;
                let current_stake_percent = total_current_stake * 100 / total_stake;
                info!(
                    "Current stake: {}% | Total stake: {} SOL, current stake: {} SOL, delinquent: {} SOL",
                    current_stake_percent,
                    lamports_to_sol(total_stake),
                    lamports_to_sol(total_current_stake),
                    lamports_to_sol(total_delinquent_stake)
                );

                if transaction_count > last_transaction_count {
                    last_transaction_count = transaction_count;
                } else {
                    failures.push((
                        "transaction-count",
                        format!(
                            "Transaction count is not advancing: {} <= {}",
                            transaction_count, last_transaction_count
                        ),
                    ));
                }

                if recent_blockhash != last_recent_blockhash {
                    last_recent_blockhash = recent_blockhash;
                } else {
                    failures.push((
                        "recent-blockhash",
                        format!("Unable to get new blockhash: {}", recent_blockhash),
                    ));
                }

                if config.monitor_active_stake && current_stake_percent < 80 {
                    failures.push((
                        "current-stake",
                        format!("Current stake is {}%", current_stake_percent),
                    ));
                }
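
                // With no --validator-identity flags the whole cluster is
                // monitored and any delinquency is a failure; otherwise check
                // only the listed validators, including their identity-account
                // balances.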
                if config.validator_identity_pubkeys.is_empty() {
                    if !vote_accounts.delinquent.is_empty() {
                        failures.push((
                            "delinquent",
                            format!("{} delinquent validators", vote_accounts.delinquent.len()),
                        ));
                    }
                } else {
                    let mut errors = vec![];
                    for validator_identity in config.validator_identity_pubkeys.iter() {
                        let formatted_validator_identity =
                            format_labeled_address(&validator_identity, &config.address_labels);
                        if vote_accounts
                            .delinquent
                            .iter()
                            .any(|vai| vai.node_pubkey == *validator_identity)
                        {
                            errors.push(format!("{} delinquent", formatted_validator_identity));
                        } else if !vote_accounts
                            .current
                            .iter()
                            .any(|vai| vai.node_pubkey == *validator_identity)
                        {
                            errors.push(format!("{} missing", formatted_validator_identity));
                        }

                        rpc_client
                            .get_balance(&Pubkey::from_str(&validator_identity).unwrap_or_default())
                            .map(lamports_to_sol)
                            .map(|balance| {
                                if balance < 10.0 {
                                    // At 1 SOL/day for validator voting fees, this gives over a week to
                                    // find some more SOL
                                    failures.push((
                                        "balance",
                                        format!("{} has {} SOL", formatted_validator_identity, balance),
                                    ));
                                }
                            })
                            .unwrap_or_else(|err| {
                                warn!(
                                    "Failed to get balance of {}: {:?}",
                                    formatted_validator_identity, err
                                );
                            });
                    }
                    if !errors.is_empty() {
                        failures.push(("delinquent", errors.join(",")));
                    }
                }

                for failure in failures.iter() {
                    error!("{} sanity failure: {}", failure.0, failure.1);
                }
                failures.into_iter().next() // Only report the first failure if any
            }
            Err(err) => Some(("rpc", err.to_string())),
        };
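
        // Report the outcome: a metrics datapoint either way, a notification on
        // failure (optionally de-duplicated), and an all-clear notification once
        // a previous failure resolves.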
        datapoint_info!("watchtower-sanity", ("ok", failure.is_none(), bool));
        if let Some((failure_test_name, failure_error_message)) = &failure {
            let notification_msg = format!(
                "solana-watchtower: Error: {}: {}",
                failure_test_name, failure_error_message
            );
            if !config.no_duplicate_notifications || last_notification_msg != notification_msg {
                notifier.send(&notification_msg);
            }
            datapoint_error!(
                "watchtower-sanity-failure",
                ("test", failure_test_name, String),
                ("err", failure_error_message, String)
            );
            last_notification_msg = notification_msg;
        } else {
            if !last_notification_msg.is_empty() {
                let alarm_duration = Instant::now().duration_since(last_success);
                let alarm_duration = alarm_duration - config.interval; // Subtract the period before the first error
                let alarm_duration = Duration::from_secs(alarm_duration.as_secs()); // Drop milliseconds in message

                let all_clear_msg = format!(
                    "All clear after {}",
                    humantime::format_duration(alarm_duration)
                );
                info!("{}", all_clear_msg);
                notifier.send(&format!("solana-watchtower: {}", all_clear_msg));
            }
            last_notification_msg = "".into();
            last_success = Instant::now();
        }
        sleep(config.interval);
    }
}
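
// Example invocation (a sketch; substitute the RPC URL and validator identity
// pubkey for your own cluster):
//
//   solana-watchtower \
//       --url http://127.0.0.1:8899 \
//       --interval 60 \
//       --validator-identity <VALIDATOR_IDENTITY_PUBKEY> \
//       --no-duplicate-notifications \
//       --monitor-active-stake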