solana/watchtower/src/main.rs

475 lines
18 KiB
Rust
Raw Normal View History

2019-12-11 16:05:10 -08:00
//! A command-line executable for monitoring the health of a cluster
2020-03-10 11:41:51 -07:00
use clap::{crate_description, crate_name, value_t, value_t_or_exit, App, Arg};
2019-12-11 16:05:10 -08:00
use log::*;
use solana_clap_utils::{
2020-03-09 12:55:31 -07:00
input_parsers::pubkeys_of,
input_validators::{is_pubkey_or_keypair, is_url},
};
use solana_cli_output::display::{format_labeled_address, write_transaction};
use solana_client::{
client_error::Result as ClientResult, rpc_client::RpcClient, rpc_response::RpcVoteAccountStatus,
};
2019-12-11 16:05:10 -08:00
use solana_metrics::{datapoint_error, datapoint_info};
use solana_notifier::Notifier;
use solana_sdk::{
clock::Slot, hash::Hash, native_token::lamports_to_sol, program_utils::limited_deserialize,
pubkey::Pubkey,
};
use solana_transaction_status::{EncodedConfirmedBlock, UiTransactionEncoding};
use solana_vote_program::vote_instruction::VoteInstruction;
use std::{
collections::HashMap,
error,
str::FromStr,
thread::sleep,
time::{Duration, Instant},
};
2019-12-11 16:05:10 -08:00
2020-03-10 11:41:51 -07:00
/// Runtime settings for the watchtower, assembled by `get_config()` from the
/// command line and the optional CLI configuration file.
struct Config {
    /// Pause between two consecutive cluster checks.
    interval: Duration,
    /// JSON RPC endpoint of the cluster being monitored.
    json_rpc_url: String,
    /// Base58 identity pubkeys of the validators to watch; when empty the
    /// whole cluster is checked for delinquent validators instead.
    validator_identity_pubkeys: Vec<String>,
    /// Suppress a notification whose text equals the previous one.
    no_duplicate_notifications: bool,
    /// Report a failure when the current stake percentage drops below 80.
    monitor_active_stake: bool,
    /// Spawn the transaction monitor thread that notifies on non-vote
    /// transactions.
    notify_on_transactions: bool,
    /// Labels passed to `format_labeled_address` when rendering validator
    /// identities in messages.
    address_labels: HashMap<String, String>,
}
2020-03-10 11:41:51 -07:00
fn get_config() -> Config {
let matches = App::new(crate_name!())
2019-12-11 16:05:10 -08:00
.about(crate_description!())
.version(solana_version::version!())
.after_help("ADDITIONAL HELP:
To receive a Slack, Discord and/or Telegram notification on sanity failure,
define environment variables before running `solana-watchtower`:
export SLACK_WEBHOOK=...
export DISCORD_WEBHOOK=...
2020-04-17 00:23:17 -07:00
Telegram requires the following two variables:
export TELEGRAM_BOT_TOKEN=...
export TELEGRAM_CHAT_ID=...
2020-04-17 00:23:17 -07:00
To receive a Twilio SMS notification on failure, having a Twilio account,
and a sending number owned by that account,
define environment variable before running `solana-watchtower`:
export TWILIO_CONFIG='ACCOUNT=<account>,TOKEN=<securityToken>,TO=<receivingNumber>,FROM=<sendingNumber>'")
2020-03-09 12:35:32 -07:00
.arg({
let arg = Arg::with_name("config_file")
.short("C")
.long("config")
.value_name("PATH")
.takes_value(true)
.global(true)
.help("Configuration file to use");
2020-03-10 11:41:51 -07:00
if let Some(ref config_file) = *solana_cli_config::CONFIG_FILE {
2020-03-09 12:35:32 -07:00
arg.default_value(&config_file)
} else {
arg
}
})
2019-12-11 16:05:10 -08:00
.arg(
Arg::with_name("json_rpc_url")
.long("url")
.value_name("URL")
.takes_value(true)
.validator(is_url)
.help("JSON RPC URL for the cluster"),
)
.arg(
Arg::with_name("interval")
.long("interval")
.value_name("SECONDS")
.takes_value(true)
.default_value("60")
.help("Wait interval seconds between checking the cluster"),
)
.arg(
2020-03-10 11:25:40 -07:00
Arg::with_name("validator_identities")
.long("validator-identity")
.value_name("VALIDATOR IDENTITY PUBKEY")
.takes_value(true)
.validator(is_pubkey_or_keypair)
2020-03-09 12:55:31 -07:00
.multiple(true)
.help("Monitor a specific validator only instead of the entire cluster"),
)
.arg(
Arg::with_name("no_duplicate_notifications")
.long("no-duplicate-notifications")
.takes_value(false)
.help("Subsequent identical notifications will be suppressed"),
)
2020-03-10 11:25:40 -07:00
.arg(
Arg::with_name("monitor_active_stake")
.long("monitor-active-stake")
.takes_value(false)
.help("Alert when the current stake for the cluster drops below 80%"),
)
.arg(
Arg::with_name("notify_on_transactions")
.long("notify-on-transactions")
.takes_value(false)
.help("Send a notification on all non-vote transactions. This can be very verbose!\
Note that the notification environment variables used by this feature all require a \
TRANSACTION_NOTIFIER_ prefix. For example: TRANSACTION_NOTIFIER_SLACK_WEBHOOK"),
)
2020-03-10 11:41:51 -07:00
.get_matches();
2019-12-11 16:05:10 -08:00
2020-03-09 12:35:32 -07:00
let config = if let Some(config_file) = matches.value_of("config_file") {
2020-03-10 11:41:51 -07:00
solana_cli_config::Config::load(config_file).unwrap_or_default()
2020-03-09 12:35:32 -07:00
} else {
2020-03-10 11:41:51 -07:00
solana_cli_config::Config::default()
2020-03-09 12:35:32 -07:00
};
2019-12-11 16:05:10 -08:00
let interval = Duration::from_secs(value_t_or_exit!(matches, "interval", u64));
2020-03-09 12:35:32 -07:00
let json_rpc_url =
value_t!(matches, "json_rpc_url", String).unwrap_or_else(|_| config.json_rpc_url.clone());
2020-03-10 11:25:40 -07:00
let validator_identity_pubkeys: Vec<_> = pubkeys_of(&matches, "validator_identities")
.unwrap_or_else(Vec::new)
2020-03-09 12:55:31 -07:00
.into_iter()
.map(|i| i.to_string())
.collect();
let no_duplicate_notifications = matches.is_present("no_duplicate_notifications");
2020-03-10 11:25:40 -07:00
let monitor_active_stake = matches.is_present("monitor_active_stake");
let notify_on_transactions = matches.is_present("notify_on_transactions");
2019-12-11 16:05:10 -08:00
let config = Config {
2020-03-10 11:41:51 -07:00
interval,
json_rpc_url,
validator_identity_pubkeys,
no_duplicate_notifications,
monitor_active_stake,
notify_on_transactions,
address_labels: config.address_labels,
};
info!("RPC URL: {}", config.json_rpc_url);
if !config.validator_identity_pubkeys.is_empty() {
info!(
"Monitored validators: {:?}",
config.validator_identity_pubkeys
);
2020-03-10 11:41:51 -07:00
}
config
2020-03-10 11:41:51 -07:00
}
fn process_confirmed_block(
notifier: &Notifier,
slot: Slot,
confirmed_block: EncodedConfirmedBlock,
) {
2020-06-17 13:53:45 -07:00
let break_program_id = "BrEAK7zGZ6dM71zUDACDqJnekihmwF15noTddWTsknjC"
.parse::<Pubkey>()
.unwrap();
let mut vote_transactions = 0;
for rpc_transaction in &confirmed_block.transactions {
if let Some(transaction) = rpc_transaction.transaction.decode() {
if transaction.verify().is_ok() {
let mut notify = true;
// Ignore simple Vote transactions since they are too prevalent
if transaction.message.instructions.len() == 1 {
let instruction = &transaction.message.instructions[0];
let program_pubkey =
transaction.message.account_keys[instruction.program_id_index as usize];
if program_pubkey == solana_vote_program::id() {
if let Ok(VoteInstruction::Vote(_)) =
limited_deserialize::<VoteInstruction>(&instruction.data)
{
vote_transactions += 1;
notify = false;
}
}
2020-06-17 13:53:45 -07:00
if program_pubkey == break_program_id {
notify = false;
}
}
if notify {
let mut w = Vec::new();
if write_transaction(&mut w, &transaction, &rpc_transaction.meta, "").is_ok() {
if let Ok(s) = String::from_utf8(w) {
notifier.send(&format!("```Slot: {}\n{}```", slot, s));
}
}
}
} else {
datapoint_error!(
"watchtower-sanity-failure",
("slot", slot, i64),
("err", "Transaction signature verification failed", String)
);
}
}
}
info!(
"Process slot {} with {} regular transactions (and {} votes)",
slot,
confirmed_block.transactions.len() - vote_transactions,
vote_transactions
);
}
/// Fetches every confirmed block in the slot range `start_slot..=end_slot`.
///
/// The list of confirmed slots is requested first, then each block is
/// downloaded with Base64 transaction encoding. The first RPC error aborts the
/// whole call and is returned to the caller.
fn load_blocks(
    rpc_client: &RpcClient,
    start_slot: Slot,
    end_slot: Slot,
) -> ClientResult<Vec<(Slot, EncodedConfirmedBlock)>> {
    info!(
        "Loading confirmed blocks between slots: {} - {}",
        start_slot, end_slot
    );

    rpc_client
        .get_confirmed_blocks(start_slot, Some(end_slot))?
        .into_iter()
        .map(|slot| {
            rpc_client
                .get_confirmed_block_with_encoding(slot, UiTransactionEncoding::Base64)
                .map(|block| (slot, block))
        })
        .collect()
}
/// Endlessly follows the cluster and feeds every newly confirmed block to
/// `process_confirmed_block`.
///
/// Starts at the slot reported by the first successful `get_slot()` call and
/// then walks forward in windows of up to 50 slots. Notifications go through a
/// notifier created with the `TRANSACTION_NOTIFIER_` prefix. This function
/// never returns.
fn transaction_monitor(rpc_client: RpcClient) {
    let notifier = Notifier::new("TRANSACTION_NOTIFIER_");

    // Retry once per second until the cluster reports a starting slot.
    let mut start_slot = loop {
        match rpc_client.get_slot() {
            Ok(slot) => break slot,
            Err(err) => {
                warn!("Failed to get current slot: {}", err);
                sleep(Duration::from_secs(1));
            }
        }
    };

    loop {
        let end_slot = start_slot + 50;
        info!("start_slot:{} - end_slot:{}", start_slot, end_slot);

        // A failed query is treated as slot 0, which lands in the wait branch.
        let latest_available_slot = match rpc_client.get_slot() {
            Ok(slot) => slot,
            Err(err) => {
                info!("get_slot() failed: {}", err);
                0
            }
        };

        if latest_available_slot <= start_slot {
            info!("Waiting for a slot greater than {}...", start_slot);
            sleep(Duration::from_secs(5));
            continue;
        }

        let blocks = match load_blocks(&rpc_client, start_slot + 1, end_slot) {
            Ok(blocks) => blocks,
            Err(err) => {
                info!(
                    "failed to get blocks in range ({},{}): {}",
                    start_slot, end_slot, err
                );
                sleep(Duration::from_secs(1));
                continue;
            }
        };
        info!("Loaded {} blocks", blocks.len());

        // An empty window that lies entirely behind the tip can be skipped.
        if blocks.is_empty() && end_slot < latest_available_slot {
            start_slot = end_slot;
            continue;
        }

        // NOTE(review): when the window is empty but reaches the tip, nothing
        // below runs and the loop retries immediately without sleeping.
        for (slot, block) in blocks {
            process_confirmed_block(&notifier, slot, block);
            start_slot = slot;
        }
    }
}
/// Queries the three pieces of cluster state checked on every iteration:
/// the transaction count, a recent blockhash and the vote account status.
///
/// The calls are issued in that order; the first one to fail ends the
/// function with its error.
fn get_cluster_info(rpc_client: &RpcClient) -> ClientResult<(u64, Hash, RpcVoteAccountStatus)> {
    Ok((
        rpc_client.get_transaction_count()?,
        // Only the blockhash itself is needed from the returned tuple.
        rpc_client.get_recent_blockhash()?.0,
        rpc_client.get_vote_accounts()?,
    ))
}
fn main() -> Result<(), Box<dyn error::Error>> {
let config = get_config();
2020-01-08 09:19:12 -08:00
solana_logger::setup_with_default("solana=info");
2019-12-11 16:05:10 -08:00
solana_metrics::set_panic_hook("watchtower");
let _notify_thread = if config.notify_on_transactions {
let rpc_client = RpcClient::new(config.json_rpc_url.clone());
Some(std::thread::spawn(move || transaction_monitor(rpc_client)))
} else {
None
};
2019-12-11 16:05:10 -08:00
let rpc_client = RpcClient::new(config.json_rpc_url.clone());
let notifier = Notifier::default();
2019-12-11 16:05:10 -08:00
let mut last_transaction_count = 0;
2020-03-09 22:02:40 -07:00
let mut last_recent_blockhash = Hash::default();
let mut last_notification_msg = "".into();
let mut last_success = Instant::now();
2020-03-09 22:02:40 -07:00
2019-12-11 16:05:10 -08:00
loop {
2020-03-09 22:02:40 -07:00
let failure = match get_cluster_info(&rpc_client) {
Ok((transaction_count, recent_blockhash, vote_accounts)) => {
2019-12-11 16:05:10 -08:00
info!("Current transaction count: {}", transaction_count);
2020-03-09 22:02:40 -07:00
info!("Recent blockhash: {}", recent_blockhash);
info!("Current validator count: {}", vote_accounts.current.len());
info!(
"Delinquent validator count: {}",
vote_accounts.delinquent.len()
);
let mut failures = vec![];
let total_current_stake = vote_accounts
.current
.iter()
2020-05-19 18:13:41 -07:00
.map(|vote_account| vote_account.activated_stake)
.sum();
2020-03-09 22:02:40 -07:00
let total_delinquent_stake = vote_accounts
.delinquent
.iter()
2020-05-19 18:13:41 -07:00
.map(|vote_account| vote_account.activated_stake)
.sum();
2020-03-09 22:02:40 -07:00
let total_stake = total_current_stake + total_delinquent_stake;
let current_stake_percent = total_current_stake * 100 / total_stake;
info!(
"Current stake: {}% | Total stake: {} SOL, current stake: {} SOL, delinquent: {} SOL",
current_stake_percent,
lamports_to_sol(total_stake),
lamports_to_sol(total_current_stake),
lamports_to_sol(total_delinquent_stake)
);
2019-12-11 16:05:10 -08:00
if transaction_count > last_transaction_count {
last_transaction_count = transaction_count;
} else {
2020-03-09 22:02:40 -07:00
failures.push((
"transaction-count",
2019-12-11 16:05:10 -08:00
format!(
"Transaction count is not advancing: {} <= {}",
transaction_count, last_transaction_count
),
2020-03-09 22:02:40 -07:00
));
2019-12-11 16:05:10 -08:00
}
2020-03-09 12:55:31 -07:00
2020-03-09 22:02:40 -07:00
if recent_blockhash != last_recent_blockhash {
last_recent_blockhash = recent_blockhash;
} else {
failures.push((
"recent-blockhash",
format!("Unable to get new blockhash: {}", recent_blockhash),
));
}
2020-03-10 11:41:51 -07:00
if config.monitor_active_stake && current_stake_percent < 80 {
2020-03-10 11:25:40 -07:00
failures.push((
"current-stake",
format!("Current stake is {}%", current_stake_percent),
));
}
2020-03-10 11:41:51 -07:00
if config.validator_identity_pubkeys.is_empty() {
2020-03-09 22:02:40 -07:00
if !vote_accounts.delinquent.is_empty() {
failures.push((
"delinquent",
format!("{} delinquent validators", vote_accounts.delinquent.len()),
));
2019-12-11 16:05:10 -08:00
}
} else {
2020-03-09 22:02:40 -07:00
let mut errors = vec![];
2020-03-10 11:41:51 -07:00
for validator_identity in config.validator_identity_pubkeys.iter() {
let formatted_validator_identity =
format_labeled_address(&validator_identity, &config.address_labels);
2020-03-09 22:02:40 -07:00
if vote_accounts
.delinquent
.iter()
.any(|vai| vai.node_pubkey == *validator_identity)
{
errors.push(format!("{} delinquent", formatted_validator_identity));
2020-03-09 22:02:40 -07:00
} else if !vote_accounts
.current
.iter()
.any(|vai| vai.node_pubkey == *validator_identity)
{
errors.push(format!("{} missing", formatted_validator_identity));
2020-03-09 22:02:40 -07:00
}
rpc_client
.get_balance(&Pubkey::from_str(&validator_identity).unwrap_or_default())
.map(lamports_to_sol)
.map(|balance| {
if balance < 10.0 {
// At 1 SOL/day for validator voting fees, this gives over a week to
// find some more SOL
failures.push((
"balance",
format!(
"{} has {} SOL",
formatted_validator_identity, balance
),
));
}
})
.unwrap_or_else(|err| {
warn!(
"Failed to get balance of {}: {:?}",
formatted_validator_identity, err
);
});
2020-03-09 22:02:40 -07:00
}
if !errors.is_empty() {
failures.push(("delinquent", errors.join(",")));
}
}
2020-03-09 22:02:40 -07:00
for failure in failures.iter() {
error!("{} sanity failure: {}", failure.0, failure.1);
}
2020-03-09 22:02:40 -07:00
failures.into_iter().next() // Only report the first failure if any
}
Err(err) => Some(("rpc", err.to_string())),
};
datapoint_info!("watchtower-sanity", ("ok", failure.is_none(), bool));
if let Some((failure_test_name, failure_error_message)) = &failure {
let notification_msg = format!(
"solana-watchtower: Error: {}: {}",
failure_test_name, failure_error_message
);
2020-03-10 11:41:51 -07:00
if !config.no_duplicate_notifications || last_notification_msg != notification_msg {
2020-03-09 22:02:40 -07:00
notifier.send(&notification_msg);
}
2020-03-09 22:02:40 -07:00
datapoint_error!(
"watchtower-sanity-failure",
("test", failure_test_name, String),
("err", failure_error_message, String)
);
last_notification_msg = notification_msg;
} else {
2020-03-09 22:02:40 -07:00
if !last_notification_msg.is_empty() {
let alarm_duration = Instant::now().duration_since(last_success);
let alarm_duration = Duration::from_secs(alarm_duration.as_secs()); // Drop milliseconds in message
let all_clear_msg = format!(
"All clear after {}",
humantime::format_duration(alarm_duration)
);
info!("{}", all_clear_msg);
notifier.send(&format!("solana-watchtower: {}", all_clear_msg));
}
2020-03-09 22:02:40 -07:00
last_notification_msg = "".into();
last_success = Instant::now();
}
2020-03-10 11:41:51 -07:00
sleep(config.interval);
2019-12-11 16:05:10 -08:00
}
}