From 2db28cae416ad05fda313a9a8a98b7cfee57b7d6 Mon Sep 17 00:00:00 2001 From: Michael Vines Date: Wed, 11 Dec 2019 17:05:10 -0700 Subject: [PATCH] Add solana-watchtower program --- Cargo.lock | 13 +++++ Cargo.toml | 1 + watchtower/.gitignore | 2 + watchtower/Cargo.toml | 23 ++++++++ watchtower/README.md | 16 ++++++ watchtower/src/main.rs | 116 +++++++++++++++++++++++++++++++++++++++++ 6 files changed, 171 insertions(+) create mode 100644 watchtower/.gitignore create mode 100644 watchtower/Cargo.toml create mode 100644 watchtower/README.md create mode 100644 watchtower/src/main.rs diff --git a/Cargo.lock b/Cargo.lock index 01023a5aa..91b8bb187 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4090,6 +4090,19 @@ dependencies = [ "solana-sdk 0.22.0", ] +[[package]] +name = "solana-watchtower" +version = "0.22.0" +dependencies = [ + "clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)", + "log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)", + "solana-clap-utils 0.22.0", + "solana-client 0.22.0", + "solana-logger 0.22.0", + "solana-metrics 0.22.0", + "solana-sdk 0.22.0", +] + [[package]] name = "solana_libra_bytecode_verifier" version = "0.0.1-sol4" diff --git a/Cargo.toml b/Cargo.toml index a4176ee31..529f8ca02 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,6 +49,7 @@ members = [ "vote-signer", "cli", "rayon-threadlimit", + "watchtower", ] exclude = [ diff --git a/watchtower/.gitignore b/watchtower/.gitignore new file mode 100644 index 000000000..5404b132d --- /dev/null +++ b/watchtower/.gitignore @@ -0,0 +1,2 @@ +/target/ +/farf/ diff --git a/watchtower/Cargo.toml b/watchtower/Cargo.toml new file mode 100644 index 000000000..35fde38bd --- /dev/null +++ b/watchtower/Cargo.toml @@ -0,0 +1,23 @@ +[package] +authors = ["Solana Maintainers "] +edition = "2018" +name = "solana-watchtower" +description = "Blockchain, Rebuilt for Scale" +version = "0.22.0" +repository = "https://github.com/solana-labs/solana" +license = "Apache-2.0" +homepage = "https://solana.com/" + +[dependencies] +clap = "2.33.0" +log = "0.4.8" +solana-clap-utils = { path = "../clap-utils", version = "0.22.0" } +solana-client = { path = "../client", version = "0.22.0" } +solana-logger = { path = "../logger", version = "0.22.0" } +solana-metrics = { path = "../metrics", version = "0.22.0" } +solana-sdk = { path = "../sdk", version = "0.22.0" } + +[[bin]] +name = "solana-watchtower" +path = "src/main.rs" + diff --git a/watchtower/README.md b/watchtower/README.md new file mode 100644 index 000000000..1779eb252 --- /dev/null +++ b/watchtower/README.md @@ -0,0 +1,16 @@ +The `solana-watchtower` program is used to monitor the health of a cluster. It +periodically polls the cluster over an RPC API to confirm that the transaction +count is advancing, new blockhashes are available, and no validators are +delinquent. Results are reported as InfluxDB metrics. + +### Metrics +#### `watchtower-sanity` +On every iteration this data point will be emitted indicating the overall result +using a boolean `ok` field. + +#### `watchtower-sanity-failure` +On failure this data point contains details about the specific test that failed via +the following fields: +* `test`: name of the sanity test that failed +* `err`: exact sanity failure message + diff --git a/watchtower/src/main.rs b/watchtower/src/main.rs new file mode 100644 index 000000000..81aca4466 --- /dev/null +++ b/watchtower/src/main.rs @@ -0,0 +1,116 @@ +//! A command-line executable for monitoring the health of a cluster + +use clap::{crate_description, crate_name, value_t_or_exit, App, Arg}; +use log::*; +use solana_clap_utils::input_validators::is_url; +use solana_client::rpc_client::RpcClient; +use solana_metrics::{datapoint_error, datapoint_info}; +use std::{error, io, thread::sleep, time::Duration}; + +fn main() -> Result<(), Box> { + let matches = App::new(crate_name!()) + .about(crate_description!()) + .version(solana_clap_utils::version!()) + .arg( + Arg::with_name("json_rpc_url") + .long("url") + .value_name("URL") + .takes_value(true) + .required(true) + .validator(is_url) + .help("JSON RPC URL for the cluster"), + ) + .arg( + Arg::with_name("interval") + .long("interval") + .value_name("SECONDS") + .takes_value(true) + .default_value("60") + .help("Wait interval seconds between checking the cluster"), + ) + .get_matches(); + + let interval = Duration::from_secs(value_t_or_exit!(matches, "interval", u64)); + let json_rpc_url = value_t_or_exit!(matches, "json_rpc_url", String); + + solana_logger::setup_with_filter("solana=info"); + solana_metrics::set_panic_hook("watchtower"); + + let rpc_client = RpcClient::new(json_rpc_url.to_string()); + + let mut last_transaction_count = 0; + loop { + let ok = rpc_client + .get_transaction_count() + .and_then(|transaction_count| { + info!("Current transaction count: {}", transaction_count); + + if transaction_count > last_transaction_count { + last_transaction_count = transaction_count; + Ok(true) + } else { + Err(io::Error::new( + io::ErrorKind::Other, + format!( + "Transaction count is not advancing: {} <= {}", + transaction_count, last_transaction_count + ), + )) + } + }) + .unwrap_or_else(|err| { + datapoint_error!( + "watchtower-sanity-failure", + ("test", "transaction-count", String), + ("err", err.to_string(), String) + ); + false + }) + && rpc_client + .get_recent_blockhash() + .and_then(|(blockhash, _fee_calculator)| { + info!("Current blockhash: {}", blockhash); + rpc_client.get_new_blockhash(&blockhash) + }) + .and_then(|(blockhash, _fee_calculator)| { + info!("New blockhash: {}", blockhash); + Ok(true) + }) + .unwrap_or_else(|err| { + datapoint_error!( + "watchtower-sanity-failure", + ("test", "blockhash", String), + ("err", err.to_string(), String) + ); + false + }) + && rpc_client + .get_vote_accounts() + .and_then(|vote_accounts| { + info!("Current validator count: {}", vote_accounts.current.len()); + info!( + "Delinquent validator count: {}", + vote_accounts.delinquent.len() + ); + if vote_accounts.delinquent.is_empty() { + Ok(true) + } else { + Err(io::Error::new( + io::ErrorKind::Other, + format!("{} delinquent validators", vote_accounts.delinquent.len()), + )) + } + }) + .unwrap_or_else(|err| { + datapoint_error!( + "watchtower-sanity-failure", + ("test", "delinquent-validators", String), + ("err", err.to_string(), String) + ); + false + }); + + datapoint_info!("watchtower-sanity", ("ok", ok, bool)); + sleep(interval); + } +}