Add solana-watchtower program

This commit is contained in:
Michael Vines 2019-12-11 17:05:10 -07:00
parent dd54fff978
commit 2db28cae41
6 changed files with 171 additions and 0 deletions

13
Cargo.lock generated
View File

@ -4090,6 +4090,19 @@ dependencies = [
"solana-sdk 0.22.0",
]
[[package]]
name = "solana-watchtower"
version = "0.22.0"
dependencies = [
"clap 2.33.0 (registry+https://github.com/rust-lang/crates.io-index)",
"log 0.4.8 (registry+https://github.com/rust-lang/crates.io-index)",
"solana-clap-utils 0.22.0",
"solana-client 0.22.0",
"solana-logger 0.22.0",
"solana-metrics 0.22.0",
"solana-sdk 0.22.0",
]
[[package]]
name = "solana_libra_bytecode_verifier"
version = "0.0.1-sol4"

View File

@ -49,6 +49,7 @@ members = [
"vote-signer",
"cli",
"rayon-threadlimit",
"watchtower",
]
exclude = [

2
watchtower/.gitignore vendored Normal file
View File

@ -0,0 +1,2 @@
/target/
/farf/

23
watchtower/Cargo.toml Normal file
View File

@ -0,0 +1,23 @@
[package]
authors = ["Solana Maintainers <maintainers@solana.com>"]
edition = "2018"
name = "solana-watchtower"
description = "Blockchain, Rebuilt for Scale"
version = "0.22.0"
repository = "https://github.com/solana-labs/solana"
license = "Apache-2.0"
homepage = "https://solana.com/"
[dependencies]
clap = "2.33.0"
log = "0.4.8"
solana-clap-utils = { path = "../clap-utils", version = "0.22.0" }
solana-client = { path = "../client", version = "0.22.0" }
solana-logger = { path = "../logger", version = "0.22.0" }
solana-metrics = { path = "../metrics", version = "0.22.0" }
solana-sdk = { path = "../sdk", version = "0.22.0" }
[[bin]]
name = "solana-watchtower"
path = "src/main.rs"

16
watchtower/README.md Normal file
View File

@ -0,0 +1,16 @@
The `solana-watchtower` program is used to monitor the health of a cluster. It
periodically polls the cluster over an RPC API to confirm that the transaction
count is advancing, new blockhashes are available, and no validators are
delinquent. Results are reported as InfluxDB metrics.
### Metrics
#### `watchtower-sanity`
This data point is emitted on every iteration. Its boolean `ok` field indicates the
overall result: `true` only if every sanity test passed.
#### `watchtower-sanity-failure`
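On failure this data point contains details about the specific test that failed via
the following fields:
* `test`: name of the sanity test that failed (`transaction-count`, `blockhash`, or `delinquent-validators`)
* `err`: exact sanity failure message

The tests run in the order listed above and stop at the first failure, so at most
one `watchtower-sanity-failure` data point is emitted per iteration.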

116
watchtower/src/main.rs Normal file
View File

@ -0,0 +1,116 @@
//! A command-line executable for monitoring the health of a cluster
use clap::{crate_description, crate_name, value_t_or_exit, App, Arg};
use log::*;
use solana_clap_utils::input_validators::is_url;
use solana_client::rpc_client::RpcClient;
use solana_metrics::{datapoint_error, datapoint_info};
use std::{error, io, thread::sleep, time::Duration};
/// Parses the command line, then polls the cluster at `--url` forever.
///
/// Each iteration runs three sanity tests in order, stopping at the first one
/// that fails:
///
/// 1. `transaction-count`: the transaction count must be greater than the last
///    count that passed this test (initially 0).
/// 2. `blockhash`: a recent blockhash must be fetched and then a new blockhash
///    obtained from it.
/// 3. `delinquent-validators`: the vote account listing must contain no
///    delinquent entries.
///
/// A failing test submits a `watchtower-sanity-failure` data point carrying the
/// test name and error text. Every iteration then submits a `watchtower-sanity`
/// data point with the combined `ok` result and sleeps for `--interval` seconds.
///
/// The polling loop has no exit, so this function does not return normally.
fn main() -> Result<(), Box<dyn error::Error>> {
    let url_arg = Arg::with_name("json_rpc_url")
        .long("url")
        .value_name("URL")
        .takes_value(true)
        .required(true)
        .validator(is_url)
        .help("JSON RPC URL for the cluster");
    let interval_arg = Arg::with_name("interval")
        .long("interval")
        .value_name("SECONDS")
        .takes_value(true)
        .default_value("60")
        .help("Wait interval seconds between checking the cluster");

    let matches = App::new(crate_name!())
        .about(crate_description!())
        .version(solana_clap_utils::version!())
        .arg(url_arg)
        .arg(interval_arg)
        .get_matches();

    let interval = Duration::from_secs(value_t_or_exit!(matches, "interval", u64));
    let json_rpc_url = value_t_or_exit!(matches, "json_rpc_url", String);

    solana_logger::setup_with_filter("solana=info");
    solana_metrics::set_panic_hook("watchtower");

    let rpc_client = RpcClient::new(json_rpc_url);

    // Highest transaction count that has passed the first test so far.
    let mut last_transaction_count = 0;

    // Test 1: the transaction count must have grown since the last passing poll.
    let mut transaction_count_is_advancing = || {
        let outcome = match rpc_client.get_transaction_count() {
            Ok(transaction_count) => {
                info!("Current transaction count: {}", transaction_count);
                if transaction_count <= last_transaction_count {
                    Err(io::Error::new(
                        io::ErrorKind::Other,
                        format!(
                            "Transaction count is not advancing: {} <= {}",
                            transaction_count, last_transaction_count
                        ),
                    ))
                } else {
                    last_transaction_count = transaction_count;
                    Ok(())
                }
            }
            Err(err) => Err(err),
        };
        match outcome {
            Ok(()) => true,
            Err(err) => {
                datapoint_error!(
                    "watchtower-sanity-failure",
                    ("test", "transaction-count", String),
                    ("err", err.to_string(), String)
                );
                false
            }
        }
    };

    // Test 2: a recent blockhash can be fetched and a new one obtained from it.
    let blockhash_is_advancing = || {
        let outcome = match rpc_client.get_recent_blockhash() {
            Ok((blockhash, _fee_calculator)) => {
                info!("Current blockhash: {}", blockhash);
                rpc_client.get_new_blockhash(&blockhash)
            }
            Err(err) => Err(err),
        };
        match outcome {
            Ok((blockhash, _fee_calculator)) => {
                info!("New blockhash: {}", blockhash);
                true
            }
            Err(err) => {
                datapoint_error!(
                    "watchtower-sanity-failure",
                    ("test", "blockhash", String),
                    ("err", err.to_string(), String)
                );
                false
            }
        }
    };

    // Test 3: the vote account listing reports no delinquent validators.
    let no_delinquent_validators = || {
        let outcome = match rpc_client.get_vote_accounts() {
            Ok(vote_accounts) => {
                let delinquent_count = vote_accounts.delinquent.len();
                info!("Current validator count: {}", vote_accounts.current.len());
                info!("Delinquent validator count: {}", delinquent_count);
                if delinquent_count == 0 {
                    Ok(())
                } else {
                    Err(io::Error::new(
                        io::ErrorKind::Other,
                        format!("{} delinquent validators", delinquent_count),
                    ))
                }
            }
            Err(err) => Err(err),
        };
        match outcome {
            Ok(()) => true,
            Err(err) => {
                datapoint_error!(
                    "watchtower-sanity-failure",
                    ("test", "delinquent-validators", String),
                    ("err", err.to_string(), String)
                );
                false
            }
        }
    };

    loop {
        // `&&` short-circuits: later tests are skipped once one has failed.
        let ok = transaction_count_is_advancing()
            && blockhash_is_advancing()
            && no_delinquent_validators();

        datapoint_info!("watchtower-sanity", ("ok", ok, bool));
        sleep(interval);
    }
}