watchtower: flag to suppress duplicate notifications (#8549)

* watchtower: send error message as notification

* watchtower: send all clear notification when ok again

* watchtower: add twilio sms notifications

* watchtower: flag to suppress duplicate notifications

* remove trailing space character

* changes as per suggestion on PR

* all changes together

* cargo fmt
This commit is contained in:
HM 2020-03-03 06:37:57 +00:00 committed by GitHub
parent d86103383a
commit b6553357f9
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
3 changed files with 115 additions and 1 deletions

View File

@ -8,6 +8,11 @@ If you only care about the health of one specific validator, the
`--validator-identity` command-line argument can be used to restrict failure `--validator-identity` command-line argument can be used to restrict failure
notifications to issues only affecting that validator. notifications to issues only affecting that validator.
If you do not want duplicate notifications, for example if you have elected to
receive notifications by SMS, the
`--no-duplicate-notifications` command-line argument will suppress identical
failure notifications.
### Metrics ### Metrics
#### `watchtower-sanity` #### `watchtower-sanity`
On every iteration this data point will be emitted indicating the overall result On every iteration this data point will be emitted indicating the overall result
@ -33,3 +38,10 @@ Telegram requires the following two variables:
export TELEGRAM_BOT_TOKEN=... export TELEGRAM_BOT_TOKEN=...
export TELEGRAM_CHAT_ID=... export TELEGRAM_CHAT_ID=...
``` ```
To receive a Twilio SMS notification on failure, you need a Twilio account
and a sending number owned by that account.
Define the following environment variable before running `solana-watchtower`:
```
export TWILIO_CONFIG='ACCOUNT=<account>,TOKEN=<securityToken>,TO=<receivingNumber>,FROM=<sendingNumber>'
```

View File

@ -42,11 +42,18 @@ fn main() -> Result<(), Box<dyn error::Error>> {
.validator(is_pubkey_or_keypair) .validator(is_pubkey_or_keypair)
.help("Monitor a specific validator only instead of the entire cluster"), .help("Monitor a specific validator only instead of the entire cluster"),
) )
.arg(
Arg::with_name("no_duplicate_notifications")
.long("no-duplicate-notifications")
.takes_value(false)
.help("Subsequent identical notifications will be suppressed"),
)
.get_matches(); .get_matches();
let interval = Duration::from_secs(value_t_or_exit!(matches, "interval", u64)); let interval = Duration::from_secs(value_t_or_exit!(matches, "interval", u64));
let json_rpc_url = value_t_or_exit!(matches, "json_rpc_url", String); let json_rpc_url = value_t_or_exit!(matches, "json_rpc_url", String);
let validator_identity = pubkey_of(&matches, "validator_identity").map(|i| i.to_string()); let validator_identity = pubkey_of(&matches, "validator_identity").map(|i| i.to_string());
let no_duplicate_notifications = matches.is_present("no_duplicate_notifications");
solana_logger::setup_with_default("solana=info"); solana_logger::setup_with_default("solana=info");
solana_metrics::set_panic_hook("watchtower"); solana_metrics::set_panic_hook("watchtower");
@ -55,7 +62,10 @@ fn main() -> Result<(), Box<dyn error::Error>> {
let notifier = Notifier::new(); let notifier = Notifier::new();
let mut last_transaction_count = 0; let mut last_transaction_count = 0;
let mut last_check_notification_sent = false;
let mut last_notification_msg = String::from("");
loop { loop {
let mut notify_msg = String::from("solana-watchtower: undefined error");
let ok = rpc_client let ok = rpc_client
.get_transaction_count() .get_transaction_count()
.and_then(|transaction_count| { .and_then(|transaction_count| {
@ -75,6 +85,7 @@ fn main() -> Result<(), Box<dyn error::Error>> {
} }
}) })
.unwrap_or_else(|err| { .unwrap_or_else(|err| {
notify_msg = format!("solana-watchtower: {}", err.to_string());
datapoint_error!( datapoint_error!(
"watchtower-sanity-failure", "watchtower-sanity-failure",
("test", "transaction-count", String), ("test", "transaction-count", String),
@ -93,6 +104,7 @@ fn main() -> Result<(), Box<dyn error::Error>> {
Ok(true) Ok(true)
}) })
.unwrap_or_else(|err| { .unwrap_or_else(|err| {
notify_msg = format!("solana-watchtower: {}", err.to_string());
datapoint_error!( datapoint_error!(
"watchtower-sanity-failure", "watchtower-sanity-failure",
("test", "blockhash", String), ("test", "blockhash", String),
@ -149,6 +161,7 @@ fn main() -> Result<(), Box<dyn error::Error>> {
} }
}) })
.unwrap_or_else(|err| { .unwrap_or_else(|err| {
notify_msg = format!("solana-watchtower: {}", err.to_string());
datapoint_error!( datapoint_error!(
"watchtower-sanity-failure", "watchtower-sanity-failure",
("test", "delinquent-validators", String), ("test", "delinquent-validators", String),
@ -159,7 +172,26 @@ fn main() -> Result<(), Box<dyn error::Error>> {
datapoint_info!("watchtower-sanity", ("ok", ok, bool)); datapoint_info!("watchtower-sanity", ("ok", ok, bool));
if !ok { if !ok {
notifier.send("solana-watchtower sanity failure"); last_check_notification_sent = true;
if no_duplicate_notifications {
if last_notification_msg != notify_msg {
notifier.send(&notify_msg);
last_notification_msg = notify_msg;
} else {
datapoint_info!(
"watchtower-sanity",
("Suppressing duplicate notification", ok, bool)
);
}
} else {
notifier.send(&notify_msg);
}
} else {
if last_check_notification_sent {
notifier.send("solana-watchtower: All Clear");
}
last_check_notification_sent = false;
last_notification_msg = String::from("");
} }
sleep(interval); sleep(interval);
} }

View File

@ -8,11 +8,60 @@ struct TelegramWebHook {
chat_id: String, chat_id: String,
} }
/// Settings needed to send an SMS through Twilio.
///
/// Each field is filled from the matching key of the `TWILIO_CONFIG`
/// environment variable (`ACCOUNT`, `TOKEN`, `TO`, `FROM`).
#[derive(Debug, Default)]
struct TwilioWebHook {
    /// Twilio account identifier; used both as the request user name and in the URL path.
    account: String,
    /// Security token sent together with `account` to authenticate the request.
    token: String,
    /// Number that receives the SMS.
    to: String,
    /// Number the SMS is sent from.
    from: String,
}

impl TwilioWebHook {
    /// Returns `true` only when every field holds a non-empty value.
    fn complete(&self) -> bool {
        [&self.account, &self.token, &self.to, &self.from]
            .iter()
            .all(|field| !field.is_empty())
    }
}
/// Builds the Twilio settings from the `TWILIO_CONFIG` environment variable.
///
/// The variable holds a comma-separated list of `NAME=value` pairs, where
/// `NAME` is one of `ACCOUNT`, `TOKEN`, `TO` or `FROM`.
///
/// Returns `Ok(None)` when the variable cannot be read, in which case Twilio
/// notifications are disabled, and `Ok(Some(config))` when all four values
/// are present and non-empty.
///
/// # Errors
///
/// Returns a message describing the problem when a pair contains no `=`,
/// when a pair names an unknown key, or when any of the four values is
/// missing or empty.
fn get_twilio_config() -> Result<Option<TwilioWebHook>, String> {
    let config_var = match env::var("TWILIO_CONFIG") {
        Ok(value) => value,
        Err(_) => {
            info!("Twilio notifications disabled");
            return Ok(None);
        }
    };

    let mut config = TwilioWebHook::default();
    for pair in config_var.split(',') {
        // Split on the first `=` only, so a value that itself contains `=`
        // is kept whole instead of being rejected as malformed.
        let mut parts = pair.splitn(2, '=');
        let (name, value) = match (parts.next(), parts.next()) {
            (Some(name), Some(value)) => (name, value.to_string()),
            _ => return Err(format!("TWILIO_CONFIG is invalid: '{}'", pair)),
        };
        match name {
            "ACCOUNT" => config.account = value,
            "TOKEN" => config.token = value,
            "TO" => config.to = value,
            "FROM" => config.from = value,
            _ => return Err(format!("TWILIO_CONFIG is invalid: '{}'", pair)),
        }
    }

    if !config.complete() {
        return Err("TWILIO_CONFIG is incomplete".to_string());
    }
    Ok(Some(config))
}
pub struct Notifier { pub struct Notifier {
client: Client, client: Client,
discord_webhook: Option<String>, discord_webhook: Option<String>,
slack_webhook: Option<String>, slack_webhook: Option<String>,
telegram_webhook: Option<TelegramWebHook>, telegram_webhook: Option<TelegramWebHook>,
twilio_webhook: Option<TwilioWebHook>,
} }
impl Notifier { impl Notifier {
@ -35,12 +84,16 @@ impl Notifier {
info!("Telegram notifications disabled"); info!("Telegram notifications disabled");
None None
}; };
let twilio_webhook = get_twilio_config()
.map_err(|err| panic!("Twilio config error: {}", err))
.unwrap();
Notifier { Notifier {
client: Client::new(), client: Client::new(),
discord_webhook, discord_webhook,
slack_webhook, slack_webhook,
telegram_webhook, telegram_webhook,
twilio_webhook,
} }
} }
@ -67,5 +120,22 @@ impl Notifier {
warn!("Failed to send Telegram message: {:?}", err); warn!("Failed to send Telegram message: {:?}", err);
} }
} }
if let Some(TwilioWebHook {
account,
token,
to,
from,
}) = &self.twilio_webhook
{
let url = format!(
"https://{}:{}@api.twilio.com/2010-04-01/Accounts/{}/Messages.json",
account, token, account
);
let params = [("To", to), ("From", from), ("Body", &msg.to_string())];
if let Err(err) = self.client.post(&url).form(&params).send() {
warn!("Failed to send Twilio message: {:?}", err);
}
}
} }
} }