Resolve PagerDuty incident on All Clear instead of triggering new incident (#28232)

This commit is contained in:
Michael 2022-10-05 20:55:45 +02:00 committed by GitHub
parent c899ededfc
commit 459c9699b8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 42 additions and 22 deletions

1
Cargo.lock generated
View File

@ -5668,6 +5668,7 @@ dependencies = [
"log", "log",
"reqwest", "reqwest",
"serde_json", "serde_json",
"solana-sdk 1.15.0",
] ]
[[package]] [[package]]

View File

@ -13,6 +13,7 @@ edition = "2021"
log = "0.4.17" log = "0.4.17"
reqwest = { version = "0.11.12", default-features = false, features = ["blocking", "brotli", "deflate", "gzip", "rustls-tls", "json"] } reqwest = { version = "0.11.12", default-features = false, features = ["blocking", "brotli", "deflate", "gzip", "rustls-tls", "json"] }
serde_json = "1.0" serde_json = "1.0"
solana-sdk = { path = "../sdk", version = "=1.15.0" }
[lib] [lib]
name = "solana_notifier" name = "solana_notifier"

View File

@ -27,6 +27,7 @@ use log::*;
use { use {
reqwest::{blocking::Client, StatusCode}, reqwest::{blocking::Client, StatusCode},
serde_json::json, serde_json::json,
solana_sdk::hash::Hash,
std::{env, str::FromStr, thread::sleep, time::Duration}, std::{env, str::FromStr, thread::sleep, time::Duration},
}; };
@ -83,7 +84,7 @@ fn get_twilio_config() -> Result<Option<TwilioWebHook>, String> {
Ok(Some(config)) Ok(Some(config))
} }
enum NotificationType { enum NotificationChannel {
Discord(String), Discord(String),
Slack(String), Slack(String),
PagerDuty(String), PagerDuty(String),
@ -92,9 +93,15 @@ enum NotificationType {
Log(Level), Log(Level),
} }
/// The kind of event a notification represents, passed to `Notifier::send`.
///
/// Both variants carry the `Hash` identifying the incident they refer to. The
/// PagerDuty channel maps the variant to the `event_action` field ("trigger"
/// or "resolve") and sends the hash, rendered as a string, as the `dedup_key`.
/// NOTE(review): the other channels do not appear to consult this value in the
/// visible hunks — confirm against the full file.
#[derive(Clone)]
pub enum NotificationType {
/// Report a problem for `incident`; sent to PagerDuty as a "trigger" event.
Trigger { incident: Hash },
/// Report that `incident` is over; sent to PagerDuty as a "resolve" event.
Resolve { incident: Hash },
}
pub struct Notifier { pub struct Notifier {
client: Client, client: Client,
notifiers: Vec<NotificationType>, notifiers: Vec<NotificationChannel>,
} }
impl Notifier { impl Notifier {
@ -108,32 +115,32 @@ impl Notifier {
let mut notifiers = vec![]; let mut notifiers = vec![];
if let Ok(webhook) = env::var(format!("{}DISCORD_WEBHOOK", env_prefix)) { if let Ok(webhook) = env::var(format!("{}DISCORD_WEBHOOK", env_prefix)) {
notifiers.push(NotificationType::Discord(webhook)); notifiers.push(NotificationChannel::Discord(webhook));
} }
if let Ok(webhook) = env::var(format!("{}SLACK_WEBHOOK", env_prefix)) { if let Ok(webhook) = env::var(format!("{}SLACK_WEBHOOK", env_prefix)) {
notifiers.push(NotificationType::Slack(webhook)); notifiers.push(NotificationChannel::Slack(webhook));
} }
if let Ok(routing_key) = env::var(format!("{}PAGERDUTY_INTEGRATION_KEY", env_prefix)) { if let Ok(routing_key) = env::var(format!("{}PAGERDUTY_INTEGRATION_KEY", env_prefix)) {
notifiers.push(NotificationType::PagerDuty(routing_key)); notifiers.push(NotificationChannel::PagerDuty(routing_key));
} }
if let (Ok(bot_token), Ok(chat_id)) = ( if let (Ok(bot_token), Ok(chat_id)) = (
env::var(format!("{}TELEGRAM_BOT_TOKEN", env_prefix)), env::var(format!("{}TELEGRAM_BOT_TOKEN", env_prefix)),
env::var(format!("{}TELEGRAM_CHAT_ID", env_prefix)), env::var(format!("{}TELEGRAM_CHAT_ID", env_prefix)),
) { ) {
notifiers.push(NotificationType::Telegram(TelegramWebHook { notifiers.push(NotificationChannel::Telegram(TelegramWebHook {
bot_token, bot_token,
chat_id, chat_id,
})); }));
} }
if let Ok(Some(webhook)) = get_twilio_config() { if let Ok(Some(webhook)) = get_twilio_config() {
notifiers.push(NotificationType::Twilio(webhook)); notifiers.push(NotificationChannel::Twilio(webhook));
} }
if let Ok(log_level) = env::var(format!("{}LOG_NOTIFIER_LEVEL", env_prefix)) { if let Ok(log_level) = env::var(format!("{}LOG_NOTIFIER_LEVEL", env_prefix)) {
match Level::from_str(&log_level) { match Level::from_str(&log_level) {
Ok(level) => notifiers.push(NotificationType::Log(level)), Ok(level) => notifiers.push(NotificationChannel::Log(level)),
Err(e) => warn!( Err(e) => warn!(
"could not parse specified log notifier level string ({}): {}", "could not parse specified log notifier level string ({}): {}",
log_level, e log_level, e
@ -153,10 +160,10 @@ impl Notifier {
self.notifiers.is_empty() self.notifiers.is_empty()
} }
pub fn send(&self, msg: &str) { pub fn send(&self, msg: &str, notification_type: &NotificationType) {
for notifier in &self.notifiers { for notifier in &self.notifiers {
match notifier { match notifier {
NotificationType::Discord(webhook) => { NotificationChannel::Discord(webhook) => {
for line in msg.split('\n') { for line in msg.split('\n') {
// Discord rate limiting is aggressive, limit to 1 message a second // Discord rate limiting is aggressive, limit to 1 message a second
sleep(Duration::from_millis(1000)); sleep(Duration::from_millis(1000));
@ -183,14 +190,23 @@ impl Notifier {
} }
} }
} }
NotificationType::Slack(webhook) => { NotificationChannel::Slack(webhook) => {
let data = json!({ "text": msg }); let data = json!({ "text": msg });
if let Err(err) = self.client.post(webhook).json(&data).send() { if let Err(err) = self.client.post(webhook).json(&data).send() {
warn!("Failed to send Slack message: {:?}", err); warn!("Failed to send Slack message: {:?}", err);
} }
} }
NotificationType::PagerDuty(routing_key) => { NotificationChannel::PagerDuty(routing_key) => {
let data = json!({"payload":{"summary":msg,"source":"solana-watchtower","severity":"critical"},"routing_key":routing_key,"event_action":"trigger"}); let event_action = match notification_type {
NotificationType::Trigger { incident: _ } => String::from("trigger"),
NotificationType::Resolve { incident: _ } => String::from("resolve"),
};
let dedup_key = match notification_type {
NotificationType::Trigger { ref incident } => incident.clone().to_string(),
NotificationType::Resolve { ref incident } => incident.clone().to_string(),
};
let data = json!({"payload":{"summary":msg,"source":"solana-watchtower","severity":"critical"},"routing_key":routing_key,"event_action":event_action,"dedup_key":dedup_key});
let url = "https://events.pagerduty.com/v2/enqueue"; let url = "https://events.pagerduty.com/v2/enqueue";
if let Err(err) = self.client.post(url).json(&data).send() { if let Err(err) = self.client.post(url).json(&data).send() {
@ -198,7 +214,7 @@ impl Notifier {
} }
} }
NotificationType::Telegram(TelegramWebHook { chat_id, bot_token }) => { NotificationChannel::Telegram(TelegramWebHook { chat_id, bot_token }) => {
let data = json!({ "chat_id": chat_id, "text": msg }); let data = json!({ "chat_id": chat_id, "text": msg });
let url = format!("https://api.telegram.org/bot{}/sendMessage", bot_token); let url = format!("https://api.telegram.org/bot{}/sendMessage", bot_token);
@ -207,7 +223,7 @@ impl Notifier {
} }
} }
NotificationType::Twilio(TwilioWebHook { NotificationChannel::Twilio(TwilioWebHook {
account, account,
token, token,
to, to,
@ -222,7 +238,7 @@ impl Notifier {
warn!("Failed to send Twilio message: {:?}", err); warn!("Failed to send Twilio message: {:?}", err);
} }
} }
NotificationType::Log(level) => { NotificationChannel::Log(level) => {
log!(*level, "{}", msg) log!(*level, "{}", msg)
} }
} }

View File

@ -10,7 +10,7 @@ use {
}, },
solana_cli_output::display::format_labeled_address, solana_cli_output::display::format_labeled_address,
solana_metrics::{datapoint_error, datapoint_info}, solana_metrics::{datapoint_error, datapoint_info},
solana_notifier::Notifier, solana_notifier::{NotificationType, Notifier},
solana_rpc_client::rpc_client::RpcClient, solana_rpc_client::rpc_client::RpcClient,
solana_rpc_client_api::{client_error, response::RpcVoteAccountStatus}, solana_rpc_client_api::{client_error, response::RpcVoteAccountStatus},
solana_sdk::{ solana_sdk::{
@ -244,6 +244,7 @@ fn main() -> Result<(), Box<dyn error::Error>> {
let mut last_notification_msg = "".into(); let mut last_notification_msg = "".into();
let mut num_consecutive_failures = 0; let mut num_consecutive_failures = 0;
let mut last_success = Instant::now(); let mut last_success = Instant::now();
let mut incident = Hash::new_unique();
loop { loop {
let failure = match get_cluster_info(&config, &rpc_client) { let failure = match get_cluster_info(&config, &rpc_client) {
@ -373,7 +374,7 @@ fn main() -> Result<(), Box<dyn error::Error>> {
if num_consecutive_failures > config.unhealthy_threshold { if num_consecutive_failures > config.unhealthy_threshold {
datapoint_info!("watchtower-sanity", ("ok", false, bool)); datapoint_info!("watchtower-sanity", ("ok", false, bool));
if last_notification_msg != notification_msg { if last_notification_msg != notification_msg {
notifier.send(&notification_msg); notifier.send(&notification_msg, &NotificationType::Trigger { incident });
} }
datapoint_error!( datapoint_error!(
"watchtower-sanity-failure", "watchtower-sanity-failure",
@ -399,14 +400,15 @@ fn main() -> Result<(), Box<dyn error::Error>> {
humantime::format_duration(alarm_duration) humantime::format_duration(alarm_duration)
); );
info!("{}", all_clear_msg); info!("{}", all_clear_msg);
notifier.send(&format!( notifier.send(
"solana-watchtower{}: {}", &format!("solana-watchtower{}: {}", config.name_suffix, all_clear_msg),
config.name_suffix, all_clear_msg &NotificationType::Resolve { incident },
)); );
} }
last_notification_msg = "".into(); last_notification_msg = "".into();
last_success = Instant::now(); last_success = Instant::now();
num_consecutive_failures = 0; num_consecutive_failures = 0;
incident = Hash::new_unique();
} }
sleep(config.interval); sleep(config.interval);
} }