Resolve PagerDuty incident on All Clear instead of triggering new incident (#28232)

This commit is contained in:
Michael 2022-10-05 20:55:45 +02:00 committed by GitHub
parent c899ededfc
commit 459c9699b8
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 42 additions and 22 deletions

1
Cargo.lock generated
View File

@ -5668,6 +5668,7 @@ dependencies = [
"log",
"reqwest",
"serde_json",
"solana-sdk 1.15.0",
]
[[package]]

View File

@ -13,6 +13,7 @@ edition = "2021"
log = "0.4.17"
reqwest = { version = "0.11.12", default-features = false, features = ["blocking", "brotli", "deflate", "gzip", "rustls-tls", "json"] }
serde_json = "1.0"
solana-sdk = { path = "../sdk", version = "=1.15.0" }
[lib]
name = "solana_notifier"

View File

@ -27,6 +27,7 @@ use log::*;
use {
reqwest::{blocking::Client, StatusCode},
serde_json::json,
solana_sdk::hash::Hash,
std::{env, str::FromStr, thread::sleep, time::Duration},
};
@ -83,7 +84,7 @@ fn get_twilio_config() -> Result<Option<TwilioWebHook>, String> {
Ok(Some(config))
}
enum NotificationType {
enum NotificationChannel {
Discord(String),
Slack(String),
PagerDuty(String),
@ -92,9 +93,15 @@ enum NotificationType {
Log(Level),
}
/// Kind of notification handed to `Notifier::send` alongside the message text.
///
/// Both variants carry an `incident` hash. The PagerDuty channel turns that
/// hash into a string and sends it as the event's `dedup_key`, and maps the
/// variant to the event action (`"trigger"` or `"resolve"`).
/// NOTE(review): in the visible channel arms only PagerDuty reads this value;
/// the Discord and Twilio arms are partly outside this view — confirm.
#[derive(Clone)]
pub enum NotificationType {
/// Reports a failure; sent to PagerDuty with event action `"trigger"`.
Trigger { incident: Hash },
/// Reports an all-clear; sent to PagerDuty with event action `"resolve"`.
/// The watchtower caller reuses the `incident` hash of the preceding
/// `Trigger`, so both events share one `dedup_key`.
Resolve { incident: Hash },
}
pub struct Notifier {
client: Client,
notifiers: Vec<NotificationType>,
notifiers: Vec<NotificationChannel>,
}
impl Notifier {
@ -108,32 +115,32 @@ impl Notifier {
let mut notifiers = vec![];
if let Ok(webhook) = env::var(format!("{}DISCORD_WEBHOOK", env_prefix)) {
notifiers.push(NotificationType::Discord(webhook));
notifiers.push(NotificationChannel::Discord(webhook));
}
if let Ok(webhook) = env::var(format!("{}SLACK_WEBHOOK", env_prefix)) {
notifiers.push(NotificationType::Slack(webhook));
notifiers.push(NotificationChannel::Slack(webhook));
}
if let Ok(routing_key) = env::var(format!("{}PAGERDUTY_INTEGRATION_KEY", env_prefix)) {
notifiers.push(NotificationType::PagerDuty(routing_key));
notifiers.push(NotificationChannel::PagerDuty(routing_key));
}
if let (Ok(bot_token), Ok(chat_id)) = (
env::var(format!("{}TELEGRAM_BOT_TOKEN", env_prefix)),
env::var(format!("{}TELEGRAM_CHAT_ID", env_prefix)),
) {
notifiers.push(NotificationType::Telegram(TelegramWebHook {
notifiers.push(NotificationChannel::Telegram(TelegramWebHook {
bot_token,
chat_id,
}));
}
if let Ok(Some(webhook)) = get_twilio_config() {
notifiers.push(NotificationType::Twilio(webhook));
notifiers.push(NotificationChannel::Twilio(webhook));
}
if let Ok(log_level) = env::var(format!("{}LOG_NOTIFIER_LEVEL", env_prefix)) {
match Level::from_str(&log_level) {
Ok(level) => notifiers.push(NotificationType::Log(level)),
Ok(level) => notifiers.push(NotificationChannel::Log(level)),
Err(e) => warn!(
"could not parse specified log notifier level string ({}): {}",
log_level, e
@ -153,10 +160,10 @@ impl Notifier {
self.notifiers.is_empty()
}
pub fn send(&self, msg: &str) {
pub fn send(&self, msg: &str, notification_type: &NotificationType) {
for notifier in &self.notifiers {
match notifier {
NotificationType::Discord(webhook) => {
NotificationChannel::Discord(webhook) => {
for line in msg.split('\n') {
// Discord rate limiting is aggressive, limit to 1 message a second
sleep(Duration::from_millis(1000));
@ -183,14 +190,23 @@ impl Notifier {
}
}
}
NotificationType::Slack(webhook) => {
NotificationChannel::Slack(webhook) => {
let data = json!({ "text": msg });
if let Err(err) = self.client.post(webhook).json(&data).send() {
warn!("Failed to send Slack message: {:?}", err);
}
}
NotificationType::PagerDuty(routing_key) => {
let data = json!({"payload":{"summary":msg,"source":"solana-watchtower","severity":"critical"},"routing_key":routing_key,"event_action":"trigger"});
NotificationChannel::PagerDuty(routing_key) => {
let event_action = match notification_type {
NotificationType::Trigger { incident: _ } => String::from("trigger"),
NotificationType::Resolve { incident: _ } => String::from("resolve"),
};
let dedup_key = match notification_type {
NotificationType::Trigger { ref incident } => incident.clone().to_string(),
NotificationType::Resolve { ref incident } => incident.clone().to_string(),
};
let data = json!({"payload":{"summary":msg,"source":"solana-watchtower","severity":"critical"},"routing_key":routing_key,"event_action":event_action,"dedup_key":dedup_key});
let url = "https://events.pagerduty.com/v2/enqueue";
if let Err(err) = self.client.post(url).json(&data).send() {
@ -198,7 +214,7 @@ impl Notifier {
}
}
NotificationType::Telegram(TelegramWebHook { chat_id, bot_token }) => {
NotificationChannel::Telegram(TelegramWebHook { chat_id, bot_token }) => {
let data = json!({ "chat_id": chat_id, "text": msg });
let url = format!("https://api.telegram.org/bot{}/sendMessage", bot_token);
@ -207,7 +223,7 @@ impl Notifier {
}
}
NotificationType::Twilio(TwilioWebHook {
NotificationChannel::Twilio(TwilioWebHook {
account,
token,
to,
@ -222,7 +238,7 @@ impl Notifier {
warn!("Failed to send Twilio message: {:?}", err);
}
}
NotificationType::Log(level) => {
NotificationChannel::Log(level) => {
log!(*level, "{}", msg)
}
}

View File

@ -10,7 +10,7 @@ use {
},
solana_cli_output::display::format_labeled_address,
solana_metrics::{datapoint_error, datapoint_info},
solana_notifier::Notifier,
solana_notifier::{NotificationType, Notifier},
solana_rpc_client::rpc_client::RpcClient,
solana_rpc_client_api::{client_error, response::RpcVoteAccountStatus},
solana_sdk::{
@ -244,6 +244,7 @@ fn main() -> Result<(), Box<dyn error::Error>> {
let mut last_notification_msg = "".into();
let mut num_consecutive_failures = 0;
let mut last_success = Instant::now();
let mut incident = Hash::new_unique();
loop {
let failure = match get_cluster_info(&config, &rpc_client) {
@ -373,7 +374,7 @@ fn main() -> Result<(), Box<dyn error::Error>> {
if num_consecutive_failures > config.unhealthy_threshold {
datapoint_info!("watchtower-sanity", ("ok", false, bool));
if last_notification_msg != notification_msg {
notifier.send(&notification_msg);
notifier.send(&notification_msg, &NotificationType::Trigger { incident });
}
datapoint_error!(
"watchtower-sanity-failure",
@ -399,14 +400,15 @@ fn main() -> Result<(), Box<dyn error::Error>> {
humantime::format_duration(alarm_duration)
);
info!("{}", all_clear_msg);
notifier.send(&format!(
"solana-watchtower{}: {}",
config.name_suffix, all_clear_msg
));
notifier.send(
&format!("solana-watchtower{}: {}", config.name_suffix, all_clear_msg),
&NotificationType::Resolve { incident },
);
}
last_notification_msg = "".into();
last_success = Instant::now();
num_consecutive_failures = 0;
incident = Hash::new_unique();
}
sleep(config.interval);
}