Resolve PagerDuty incident on All Clear instead of triggering new incident (#28232)
This commit is contained in:
parent
c899ededfc
commit
459c9699b8
|
@ -5668,6 +5668,7 @@ dependencies = [
|
|||
"log",
|
||||
"reqwest",
|
||||
"serde_json",
|
||||
"solana-sdk 1.15.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
|
@ -13,6 +13,7 @@ edition = "2021"
|
|||
log = "0.4.17"
|
||||
reqwest = { version = "0.11.12", default-features = false, features = ["blocking", "brotli", "deflate", "gzip", "rustls-tls", "json"] }
|
||||
serde_json = "1.0"
|
||||
solana-sdk = { path = "../sdk", version = "=1.15.0" }
|
||||
|
||||
[lib]
|
||||
name = "solana_notifier"
|
||||
|
|
|
@ -27,6 +27,7 @@ use log::*;
|
|||
use {
|
||||
reqwest::{blocking::Client, StatusCode},
|
||||
serde_json::json,
|
||||
solana_sdk::hash::Hash,
|
||||
std::{env, str::FromStr, thread::sleep, time::Duration},
|
||||
};
|
||||
|
||||
|
@ -83,7 +84,7 @@ fn get_twilio_config() -> Result<Option<TwilioWebHook>, String> {
|
|||
Ok(Some(config))
|
||||
}
|
||||
|
||||
enum NotificationType {
|
||||
enum NotificationChannel {
|
||||
Discord(String),
|
||||
Slack(String),
|
||||
PagerDuty(String),
|
||||
|
@ -92,9 +93,15 @@ enum NotificationType {
|
|||
Log(Level),
|
||||
}
|
||||
|
||||
#[derive(Clone)]
|
||||
pub enum NotificationType {
|
||||
Trigger { incident: Hash },
|
||||
Resolve { incident: Hash },
|
||||
}
|
||||
|
||||
pub struct Notifier {
|
||||
client: Client,
|
||||
notifiers: Vec<NotificationType>,
|
||||
notifiers: Vec<NotificationChannel>,
|
||||
}
|
||||
|
||||
impl Notifier {
|
||||
|
@ -108,32 +115,32 @@ impl Notifier {
|
|||
let mut notifiers = vec![];
|
||||
|
||||
if let Ok(webhook) = env::var(format!("{}DISCORD_WEBHOOK", env_prefix)) {
|
||||
notifiers.push(NotificationType::Discord(webhook));
|
||||
notifiers.push(NotificationChannel::Discord(webhook));
|
||||
}
|
||||
if let Ok(webhook) = env::var(format!("{}SLACK_WEBHOOK", env_prefix)) {
|
||||
notifiers.push(NotificationType::Slack(webhook));
|
||||
notifiers.push(NotificationChannel::Slack(webhook));
|
||||
}
|
||||
if let Ok(routing_key) = env::var(format!("{}PAGERDUTY_INTEGRATION_KEY", env_prefix)) {
|
||||
notifiers.push(NotificationType::PagerDuty(routing_key));
|
||||
notifiers.push(NotificationChannel::PagerDuty(routing_key));
|
||||
}
|
||||
|
||||
if let (Ok(bot_token), Ok(chat_id)) = (
|
||||
env::var(format!("{}TELEGRAM_BOT_TOKEN", env_prefix)),
|
||||
env::var(format!("{}TELEGRAM_CHAT_ID", env_prefix)),
|
||||
) {
|
||||
notifiers.push(NotificationType::Telegram(TelegramWebHook {
|
||||
notifiers.push(NotificationChannel::Telegram(TelegramWebHook {
|
||||
bot_token,
|
||||
chat_id,
|
||||
}));
|
||||
}
|
||||
|
||||
if let Ok(Some(webhook)) = get_twilio_config() {
|
||||
notifiers.push(NotificationType::Twilio(webhook));
|
||||
notifiers.push(NotificationChannel::Twilio(webhook));
|
||||
}
|
||||
|
||||
if let Ok(log_level) = env::var(format!("{}LOG_NOTIFIER_LEVEL", env_prefix)) {
|
||||
match Level::from_str(&log_level) {
|
||||
Ok(level) => notifiers.push(NotificationType::Log(level)),
|
||||
Ok(level) => notifiers.push(NotificationChannel::Log(level)),
|
||||
Err(e) => warn!(
|
||||
"could not parse specified log notifier level string ({}): {}",
|
||||
log_level, e
|
||||
|
@ -153,10 +160,10 @@ impl Notifier {
|
|||
self.notifiers.is_empty()
|
||||
}
|
||||
|
||||
pub fn send(&self, msg: &str) {
|
||||
pub fn send(&self, msg: &str, notification_type: &NotificationType) {
|
||||
for notifier in &self.notifiers {
|
||||
match notifier {
|
||||
NotificationType::Discord(webhook) => {
|
||||
NotificationChannel::Discord(webhook) => {
|
||||
for line in msg.split('\n') {
|
||||
// Discord rate limiting is aggressive, limit to 1 message a second
|
||||
sleep(Duration::from_millis(1000));
|
||||
|
@ -183,14 +190,23 @@ impl Notifier {
|
|||
}
|
||||
}
|
||||
}
|
||||
NotificationType::Slack(webhook) => {
|
||||
NotificationChannel::Slack(webhook) => {
|
||||
let data = json!({ "text": msg });
|
||||
if let Err(err) = self.client.post(webhook).json(&data).send() {
|
||||
warn!("Failed to send Slack message: {:?}", err);
|
||||
}
|
||||
}
|
||||
NotificationType::PagerDuty(routing_key) => {
|
||||
let data = json!({"payload":{"summary":msg,"source":"solana-watchtower","severity":"critical"},"routing_key":routing_key,"event_action":"trigger"});
|
||||
NotificationChannel::PagerDuty(routing_key) => {
|
||||
let event_action = match notification_type {
|
||||
NotificationType::Trigger { incident: _ } => String::from("trigger"),
|
||||
NotificationType::Resolve { incident: _ } => String::from("resolve"),
|
||||
};
|
||||
let dedup_key = match notification_type {
|
||||
NotificationType::Trigger { ref incident } => incident.clone().to_string(),
|
||||
NotificationType::Resolve { ref incident } => incident.clone().to_string(),
|
||||
};
|
||||
|
||||
let data = json!({"payload":{"summary":msg,"source":"solana-watchtower","severity":"critical"},"routing_key":routing_key,"event_action":event_action,"dedup_key":dedup_key});
|
||||
let url = "https://events.pagerduty.com/v2/enqueue";
|
||||
|
||||
if let Err(err) = self.client.post(url).json(&data).send() {
|
||||
|
@ -198,7 +214,7 @@ impl Notifier {
|
|||
}
|
||||
}
|
||||
|
||||
NotificationType::Telegram(TelegramWebHook { chat_id, bot_token }) => {
|
||||
NotificationChannel::Telegram(TelegramWebHook { chat_id, bot_token }) => {
|
||||
let data = json!({ "chat_id": chat_id, "text": msg });
|
||||
let url = format!("https://api.telegram.org/bot{}/sendMessage", bot_token);
|
||||
|
||||
|
@ -207,7 +223,7 @@ impl Notifier {
|
|||
}
|
||||
}
|
||||
|
||||
NotificationType::Twilio(TwilioWebHook {
|
||||
NotificationChannel::Twilio(TwilioWebHook {
|
||||
account,
|
||||
token,
|
||||
to,
|
||||
|
@ -222,7 +238,7 @@ impl Notifier {
|
|||
warn!("Failed to send Twilio message: {:?}", err);
|
||||
}
|
||||
}
|
||||
NotificationType::Log(level) => {
|
||||
NotificationChannel::Log(level) => {
|
||||
log!(*level, "{}", msg)
|
||||
}
|
||||
}
|
||||
|
|
|
@ -10,7 +10,7 @@ use {
|
|||
},
|
||||
solana_cli_output::display::format_labeled_address,
|
||||
solana_metrics::{datapoint_error, datapoint_info},
|
||||
solana_notifier::Notifier,
|
||||
solana_notifier::{NotificationType, Notifier},
|
||||
solana_rpc_client::rpc_client::RpcClient,
|
||||
solana_rpc_client_api::{client_error, response::RpcVoteAccountStatus},
|
||||
solana_sdk::{
|
||||
|
@ -244,6 +244,7 @@ fn main() -> Result<(), Box<dyn error::Error>> {
|
|||
let mut last_notification_msg = "".into();
|
||||
let mut num_consecutive_failures = 0;
|
||||
let mut last_success = Instant::now();
|
||||
let mut incident = Hash::new_unique();
|
||||
|
||||
loop {
|
||||
let failure = match get_cluster_info(&config, &rpc_client) {
|
||||
|
@ -373,7 +374,7 @@ fn main() -> Result<(), Box<dyn error::Error>> {
|
|||
if num_consecutive_failures > config.unhealthy_threshold {
|
||||
datapoint_info!("watchtower-sanity", ("ok", false, bool));
|
||||
if last_notification_msg != notification_msg {
|
||||
notifier.send(&notification_msg);
|
||||
notifier.send(&notification_msg, &NotificationType::Trigger { incident });
|
||||
}
|
||||
datapoint_error!(
|
||||
"watchtower-sanity-failure",
|
||||
|
@ -399,14 +400,15 @@ fn main() -> Result<(), Box<dyn error::Error>> {
|
|||
humantime::format_duration(alarm_duration)
|
||||
);
|
||||
info!("{}", all_clear_msg);
|
||||
notifier.send(&format!(
|
||||
"solana-watchtower{}: {}",
|
||||
config.name_suffix, all_clear_msg
|
||||
));
|
||||
notifier.send(
|
||||
&format!("solana-watchtower{}: {}", config.name_suffix, all_clear_msg),
|
||||
&NotificationType::Resolve { incident },
|
||||
);
|
||||
}
|
||||
last_notification_msg = "".into();
|
||||
last_success = Instant::now();
|
||||
num_consecutive_failures = 0;
|
||||
incident = Hash::new_unique();
|
||||
}
|
||||
sleep(config.interval);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue