Resolve PagerDuty incident on All Clear instead of triggering new incident (#28232)
This commit is contained in:
parent
c899ededfc
commit
459c9699b8
|
@ -5668,6 +5668,7 @@ dependencies = [
|
||||||
"log",
|
"log",
|
||||||
"reqwest",
|
"reqwest",
|
||||||
"serde_json",
|
"serde_json",
|
||||||
|
"solana-sdk 1.15.0",
|
||||||
]
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
|
|
|
@ -13,6 +13,7 @@ edition = "2021"
|
||||||
log = "0.4.17"
|
log = "0.4.17"
|
||||||
reqwest = { version = "0.11.12", default-features = false, features = ["blocking", "brotli", "deflate", "gzip", "rustls-tls", "json"] }
|
reqwest = { version = "0.11.12", default-features = false, features = ["blocking", "brotli", "deflate", "gzip", "rustls-tls", "json"] }
|
||||||
serde_json = "1.0"
|
serde_json = "1.0"
|
||||||
|
solana-sdk = { path = "../sdk", version = "=1.15.0" }
|
||||||
|
|
||||||
[lib]
|
[lib]
|
||||||
name = "solana_notifier"
|
name = "solana_notifier"
|
||||||
|
|
|
@ -27,6 +27,7 @@ use log::*;
|
||||||
use {
|
use {
|
||||||
reqwest::{blocking::Client, StatusCode},
|
reqwest::{blocking::Client, StatusCode},
|
||||||
serde_json::json,
|
serde_json::json,
|
||||||
|
solana_sdk::hash::Hash,
|
||||||
std::{env, str::FromStr, thread::sleep, time::Duration},
|
std::{env, str::FromStr, thread::sleep, time::Duration},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
@ -83,7 +84,7 @@ fn get_twilio_config() -> Result<Option<TwilioWebHook>, String> {
|
||||||
Ok(Some(config))
|
Ok(Some(config))
|
||||||
}
|
}
|
||||||
|
|
||||||
enum NotificationType {
|
enum NotificationChannel {
|
||||||
Discord(String),
|
Discord(String),
|
||||||
Slack(String),
|
Slack(String),
|
||||||
PagerDuty(String),
|
PagerDuty(String),
|
||||||
|
@ -92,9 +93,15 @@ enum NotificationType {
|
||||||
Log(Level),
|
Log(Level),
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(Clone)]
|
||||||
|
pub enum NotificationType {
|
||||||
|
Trigger { incident: Hash },
|
||||||
|
Resolve { incident: Hash },
|
||||||
|
}
|
||||||
|
|
||||||
pub struct Notifier {
|
pub struct Notifier {
|
||||||
client: Client,
|
client: Client,
|
||||||
notifiers: Vec<NotificationType>,
|
notifiers: Vec<NotificationChannel>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Notifier {
|
impl Notifier {
|
||||||
|
@ -108,32 +115,32 @@ impl Notifier {
|
||||||
let mut notifiers = vec![];
|
let mut notifiers = vec![];
|
||||||
|
|
||||||
if let Ok(webhook) = env::var(format!("{}DISCORD_WEBHOOK", env_prefix)) {
|
if let Ok(webhook) = env::var(format!("{}DISCORD_WEBHOOK", env_prefix)) {
|
||||||
notifiers.push(NotificationType::Discord(webhook));
|
notifiers.push(NotificationChannel::Discord(webhook));
|
||||||
}
|
}
|
||||||
if let Ok(webhook) = env::var(format!("{}SLACK_WEBHOOK", env_prefix)) {
|
if let Ok(webhook) = env::var(format!("{}SLACK_WEBHOOK", env_prefix)) {
|
||||||
notifiers.push(NotificationType::Slack(webhook));
|
notifiers.push(NotificationChannel::Slack(webhook));
|
||||||
}
|
}
|
||||||
if let Ok(routing_key) = env::var(format!("{}PAGERDUTY_INTEGRATION_KEY", env_prefix)) {
|
if let Ok(routing_key) = env::var(format!("{}PAGERDUTY_INTEGRATION_KEY", env_prefix)) {
|
||||||
notifiers.push(NotificationType::PagerDuty(routing_key));
|
notifiers.push(NotificationChannel::PagerDuty(routing_key));
|
||||||
}
|
}
|
||||||
|
|
||||||
if let (Ok(bot_token), Ok(chat_id)) = (
|
if let (Ok(bot_token), Ok(chat_id)) = (
|
||||||
env::var(format!("{}TELEGRAM_BOT_TOKEN", env_prefix)),
|
env::var(format!("{}TELEGRAM_BOT_TOKEN", env_prefix)),
|
||||||
env::var(format!("{}TELEGRAM_CHAT_ID", env_prefix)),
|
env::var(format!("{}TELEGRAM_CHAT_ID", env_prefix)),
|
||||||
) {
|
) {
|
||||||
notifiers.push(NotificationType::Telegram(TelegramWebHook {
|
notifiers.push(NotificationChannel::Telegram(TelegramWebHook {
|
||||||
bot_token,
|
bot_token,
|
||||||
chat_id,
|
chat_id,
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Ok(Some(webhook)) = get_twilio_config() {
|
if let Ok(Some(webhook)) = get_twilio_config() {
|
||||||
notifiers.push(NotificationType::Twilio(webhook));
|
notifiers.push(NotificationChannel::Twilio(webhook));
|
||||||
}
|
}
|
||||||
|
|
||||||
if let Ok(log_level) = env::var(format!("{}LOG_NOTIFIER_LEVEL", env_prefix)) {
|
if let Ok(log_level) = env::var(format!("{}LOG_NOTIFIER_LEVEL", env_prefix)) {
|
||||||
match Level::from_str(&log_level) {
|
match Level::from_str(&log_level) {
|
||||||
Ok(level) => notifiers.push(NotificationType::Log(level)),
|
Ok(level) => notifiers.push(NotificationChannel::Log(level)),
|
||||||
Err(e) => warn!(
|
Err(e) => warn!(
|
||||||
"could not parse specified log notifier level string ({}): {}",
|
"could not parse specified log notifier level string ({}): {}",
|
||||||
log_level, e
|
log_level, e
|
||||||
|
@ -153,10 +160,10 @@ impl Notifier {
|
||||||
self.notifiers.is_empty()
|
self.notifiers.is_empty()
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn send(&self, msg: &str) {
|
pub fn send(&self, msg: &str, notification_type: &NotificationType) {
|
||||||
for notifier in &self.notifiers {
|
for notifier in &self.notifiers {
|
||||||
match notifier {
|
match notifier {
|
||||||
NotificationType::Discord(webhook) => {
|
NotificationChannel::Discord(webhook) => {
|
||||||
for line in msg.split('\n') {
|
for line in msg.split('\n') {
|
||||||
// Discord rate limiting is aggressive, limit to 1 message a second
|
// Discord rate limiting is aggressive, limit to 1 message a second
|
||||||
sleep(Duration::from_millis(1000));
|
sleep(Duration::from_millis(1000));
|
||||||
|
@ -183,14 +190,23 @@ impl Notifier {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
NotificationType::Slack(webhook) => {
|
NotificationChannel::Slack(webhook) => {
|
||||||
let data = json!({ "text": msg });
|
let data = json!({ "text": msg });
|
||||||
if let Err(err) = self.client.post(webhook).json(&data).send() {
|
if let Err(err) = self.client.post(webhook).json(&data).send() {
|
||||||
warn!("Failed to send Slack message: {:?}", err);
|
warn!("Failed to send Slack message: {:?}", err);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
NotificationType::PagerDuty(routing_key) => {
|
NotificationChannel::PagerDuty(routing_key) => {
|
||||||
let data = json!({"payload":{"summary":msg,"source":"solana-watchtower","severity":"critical"},"routing_key":routing_key,"event_action":"trigger"});
|
let event_action = match notification_type {
|
||||||
|
NotificationType::Trigger { incident: _ } => String::from("trigger"),
|
||||||
|
NotificationType::Resolve { incident: _ } => String::from("resolve"),
|
||||||
|
};
|
||||||
|
let dedup_key = match notification_type {
|
||||||
|
NotificationType::Trigger { ref incident } => incident.clone().to_string(),
|
||||||
|
NotificationType::Resolve { ref incident } => incident.clone().to_string(),
|
||||||
|
};
|
||||||
|
|
||||||
|
let data = json!({"payload":{"summary":msg,"source":"solana-watchtower","severity":"critical"},"routing_key":routing_key,"event_action":event_action,"dedup_key":dedup_key});
|
||||||
let url = "https://events.pagerduty.com/v2/enqueue";
|
let url = "https://events.pagerduty.com/v2/enqueue";
|
||||||
|
|
||||||
if let Err(err) = self.client.post(url).json(&data).send() {
|
if let Err(err) = self.client.post(url).json(&data).send() {
|
||||||
|
@ -198,7 +214,7 @@ impl Notifier {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
NotificationType::Telegram(TelegramWebHook { chat_id, bot_token }) => {
|
NotificationChannel::Telegram(TelegramWebHook { chat_id, bot_token }) => {
|
||||||
let data = json!({ "chat_id": chat_id, "text": msg });
|
let data = json!({ "chat_id": chat_id, "text": msg });
|
||||||
let url = format!("https://api.telegram.org/bot{}/sendMessage", bot_token);
|
let url = format!("https://api.telegram.org/bot{}/sendMessage", bot_token);
|
||||||
|
|
||||||
|
@ -207,7 +223,7 @@ impl Notifier {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
NotificationType::Twilio(TwilioWebHook {
|
NotificationChannel::Twilio(TwilioWebHook {
|
||||||
account,
|
account,
|
||||||
token,
|
token,
|
||||||
to,
|
to,
|
||||||
|
@ -222,7 +238,7 @@ impl Notifier {
|
||||||
warn!("Failed to send Twilio message: {:?}", err);
|
warn!("Failed to send Twilio message: {:?}", err);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
NotificationType::Log(level) => {
|
NotificationChannel::Log(level) => {
|
||||||
log!(*level, "{}", msg)
|
log!(*level, "{}", msg)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -10,7 +10,7 @@ use {
|
||||||
},
|
},
|
||||||
solana_cli_output::display::format_labeled_address,
|
solana_cli_output::display::format_labeled_address,
|
||||||
solana_metrics::{datapoint_error, datapoint_info},
|
solana_metrics::{datapoint_error, datapoint_info},
|
||||||
solana_notifier::Notifier,
|
solana_notifier::{NotificationType, Notifier},
|
||||||
solana_rpc_client::rpc_client::RpcClient,
|
solana_rpc_client::rpc_client::RpcClient,
|
||||||
solana_rpc_client_api::{client_error, response::RpcVoteAccountStatus},
|
solana_rpc_client_api::{client_error, response::RpcVoteAccountStatus},
|
||||||
solana_sdk::{
|
solana_sdk::{
|
||||||
|
@ -244,6 +244,7 @@ fn main() -> Result<(), Box<dyn error::Error>> {
|
||||||
let mut last_notification_msg = "".into();
|
let mut last_notification_msg = "".into();
|
||||||
let mut num_consecutive_failures = 0;
|
let mut num_consecutive_failures = 0;
|
||||||
let mut last_success = Instant::now();
|
let mut last_success = Instant::now();
|
||||||
|
let mut incident = Hash::new_unique();
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
let failure = match get_cluster_info(&config, &rpc_client) {
|
let failure = match get_cluster_info(&config, &rpc_client) {
|
||||||
|
@ -373,7 +374,7 @@ fn main() -> Result<(), Box<dyn error::Error>> {
|
||||||
if num_consecutive_failures > config.unhealthy_threshold {
|
if num_consecutive_failures > config.unhealthy_threshold {
|
||||||
datapoint_info!("watchtower-sanity", ("ok", false, bool));
|
datapoint_info!("watchtower-sanity", ("ok", false, bool));
|
||||||
if last_notification_msg != notification_msg {
|
if last_notification_msg != notification_msg {
|
||||||
notifier.send(&notification_msg);
|
notifier.send(&notification_msg, &NotificationType::Trigger { incident });
|
||||||
}
|
}
|
||||||
datapoint_error!(
|
datapoint_error!(
|
||||||
"watchtower-sanity-failure",
|
"watchtower-sanity-failure",
|
||||||
|
@ -399,14 +400,15 @@ fn main() -> Result<(), Box<dyn error::Error>> {
|
||||||
humantime::format_duration(alarm_duration)
|
humantime::format_duration(alarm_duration)
|
||||||
);
|
);
|
||||||
info!("{}", all_clear_msg);
|
info!("{}", all_clear_msg);
|
||||||
notifier.send(&format!(
|
notifier.send(
|
||||||
"solana-watchtower{}: {}",
|
&format!("solana-watchtower{}: {}", config.name_suffix, all_clear_msg),
|
||||||
config.name_suffix, all_clear_msg
|
&NotificationType::Resolve { incident },
|
||||||
));
|
);
|
||||||
}
|
}
|
||||||
last_notification_msg = "".into();
|
last_notification_msg = "".into();
|
||||||
last_success = Instant::now();
|
last_success = Instant::now();
|
||||||
num_consecutive_failures = 0;
|
num_consecutive_failures = 0;
|
||||||
|
incident = Hash::new_unique();
|
||||||
}
|
}
|
||||||
sleep(config.interval);
|
sleep(config.interval);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue