From eedb92a6c0e2df0c3bdb8e39d954f41d5154216d Mon Sep 17 00:00:00 2001 From: joeaba <77398477+joeaba@users.noreply.github.com> Date: Thu, 30 Mar 2023 22:35:21 -0500 Subject: [PATCH] refactor container status check (#30998) * refactor container status check * remove blank line at EOF * add pagerduty integration Co-authored-by: axleiro <83293196+axleiro@users.noreply.github.com> * fix discord webhook reference * remove webhook references --------- Co-authored-by: axleiro <83293196+axleiro@users.noreply.github.com> --- metrics/metrics-main/alertmanager.sh | 2 +- metrics/metrics-main/status.sh | 64 ++++++++++++++++------------ 2 files changed, 37 insertions(+), 29 deletions(-) mode change 100644 => 100755 metrics/metrics-main/alertmanager.sh mode change 100644 => 100755 metrics/metrics-main/status.sh diff --git a/metrics/metrics-main/alertmanager.sh b/metrics/metrics-main/alertmanager.sh old mode 100644 new mode 100755 index 0eba86297..5289f9a5a --- a/metrics/metrics-main/alertmanager.sh +++ b/metrics/metrics-main/alertmanager.sh @@ -39,6 +39,6 @@ sudo docker run -it -d \ --user root:root \ --publish 9093:9093 \ --name=alertmanager \ - --volume "PWD"/alertmanager.yml:/etc/alertmanager/alertmanager.yml \ + --volume "$PWD"/alertmanager.yml:/etc/alertmanager/alertmanager.yml \ --volume /etc/hosts:/etc/hosts \ $ALERTMANAGER_IMAGE diff --git a/metrics/metrics-main/status.sh b/metrics/metrics-main/status.sh old mode 100644 new mode 100755 index 58ccea30f..e1019e903 --- a/metrics/metrics-main/status.sh +++ b/metrics/metrics-main/status.sh @@ -1,32 +1,40 @@ -#!/bin/bash -ex -# -# Status of the InfluxDB/Chronograf/Grafana/Chronograf_8889 containers -# -cd "$(dirname "$0")" +#!/bin/bash -if [[ -z $HOST ]]; then - HOST=metrics.solana.com -fi -echo "HOST: $HOST" +# List of containers +containers=("chronograf_8889" "grafana" "alertmanager" "alertmanager-discord" "prometheus" "chronograf" "kapacitor") -echo +++ status -( - set -x - pwd - sudo docker ps --no-trunc --size - df -h - free -h - uptime -) +# Send a message to Discord +send_discord_message() { + local message="$1" + curl -sS -H "Content-Type: application/json" -X POST -d "{\"content\": \"$message\"}" "$DISCORD_WEBHOOK" +} -# If the container is not running state or exited state, then sent the notification on slack and redeploy the container again +# Send a critical alert to PagerDuty +send_pagerduty_alert() { + local description="$1" + curl -sS -H "Content-Type: application/json" -X POST -d "{\"event_action\": \"trigger\", \"payload\": {\"summary\": \"$description\", \"source\": \"Docker Monitor\", \"severity\": \"critical\"}}" "$PAGERDUTY_WEBHOOK" +} -for container in chronograf_8889 grafana alertmanager alertmanager-discord prometheus chronograf kapacitor ; do - if [ "$(sudo docker inspect --format='{{.State.Status}}' $container)" != "running" ] || [ "$(sudo docker inspect --format='{{.State.Status}}' $container)" = "exited" ]; then - curl -X POST -H 'Content-type: application/json' --data '{"text": "'"$container"' container is down in the metrics-mainsystem server. Restarting..."}' "$SLACK_WEBHOOK" - curl -X POST -H 'Content-type: application/json' --data '{"content": "'"$container"' container is down in the metrics-mainsystem server. Restarting..."}' "$DISCORD_WEBHOOK" - echo "Starting up script" - sudo bash $container.sh - sleep 30 - fi - done +# Iterate over the containers and check their status +for container in "${containers[@]}"; do + container_status=$(docker inspect --format '{{.State.Status}}' "$container" 2>/dev/null) + + if [ "$container_status" != "running" ]; then + send_discord_message "$container is down and it's being redeployed..." + + # Run the container.sh script to redeploy the container + chmod +x "$container.sh" + ./"$container.sh" + sleep 10 + + # Check the container status again + container_status=$(docker inspect --format '{{.State.Status}}' "$container" 2>/dev/null) + + if [ "$container_status" != "running" ]; then + send_discord_message "$container failed to redeploy and manual intervention is required" + send_pagerduty_alert "$container failed to redeploy and manual intervention is required." + else + send_discord_message "$container has been redeployed successfully" + fi + fi +done