refactor container status check (#30998)

* refactor container status check

* remove blank line at EOF

* add pagerduty integration

Co-authored-by: axleiro <83293196+axleiro@users.noreply.github.com>

* fix discord webhook reference

* remove webhook references

---------

Co-authored-by: axleiro <83293196+axleiro@users.noreply.github.com>
This commit is contained in:
joeaba 2023-03-30 22:35:21 -05:00 committed by GitHub
parent a1149ecafe
commit eedb92a6c0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 37 additions and 29 deletions

2
metrics/metrics-main/alertmanager.sh Normal file → Executable file
View File

@@ -39,6 +39,6 @@ sudo docker run -it -d \
--user root:root \ --user root:root \
--publish 9093:9093 \ --publish 9093:9093 \
--name=alertmanager \ --name=alertmanager \
--volume "PWD"/alertmanager.yml:/etc/alertmanager/alertmanager.yml \ --volume "$PWD"/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
--volume /etc/hosts:/etc/hosts \ --volume /etc/hosts:/etc/hosts \
$ALERTMANAGER_IMAGE $ALERTMANAGER_IMAGE

64
metrics/metrics-main/status.sh Normal file → Executable file
View File

@@ -1,32 +1,40 @@
#!/bin/bash -ex #!/bin/bash
#
# Status of the InfluxDB/Chronograf/Grafana/Chronograf_8889 containers
#
cd "$(dirname "$0")"
if [[ -z $HOST ]]; then # List of containers
HOST=metrics.solana.com containers=("chronograf_8889" "grafana" "alertmanager" "alertmanager-discord" "prometheus" "chronograf" "kapacitor")
fi
echo "HOST: $HOST"
echo +++ status # Send a message to Discord
( send_discord_message() {
set -x local message="$1"
pwd curl -sS -H "Content-Type: application/json" -X POST -d "{\"content\": \"$message\"}" "$DISCORD_WEBHOOK"
sudo docker ps --no-trunc --size }
df -h
free -h
uptime
)
# If the container is not in the running state or is in the exited state, then send a notification to Slack and redeploy the container again # Send a critical alert to PagerDuty
send_pagerduty_alert() {
local description="$1"
curl -sS -H "Content-Type: application/json" -X POST -d "{\"event_action\": \"trigger\", \"payload\": {\"summary\": \"$description\", \"source\": \"Docker Monitor\", \"severity\": \"critical\"}}" "$PAGERDUTY_WEBHOOK"
}
for container in chronograf_8889 grafana alertmanager alertmanager-discord prometheus chronograf kapacitor ; do # Iterate over the containers and check their status
if [ "$(sudo docker inspect --format='{{.State.Status}}' $container)" != "running" ] || [ "$(sudo docker inspect --format='{{.State.Status}}' $container)" = "exited" ]; then for container in "${containers[@]}"; do
curl -X POST -H 'Content-type: application/json' --data '{"text": "'"$container"' container is down in the metrics-mainsystem server. Restarting..."}' "$SLACK_WEBHOOK" container_status=$(docker inspect --format '{{.State.Status}}' "$container" 2>/dev/null)
curl -X POST -H 'Content-type: application/json' --data '{"content": "'"$container"' container is down in the metrics-mainsystem server. Restarting..."}' "$DISCORD_WEBHOOK"
echo "Starting up script" if [ "$container_status" != "running" ]; then
sudo bash $container.sh send_discord_message "$container is down and it's being redeployed..."
sleep 30
fi # Run the container.sh script to redeploy the container
done chmod +x "$container.sh"
./"$container.sh"
sleep 10
# Check the container status again
container_status=$(docker inspect --format '{{.State.Status}}' "$container" 2>/dev/null)
if [ "$container_status" != "running" ]; then
send_discord_message "$container failed to redeploy and manual intervention is required"
send_pagerduty_alert "$container failed to redeploy and manual intervention is required."
else
send_discord_message "$container has been redeployed successfully"
fi
fi
done