refactor container status check (#30998)
* refactor container status check * remove blank line at EOF * add pagerduty integration Co-authored-by: axleiro <83293196+axleiro@users.noreply.github.com> * fix discord webhook reference * remove webhook references --------- Co-authored-by: axleiro <83293196+axleiro@users.noreply.github.com>
This commit is contained in:
parent
a1149ecafe
commit
eedb92a6c0
|
@ -39,6 +39,6 @@ sudo docker run -it -d \
|
|||
--user root:root \
|
||||
--publish 9093:9093 \
|
||||
--name=alertmanager \
|
||||
--volume "PWD"/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
|
||||
--volume "$PWD"/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
|
||||
--volume /etc/hosts:/etc/hosts \
|
||||
$ALERTMANAGER_IMAGE
|
||||
|
|
|
@ -1,32 +1,40 @@
|
|||
#!/bin/bash -ex
|
||||
#
|
||||
# Status of the InfluxDB/Chronograf/Grafana/Chronograf_8889 containers
|
||||
#
|
||||
cd "$(dirname "$0")"
|
||||
#!/bin/bash
|
||||
|
||||
if [[ -z $HOST ]]; then
|
||||
HOST=metrics.solana.com
|
||||
fi
|
||||
echo "HOST: $HOST"
|
||||
# List of containers
|
||||
containers=("chronograf_8889" "grafana" "alertmanager" "alertmanager-discord" "prometheus" "chronograf" "kapacitor")
|
||||
|
||||
echo +++ status
|
||||
(
|
||||
set -x
|
||||
pwd
|
||||
sudo docker ps --no-trunc --size
|
||||
df -h
|
||||
free -h
|
||||
uptime
|
||||
)
|
||||
# Send a message to Discord.
#
# Globals:   DISCORD_WEBHOOK (read) - URL the JSON payload is POSTed to
# Arguments: $1 - message text, sent as the "content" field of the payload
# Returns:   the exit status of curl
send_discord_message() {
  local message="$1"

  # The text is embedded in a JSON string literal, so escape the characters
  # that would otherwise terminate or corrupt it. Backslashes are handled
  # first so the escapes added afterwards are not doubled.
  message=${message//\\/\\\\}
  message=${message//\"/\\\"}
  message=${message//$'\n'/\\n}
  message=${message//$'\r'/\\r}
  message=${message//$'\t'/\\t}

  curl -sS -H "Content-Type: application/json" -X POST \
    -d "{\"content\": \"$message\"}" "$DISCORD_WEBHOOK"
}
|
||||
|
||||
# If the container is not in the running state (for example, it has exited), send a notification to Slack and redeploy the container
|
||||
# Send a critical alert to PagerDuty.
#
# Globals:   PAGERDUTY_WEBHOOK (read) - URL the JSON event is POSTed to
# Arguments: $1 - description, sent as the "summary" field of the event
# Returns:   the exit status of curl
send_pagerduty_alert() {
  local description="$1"

  # The text is embedded in a JSON string literal, so escape the characters
  # that would otherwise terminate or corrupt it. Backslashes are handled
  # first so the escapes added afterwards are not doubled.
  description=${description//\\/\\\\}
  description=${description//\"/\\\"}
  description=${description//$'\n'/\\n}
  description=${description//$'\r'/\\r}
  description=${description//$'\t'/\\t}

  curl -sS -H "Content-Type: application/json" -X POST \
    -d "{\"event_action\": \"trigger\", \"payload\": {\"summary\": \"$description\", \"source\": \"Docker Monitor\", \"severity\": \"critical\"}}" \
    "$PAGERDUTY_WEBHOOK"
}
|
||||
|
||||
for container in chronograf_8889 grafana alertmanager alertmanager-discord prometheus chronograf kapacitor ; do
|
||||
if [ "$(sudo docker inspect --format='{{.State.Status}}' $container)" != "running" ] || [ "$(sudo docker inspect --format='{{.State.Status}}' $container)" = "exited" ]; then
|
||||
curl -X POST -H 'Content-type: application/json' --data '{"text": "'"$container"' container is down in the metrics-mainsystem server. Restarting..."}' "$SLACK_WEBHOOK"
|
||||
curl -X POST -H 'Content-type: application/json' --data '{"content": "'"$container"' container is down in the metrics-mainsystem server. Restarting..."}' "$DISCORD_WEBHOOK"
|
||||
echo "Starting up script"
|
||||
sudo bash $container.sh
|
||||
sleep 30
|
||||
# Iterate over the containers and check their status
|
||||
for container in "${containers[@]}"; do
|
||||
container_status=$(docker inspect --format '{{.State.Status}}' "$container" 2>/dev/null)
|
||||
|
||||
if [ "$container_status" != "running" ]; then
|
||||
send_discord_message "$container is down and it's being redeployed..."
|
||||
|
||||
# Run the container.sh script to redeploy the container
|
||||
chmod +x "$container.sh"
|
||||
./"$container.sh"
|
||||
sleep 10
|
||||
|
||||
# Check the container status again
|
||||
container_status=$(docker inspect --format '{{.State.Status}}' "$container" 2>/dev/null)
|
||||
|
||||
if [ "$container_status" != "running" ]; then
|
||||
send_discord_message "$container failed to redeploy and manual intervention is required"
|
||||
send_pagerduty_alert "$container failed to redeploy and manual intervention is required."
|
||||
else
|
||||
send_discord_message "$container has been redeployed successfully"
|
||||
fi
|
||||
done
|
||||
fi
|
||||
done
|
||||
|
|
Loading…
Reference in New Issue