#!/bin/bash -ex
#
# Checks that the InfluxDB services are running on each server and
# (re)starts any that are not
#
# Work from the directory that contains this script.
cd "$(dirname "$0")" || exit 1

# HOST keeps the value supplied by the environment and otherwise falls back
# to metrics.solana.com. In this part of the file it is only echoed.
if [[ -z ${HOST:-} ]]; then
  HOST=metrics.solana.com
fi
echo "HOST: $HOST"

# Servers on which the "influxdb" service is checked.
servers_data=(
  "dev-equinix-washington-27"
  "dev-equinix-washington-28"
  "dev-equinix-washington-29"
  "dev-equinix-washington-30"
  "dev-equinix-washington-31"
  "dev-equinix-washington-32"
  "dev-equinix-amsterdam-20"
  "dev-equinix-amsterdam-21"
  "dev-equinix-amsterdam-22"
  "dev-equinix-chicago-17"
  "dev-equinix-chicago-19"
  "dev-equinix-chicago-25"
  "dev-equinix-amsterdam-19"
  "dev-equinix-dallas-1"
  "dev-equinix-frankfurt-1"
  "dev-equinix-toronto-5"
)

# Servers on which the "influxdb-meta" service is checked.
servers_meta=(
  "dev-equinix-washington-24"
  "dev-equinix-washington-25"
  "dev-equinix-washington-26"
)
#######################################
# Post a message to the Discord webhook.
# Globals:   DISCORD_WEBHOOK (read) - URL the JSON body is sent to
# Arguments: $1 - text placed verbatim in the JSON "content" field; it is
#                 not JSON-escaped, so it must not contain quotes or
#                 backslashes
# Outputs:   a warning on stderr when curl fails
# Returns:   0, so a failed notification cannot abort a run under `-e`
#######################################
_notify_discord() {
  local content=$1
  curl -H "Content-Type: application/json" \
    -d '{"content":"'"$content"'"}' "$DISCORD_WEBHOOK" ||
    echo "WARNING: could not deliver Discord notification" >&2
}

#######################################
# Trigger a critical PagerDuty event for a service that failed to restart.
# Globals:   PAGERDUTY_WEBHOOK (read) - URL the JSON body is sent to
# Arguments: $1 - service name
#            $2 - server name
# Outputs:   a warning on stderr when curl fails
# Returns:   0, so a failed page cannot abort a run under `-e`
#######################################
_page_pagerduty() {
  local service=$1
  local server=$2
  local body
  # NOTE(review): routing_key is still the literal placeholder; it has to be
  # replaced with a real integration key before a page can be delivered.
  body='{"routing_key":"<your-pagerduty-service-key>","event_action":"trigger",'
  body+='"payload":{"summary":"The '"$service"' service failed to restart on '"$server"'.",'
  # "source" added because the Events API v2 lists it as a required payload
  # field - assumes PAGERDUTY_WEBHOOK is a v2 enqueue endpoint; TODO confirm.
  body+='"source":"'"$server"'","severity":"critical"}}'
  curl -H "Content-Type: application/json" -d "$body" "$PAGERDUTY_WEBHOOK" ||
    echo "WARNING: could not deliver PagerDuty alert" >&2
}

#######################################
# Check a systemd service on a list of servers and restart it where it is
# not active.
#
# Each server is probed over ssh (as user "sol") up to three times, ten
# seconds apart. If the service never reports active, a Discord alert is
# sent, a restart is attempted and the service is probed once more; the
# outcome is reported to Discord, and a failed restart also pages PagerDuty.
#
# Globals:   DISCORD_WEBHOOK, PAGERDUTY_WEBHOOK (read, via the helpers)
# Arguments: $1  - systemd service name
#            $2… - servers to check
# Outputs:   status messages on stdout, warnings on stderr
# Returns:   0; failures are reported through the alerts, not the status,
#            so that the caller's `-e` does not skip later checks
#######################################
check_service() {
  local service=$1
  shift
  local -r retries=3
  local message=""
  local server attempt service_not_running

  for server in "$@"; do
    service_not_running=true
    for ((attempt = 1; attempt <= retries; attempt++)); do
      if ssh -o StrictHostKeyChecking=no sol@"$server" sudo systemctl is-active "$service" >/dev/null; then
        message="The $service service is running on $server."
        echo "$message"
        service_not_running=false
        break
      fi
      # Pause only when another probe follows; sleeping after the last
      # attempt would just delay the restart.
      if ((attempt < retries)); then
        sleep 10
      fi
    done

    if [[ $service_not_running == true ]]; then
      message="The $service service is not running on $server. Restarting..."
      echo "$message"
      _notify_discord "$message"

      # The script runs with -e: without the `||` a failing restart would
      # terminate it right here, before the failure could be reported.
      ssh -o StrictHostKeyChecking=no sol@"$server" sudo systemctl restart "$service" ||
        echo "WARNING: restart of $service on $server exited with status $?" >&2
      sleep 10 # Wait for the service to start

      if ssh -o StrictHostKeyChecking=no sol@"$server" sudo systemctl is-active "$service" >/dev/null; then
        message="The $service service was restarted successfully on $server."
        echo "$message"
        _notify_discord "$message"
      else
        message="ERROR: The $service service failed to restart on $server."
        echo "$message"
        _notify_discord "$message, manual intervention is required."
        _page_pagerduty "$service" "$server"
      fi
    fi
  done

  return 0
}
# Check the influxdb service on every server listed in servers_data
check_service "influxdb" "${servers_data[@]}"

# Check the influxdb-meta service on every server listed in servers_meta
check_service "influxdb-meta" "${servers_meta[@]}"