diff --git a/metrics/influx-enterprise/status.sh b/metrics/influx-enterprise/status.sh index fa9c2fed7..62fb2a445 100755 --- a/metrics/influx-enterprise/status.sh +++ b/metrics/influx-enterprise/status.sh @@ -22,13 +22,32 @@ check_service() { # Loop through the servers for server in "${servers[@]}"; do - # Check if the service is running - if ssh -o StrictHostKeyChecking=no sol@"$server" sudo systemctl is-active "$service" >/dev/null; then + ssh_success=false + ssh_attempts=0 + while ! $ssh_success && [ $ssh_attempts -lt 3 ]; do + # Check if the service is running + if ssh -o StrictHostKeyChecking=no sol@"$server" sudo systemctl is-active "$service" >/dev/null 2>&1; then + ssh_success=true + else + ssh_attempts=$((ssh_attempts + 1)) + sleep 5 + fi + done + + if $ssh_success; then # Service is running message="The $service service is running on $server." echo "$message" else - # Service is not running, try to restart it + # SSH connection failed after retries + message="ERROR: Unable to establish SSH connection to $server after 3 retries." + echo "$message" + curl -H "Content-Type: application/json" -d '{"content":"'"$message"', manual intervention is required."}' "$DISCORD_WEBHOOK" + continue + fi + + # Service is not running, try to restart it + if ! $ssh_success; then message="The $service service is not running on $server. Restarting..." echo "$message" curl -H "Content-Type: application/json" -d '{"content":"'"$message"'"}' "$DISCORD_WEBHOOK"