update metrics status scripts (#31037)
* update metrics status scripts * add exit condition
This commit is contained in:
parent
b0540ff5ad
commit
80b25726e1
|
@ -5,7 +5,7 @@
|
|||
cd "$(dirname "$0")"
|
||||
|
||||
if [[ -z $HOST ]]; then
|
||||
HOST=metrics.solana.com
|
||||
HOST=internal-metrics.solana.com
|
||||
fi
|
||||
echo "HOST: $HOST"
|
||||
|
||||
|
@ -34,7 +34,7 @@ sudo chown buildkite-agent:buildkite-agent certs
|
|||
|
||||
|
||||
# (Re)start the container
|
||||
sudo sudo docker run \
|
||||
sudo docker run \
|
||||
--detach \
|
||||
--env AUTH_DURATION=24h \
|
||||
--env TLS_CERTIFICATE=/certs/fullchain.pem \
|
||||
|
@ -53,4 +53,4 @@ sudo sudo docker run \
|
|||
--volume /var/lib/chronograf:/var/lib/chronograf \
|
||||
--log-opt max-size=1g \
|
||||
--log-opt max-file="5" \
|
||||
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086
|
||||
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 --influxdb-username="$INFLUXDB_USERNAME" --influxdb-password="$INLUXDB_PASSWORD"
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
cd "$(dirname "$0")"
|
||||
|
||||
if [[ -z $HOST ]]; then
|
||||
HOST=metrics.solana.com
|
||||
HOST=internal-metrics.solana.com
|
||||
fi
|
||||
echo "HOST: $HOST"
|
||||
|
||||
|
@ -43,7 +43,7 @@ sudo docker run \
|
|||
--env GOOGLE_CLIENT_SECRET="$GOOGLE_CLIENT_SECRET_8889" \
|
||||
--env GOOGLE_DOMAINS=solana.com,jito.wtf,jumpcrypto.com,certus.one,mango.markets \
|
||||
--env PUBLIC_URL=https://internal-metrics.solana.com:8889 \
|
||||
--env TOKEN_SECRET= \
|
||||
--env TOKEN_SECRET="$TOKEN_SECRET" \
|
||||
--env inactivity-duration=48h \
|
||||
--name=chronograf_8889_internal \
|
||||
--net=influxdb \
|
||||
|
@ -53,4 +53,4 @@ sudo docker run \
|
|||
--volume /var/lib/chronograf_8889:/var/lib/chronograf \
|
||||
--log-opt max-size=1g \
|
||||
--log-opt max-file="5" \
|
||||
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086
|
||||
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 --influxdb-username="$INFLUXDB_USERNAME" --influxdb-password="$INLUXDB_PASSWORD"
|
||||
|
|
|
@ -2,10 +2,11 @@
|
|||
#
|
||||
# (Re)starts the Grafana containers
|
||||
#
|
||||
|
||||
cd "$(dirname "$0")"
|
||||
|
||||
if [[ -z $HOST ]]; then
|
||||
HOST=metrics.solana.com
|
||||
HOST=internal-metrics.solana.com
|
||||
fi
|
||||
echo "HOST: $HOST"
|
||||
|
||||
|
@ -32,7 +33,6 @@ sudo cp /etc/letsencrypt/live/"$HOST"/privkey.pem certs/
|
|||
sudo chmod 0444 certs/*
|
||||
sudo chown buildkite-agent:buildkite-agent certs
|
||||
|
||||
|
||||
#(Re)start the container
|
||||
sudo docker run \
|
||||
--detach \
|
||||
|
@ -41,6 +41,10 @@ sudo docker run \
|
|||
--publish 3000:3000 \
|
||||
--user root:root \
|
||||
--env GF_PATHS_CONFIG=/grafana.ini \
|
||||
--env GF_AUTH_GITHUB_CLIENT_ID="$GITHUB_CLIENT_ID" \
|
||||
--env GF_AUTH_GITHUB_CLIENT_SECRET="$GITHUB_CLIENT_SECRET" \
|
||||
--env GF_SECURITY_ADMIN_USER="$ADMIN_USER_GRAFANA" \
|
||||
--env GF_SECURITY_ADMIN_PASSWORD="$ADMIN_PASSWORD_GRAFANA" \
|
||||
--volume "$PWD"/certs:/certs:ro \
|
||||
--volume "$PWD"/grafana-"$HOST".ini:/grafana.ini:ro \
|
||||
--volume /var/lib/grafana:/var/lib/grafana \
|
||||
|
|
|
@ -5,7 +5,7 @@
|
|||
cd "$(dirname "$0")"
|
||||
|
||||
if [[ -z $HOST ]]; then
|
||||
HOST=metrics.solana.com
|
||||
HOST=internal-metrics.solana.com
|
||||
fi
|
||||
echo "HOST: $HOST"
|
||||
|
||||
|
@ -39,6 +39,8 @@ sudo docker run \
|
|||
--net=influxdb \
|
||||
--publish 8086:8086 \
|
||||
--user "$(id -u):$(id -g)" \
|
||||
--env INFLUXDB_ADMIN_USER="$INFLUXDB_USERNAME" \
|
||||
--env INFLUXDB_ADMIN_PASSWORD="$INLUXDB_PASSWORD" \
|
||||
--volume "$PWD"/certs:/certs \
|
||||
--volume "$PWD"/influxdb.conf:/etc/influxdb/influxdb.conf:ro \
|
||||
--volume /var/lib/influxdb:/var/lib/influxdb \
|
||||
|
|
|
@ -6,7 +6,7 @@
|
|||
cd "$(dirname "$0")"
|
||||
|
||||
if [[ -z $HOST ]]; then
|
||||
HOST=metrics.solana.com
|
||||
HOST=internal-metrics.solana.com
|
||||
fi
|
||||
echo "HOST: $HOST"
|
||||
|
||||
|
@ -48,6 +48,10 @@ sudo docker run \
|
|||
--publish 3000:3000 \
|
||||
--user root:root \
|
||||
--env GF_PATHS_CONFIG=/grafana.ini \
|
||||
--env GF_AUTH_GITHUB_CLIENT_ID="$GITHUB_CLIENT_ID" \
|
||||
--env GF_AUTH_GITHUB_CLIENT_SECRET="$GITHUB_CLIENT_SECRET" \
|
||||
--env GF_SECURITY_ADMIN_USER="$ADMIN_USER_GRAFANA" \
|
||||
--env GF_SECURITY_ADMIN_PASSWORD="$ADMIN_PASSWORD_GRAFANA" \
|
||||
--volume "$PWD"/certs:/certs:ro \
|
||||
--volume "$PWD"/grafana-"$HOST".ini:/grafana.ini:ro \
|
||||
--volume /var/lib/grafana:/var/lib/grafana \
|
||||
|
@ -61,6 +65,8 @@ sudo docker run \
|
|||
--net=influxdb \
|
||||
--publish 8086:8086 \
|
||||
--user "$(id -u):$(id -g)" \
|
||||
--env INFLUXDB_ADMIN_USER="$INFLUXDB_USERNAME" \
|
||||
--env INFLUXDB_ADMIN_PASSWORD="$INLUXDB_PASSWORD" \
|
||||
--volume "$PWD"/certs:/certs \
|
||||
--volume "$PWD"/influxdb.conf:/etc/influxdb/influxdb.conf:ro \
|
||||
--volume /var/lib/influxdb:/var/lib/influxdb \
|
||||
|
@ -90,9 +96,9 @@ sudo docker run \
|
|||
--volume /var/lib/chronograf_8889:/var/lib/chronograf \
|
||||
--log-opt max-size=1g \
|
||||
--log-opt max-file="5" \
|
||||
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086
|
||||
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 --influxdb-username="$INFLUXDB_USERNAME" --influxdb-password="$INLUXDB_PASSWORD"
|
||||
|
||||
sudo sudo docker run \
|
||||
sudo docker run \
|
||||
--detach \
|
||||
--env AUTH_DURATION=24h \
|
||||
--env TLS_CERTIFICATE=/certs/fullchain.pem \
|
||||
|
@ -111,7 +117,7 @@ sudo sudo docker run \
|
|||
--volume /var/lib/chronograf:/var/lib/chronograf \
|
||||
--log-opt max-size=1g \
|
||||
--log-opt max-file="5" \
|
||||
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086
|
||||
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 --influxdb-username="$INFLUXDB_USERNAME" --influxdb-password="$INLUXDB_PASSWORD"
|
||||
|
||||
curl -h | sed -ne '/--tlsv/p'
|
||||
curl --retry 10 --retry-delay 5 -v --head https://"$HOST":8086/ping
|
||||
|
|
|
@ -1,33 +1,47 @@
|
|||
#!/bin/bash -ex
|
||||
#
|
||||
# Status of the InfluxDB/Chronograf/Grafana/Chronograf_8889 containers
|
||||
#
|
||||
cd "$(dirname "$0")"
|
||||
#!/bin/bash
|
||||
|
||||
cd "$(dirname "$0")" || exit
|
||||
|
||||
if [[ -z $HOST ]]; then
|
||||
HOST=metrics.solana.com
|
||||
HOST=internal-metrics.solana.com
|
||||
fi
|
||||
echo "HOST: $HOST"
|
||||
|
||||
echo +++ status
|
||||
(
|
||||
set -x
|
||||
pwd
|
||||
sudo docker ps --no-trunc --size
|
||||
sudo du -hs /var/lib/{influxdb,chronograf,grafana}
|
||||
df -h
|
||||
free -h
|
||||
uptime
|
||||
)
|
||||
# List of containers
|
||||
containers=("influxdb_internal" "chronograf_8889_internal" "chronograf_8888_internal" "grafana_internal")
|
||||
|
||||
# If the container is not in the running state (or has exited), send a notification to Slack and redeploy the container
|
||||
# Send a message to Discord
#
# Posts the given text as the "content" field of a JSON body to the webhook
# URL held in DISCORD_WEBHOOK.
# Globals:   DISCORD_WEBHOOK (read)
# Arguments: $1 - message text
# Returns:   1 if DISCORD_WEBHOOK is empty or unset, otherwise curl's status
send_discord_message() {
  local message="$1"

  if [[ -z "${DISCORD_WEBHOOK:-}" ]]; then
    echo "send_discord_message: DISCORD_WEBHOOK is not set" >&2
    return 1
  fi

  # Escape characters that would otherwise terminate or corrupt the JSON
  # string literal (backslash must be handled first).
  message=${message//\\/\\\\}
  message=${message//\"/\\\"}
  message=${message//$'\n'/\\n}
  message=${message//$'\r'/\\r}
  message=${message//$'\t'/\\t}

  curl -sS -H "Content-Type: application/json" -X POST -d "{\"content\": \"$message\"}" "$DISCORD_WEBHOOK"
}
|
||||
|
||||
for container in influxdb_internal chronograf_8888_internal chronograf_8889_internal grafana_internal; do
|
||||
if [ "$(sudo docker inspect --format='{{.State.Status}}' $container)" != "running" ] || [ "$(sudo docker inspect --format='{{.State.Status}}' $container)" = "exited" ]; then
|
||||
curl -X POST -H 'Content-type: application/json' --data '{"text": "'"$container"' container is down in metrics-internal server"}' "$SLACK_WEBHOOK"
|
||||
curl -X POST -H 'Content-type: application/json' --data '{"content": "'"$container"' container is down in metrics-internal server"}' "$DISCORD_WEBHOOK"
|
||||
echo "Starting up script"
|
||||
sudo bash $container.sh
|
||||
sleep 30
|
||||
fi
|
||||
done
|
||||
# Send a critical alert to PagerDuty
#
# Posts a "trigger" event with severity "critical" and source "Docker Monitor"
# to the URL held in PAGERDUTY_WEBHOOK, using the given text as the summary.
# Globals:   PAGERDUTY_WEBHOOK (read)
# Arguments: $1 - alert description
# Returns:   1 if PAGERDUTY_WEBHOOK is empty or unset, otherwise curl's status
send_pagerduty_alert() {
  local description="$1"

  if [[ -z "${PAGERDUTY_WEBHOOK:-}" ]]; then
    echo "send_pagerduty_alert: PAGERDUTY_WEBHOOK is not set" >&2
    return 1
  fi

  # Escape characters that would otherwise terminate or corrupt the JSON
  # string literal (backslash must be handled first).
  description=${description//\\/\\\\}
  description=${description//\"/\\\"}
  description=${description//$'\n'/\\n}
  description=${description//$'\r'/\\r}
  description=${description//$'\t'/\\t}

  curl -sS -H "Content-Type: application/json" -X POST -d "{\"event_action\": \"trigger\", \"payload\": {\"summary\": \"$description\", \"source\": \"Docker Monitor\", \"severity\": \"critical\"}}" "$PAGERDUTY_WEBHOOK"
}
|
||||
|
||||
# Iterate over the containers and check their status
#
# For every name in the `containers` array: if `docker inspect` does not report
# the state "running", announce it on Discord, run ./<name>.sh to redeploy,
# wait, and re-check. A second failure is reported to Discord and PagerDuty.
for container in "${containers[@]}"; do
  # stderr is discarded, so a failed inspect yields an empty status, which is
  # handled the same way as any other non-"running" state.
  container_status=$(docker inspect --format '{{.State.Status}}' "$container" 2>/dev/null)

  # Nothing to do for a container that is already running.
  if [[ "$container_status" == "running" ]]; then
    continue
  fi

  send_discord_message "$container is down and it's being redeployed..."

  # Run the container.sh script to redeploy the container
  chmod +x "$container.sh"
  ./"$container.sh"
  sleep 10

  # Check the container status again
  container_status=$(docker inspect --format '{{.State.Status}}' "$container" 2>/dev/null)

  if [[ "$container_status" == "running" ]]; then
    send_discord_message "$container has been redeployed successfully"
  else
    send_discord_message "$container failed to redeploy and manual intervention is required"
    send_pagerduty_alert "$container failed to redeploy and manual intervention is required."
  fi
done
|
||||
|
|
|
@ -132,7 +132,7 @@ sudo docker run \
|
|||
--volume /var/lib/chronograf:/var/lib/chronograf \
|
||||
--log-opt max-size=1g \
|
||||
--log-opt max-file=5 \
|
||||
$CHRONOGRAF_IMAGE --influxdb-url=https://metrics.solana.com:8086
|
||||
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086
|
||||
|
||||
sudo docker run \
|
||||
--detach \
|
||||
|
|
|
@ -1,5 +1,12 @@
|
|||
#!/bin/bash
|
||||
|
||||
cd "$(dirname "$0")" || exit
|
||||
|
||||
if [[ -z $HOST ]]; then
|
||||
HOST=metrics.solana.com
|
||||
fi
|
||||
echo "HOST: $HOST"
|
||||
|
||||
# List of containers
|
||||
containers=("chronograf_8889" "grafana" "alertmanager" "alertmanager-discord" "prometheus" "chronograf" "kapacitor")
|
||||
|
||||
|
|
Loading…
Reference in New Issue