update metrics status scripts (#31037)

* update metrics status scripts

* add exit condition
This commit is contained in:
joeaba 2023-04-04 09:03:57 -05:00 committed by GitHub
parent b0540ff5ad
commit 80b25726e1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 73 additions and 40 deletions

View File

@ -5,7 +5,7 @@
cd "$(dirname "$0")"
if [[ -z $HOST ]]; then
HOST=metrics.solana.com
HOST=internal-metrics.solana.com
fi
echo "HOST: $HOST"
@ -34,7 +34,7 @@ sudo chown buildkite-agent:buildkite-agent certs
# (Re)start the container
sudo sudo docker run \
sudo docker run \
--detach \
--env AUTH_DURATION=24h \
--env TLS_CERTIFICATE=/certs/fullchain.pem \
@ -53,4 +53,4 @@ sudo sudo docker run \
--volume /var/lib/chronograf:/var/lib/chronograf \
--log-opt max-size=1g \
--log-opt max-file="5" \
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 --influxdb-username="$INFLUXDB_USERNAME" --influxdb-password="$INLUXDB_PASSWORD"

View File

@ -5,7 +5,7 @@
cd "$(dirname "$0")"
if [[ -z $HOST ]]; then
HOST=metrics.solana.com
HOST=internal-metrics.solana.com
fi
echo "HOST: $HOST"
@ -43,7 +43,7 @@ sudo docker run \
--env GOOGLE_CLIENT_SECRET="$GOOGLE_CLIENT_SECRET_8889" \
--env GOOGLE_DOMAINS=solana.com,jito.wtf,jumpcrypto.com,certus.one,mango.markets \
--env PUBLIC_URL=https://internal-metrics.solana.com:8889 \
--env TOKEN_SECRET= \
--env TOKEN_SECRET="$TOKEN_SECRET" \
--env inactivity-duration=48h \
--name=chronograf_8889_internal \
--net=influxdb \
@ -53,4 +53,4 @@ sudo docker run \
--volume /var/lib/chronograf_8889:/var/lib/chronograf \
--log-opt max-size=1g \
--log-opt max-file="5" \
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 --influxdb-username="$INFLUXDB_USERNAME" --influxdb-password="$INLUXDB_PASSWORD"

View File

@ -2,10 +2,11 @@
#
# (Re)starts the Grafana containers
#
cd "$(dirname "$0")"
if [[ -z $HOST ]]; then
HOST=metrics.solana.com
HOST=internal-metrics.solana.com
fi
echo "HOST: $HOST"
@ -32,7 +33,6 @@ sudo cp /etc/letsencrypt/live/"$HOST"/privkey.pem certs/
sudo chmod 0444 certs/*
sudo chown buildkite-agent:buildkite-agent certs
# (Re)start the container
sudo docker run \
--detach \
@ -41,6 +41,10 @@ sudo docker run \
--publish 3000:3000 \
--user root:root \
--env GF_PATHS_CONFIG=/grafana.ini \
--env GF_AUTH_GITHUB_CLIENT_ID="$GITHUB_CLIENT_ID" \
--env GF_AUTH_GITHUB_CLIENT_SECRET="$GITHUB_CLIENT_SECRET" \
--env GF_SECURITY_ADMIN_USER="$ADMIN_USER_GRAFANA" \
--env GF_SECURITY_ADMIN_PASSWORD="$ADMIN_PASSWORD_GRAFANA" \
--volume "$PWD"/certs:/certs:ro \
--volume "$PWD"/grafana-"$HOST".ini:/grafana.ini:ro \
--volume /var/lib/grafana:/var/lib/grafana \

View File

@ -5,7 +5,7 @@
cd "$(dirname "$0")"
if [[ -z $HOST ]]; then
HOST=metrics.solana.com
HOST=internal-metrics.solana.com
fi
echo "HOST: $HOST"
@ -39,6 +39,8 @@ sudo docker run \
--net=influxdb \
--publish 8086:8086 \
--user "$(id -u):$(id -g)" \
--env INFLUXDB_ADMIN_USER="$INFLUXDB_USERNAME" \
--env INFLUXDB_ADMIN_PASSWORD="$INLUXDB_PASSWORD" \
--volume "$PWD"/certs:/certs \
--volume "$PWD"/influxdb.conf:/etc/influxdb/influxdb.conf:ro \
--volume /var/lib/influxdb:/var/lib/influxdb \

View File

@ -6,7 +6,7 @@
cd "$(dirname "$0")"
if [[ -z $HOST ]]; then
HOST=metrics.solana.com
HOST=internal-metrics.solana.com
fi
echo "HOST: $HOST"
@ -48,6 +48,10 @@ sudo docker run \
--publish 3000:3000 \
--user root:root \
--env GF_PATHS_CONFIG=/grafana.ini \
--env GF_AUTH_GITHUB_CLIENT_ID="$GITHUB_CLIENT_ID" \
--env GF_AUTH_GITHUB_CLIENT_SECRET="$GITHUB_CLIENT_SECRET" \
--env GF_SECURITY_ADMIN_USER="$ADMIN_USER_GRAFANA" \
--env GF_SECURITY_ADMIN_PASSWORD="$ADMIN_PASSWORD_GRAFANA" \
--volume "$PWD"/certs:/certs:ro \
--volume "$PWD"/grafana-"$HOST".ini:/grafana.ini:ro \
--volume /var/lib/grafana:/var/lib/grafana \
@ -61,6 +65,8 @@ sudo docker run \
--net=influxdb \
--publish 8086:8086 \
--user "$(id -u):$(id -g)" \
--env INFLUXDB_ADMIN_USER="$INFLUXDB_USERNAME" \
--env INFLUXDB_ADMIN_PASSWORD="$INLUXDB_PASSWORD" \
--volume "$PWD"/certs:/certs \
--volume "$PWD"/influxdb.conf:/etc/influxdb/influxdb.conf:ro \
--volume /var/lib/influxdb:/var/lib/influxdb \
@ -90,9 +96,9 @@ sudo docker run \
--volume /var/lib/chronograf_8889:/var/lib/chronograf \
--log-opt max-size=1g \
--log-opt max-file="5" \
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 --influxdb-username="$INFLUXDB_USERNAME" --influxdb-password="$INLUXDB_PASSWORD"
sudo sudo docker run \
sudo docker run \
--detach \
--env AUTH_DURATION=24h \
--env TLS_CERTIFICATE=/certs/fullchain.pem \
@ -111,7 +117,7 @@ sudo sudo docker run \
--volume /var/lib/chronograf:/var/lib/chronograf \
--log-opt max-size=1g \
--log-opt max-file="5" \
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086 --influxdb-username="$INFLUXDB_USERNAME" --influxdb-password="$INLUXDB_PASSWORD"
curl -h | sed -ne '/--tlsv/p'
curl --retry 10 --retry-delay 5 -v --head https://"$HOST":8086/ping

View File

@ -1,33 +1,47 @@
#!/bin/bash -ex
#
# Status of the InfluxDB/Chronograf/Grafana/Chronograf_8889 containers
#
cd "$(dirname "$0")"
#!/bin/bash
cd "$(dirname "$0")" || exit
if [[ -z $HOST ]]; then
HOST=metrics.solana.com
HOST=internal-metrics.solana.com
fi
echo "HOST: $HOST"
echo +++ status
(
set -x
pwd
sudo docker ps --no-trunc --size
sudo du -hs /var/lib/{influxdb,chronograf,grafana}
df -h
free -h
uptime
)
# List of containers
containers=("influxdb_internal" "chronograf_8889_internal" "chronograf_8888_internal" "grafana_internal")
# If the container is not in the running state (or has exited), send a notification to Slack and redeploy the container
#######################################
# Send a message to Discord.
# Globals:   DISCORD_WEBHOOK (read) - URL the JSON payload is POSTed to
# Arguments: $1 - message text, sent as the "content" field
# Outputs:   whatever curl prints (curl runs with -sS, so errors still show)
# Returns:   curl's exit status
#######################################
send_discord_message() {
  local message="$1"
  # The message is embedded in a JSON string literal, so escape the
  # characters that would otherwise terminate or corrupt it. Backslashes
  # must be handled first so the escapes added below are not doubled.
  message=${message//\\/\\\\}
  message=${message//\"/\\\"}
  message=${message//$'\n'/\\n}
  curl -sS -H "Content-Type: application/json" -X POST -d "{\"content\": \"$message\"}" "$DISCORD_WEBHOOK"
}
# For each container that is not reported as "running", notify Slack and
# Discord, then rerun that container's deployment script.
for container in influxdb_internal chronograf_8888_internal chronograf_8889_internal grafana_internal; do
  # Query the state once. Any value other than "running" triggers a redeploy;
  # that already covers "exited", so no separate check is needed.
  container_state="$(sudo docker inspect --format='{{.State.Status}}' "$container")"
  if [ "$container_state" != "running" ]; then
    curl -X POST -H 'Content-type: application/json' --data '{"text": "'"$container"' container is down in metrics-internal server"}' "$SLACK_WEBHOOK"
    curl -X POST -H 'Content-type: application/json' --data '{"content": "'"$container"' container is down in metrics-internal server"}' "$DISCORD_WEBHOOK"
    echo "Starting up script"
    sudo bash "$container.sh"
    # Pause before moving on to the next container.
    sleep 30
  fi
done
#######################################
# Send a critical alert to PagerDuty.
# Globals:   PAGERDUTY_WEBHOOK (read) - URL the JSON event is POSTed to
# Arguments: $1 - description, sent as the payload "summary" field
# Outputs:   whatever curl prints (curl runs with -sS, so errors still show)
# Returns:   curl's exit status
# NOTE(review): the event body carries no routing key; presumably the
# webhook URL identifies the integration - confirm against the PagerDuty setup.
#######################################
send_pagerduty_alert() {
  local description="$1"
  # The description is embedded in a JSON string literal, so escape the
  # characters that would otherwise terminate or corrupt it. Backslashes
  # must be handled first so the escapes added below are not doubled.
  description=${description//\\/\\\\}
  description=${description//\"/\\\"}
  description=${description//$'\n'/\\n}
  curl -sS -H "Content-Type: application/json" -X POST -d "{\"event_action\": \"trigger\", \"payload\": {\"summary\": \"$description\", \"source\": \"Docker Monitor\", \"severity\": \"critical\"}}" "$PAGERDUTY_WEBHOOK"
}
# Walk the container list: anything not in the "running" state is redeployed
# through its <name>.sh script, and the outcome is reported.
for container in "${containers[@]}"; do
  container_status=$(docker inspect --format '{{.State.Status}}' "$container" 2>/dev/null)

  # Healthy container: nothing to do.
  [[ $container_status == "running" ]] && continue

  send_discord_message "$container is down and it's being redeployed..."

  # Make sure the deployment script is executable, then run it.
  chmod +x "$container.sh"
  ./"$container.sh"
  sleep 10

  # Re-read the state to see whether the redeploy worked.
  container_status=$(docker inspect --format '{{.State.Status}}' "$container" 2>/dev/null)
  case "$container_status" in
    running)
      send_discord_message "$container has been redeployed successfully"
      ;;
    *)
      # Still not running: report on Discord and page PagerDuty.
      send_discord_message "$container failed to redeploy and manual intervention is required"
      send_pagerduty_alert "$container failed to redeploy and manual intervention is required."
      ;;
  esac
done

View File

@ -132,7 +132,7 @@ sudo docker run \
--volume /var/lib/chronograf:/var/lib/chronograf \
--log-opt max-size=1g \
--log-opt max-file=5 \
$CHRONOGRAF_IMAGE --influxdb-url=https://metrics.solana.com:8086
$CHRONOGRAF_IMAGE --influxdb-url=https://"$HOST":8086
sudo docker run \
--detach \

View File

@ -1,5 +1,12 @@
#!/bin/bash
cd "$(dirname "$0")" || exit
if [[ -z $HOST ]]; then
HOST=metrics.solana.com
fi
echo "HOST: $HOST"
# List of containers
containers=("chronograf_8889" "grafana" "alertmanager" "alertmanager-discord" "prometheus" "chronograf" "kapacitor")